├── aws-examples ├── blog-post │ ├── aws-creds │ │ ├── iam.tfvars │ │ ├── variables.tf │ │ └── iam.tf │ ├── aws │ │ ├── your-cluster.tfvars │ │ ├── outputs.tf │ │ ├── variables.tf │ │ ├── efs.tf │ │ ├── autoscaler.tf │ │ └── main.tf │ ├── LICENSE │ └── README.md ├── minimal-deployment-tutorial │ ├── your-cluster.tfvars │ ├── ec2-intro-tutorial │ │ ├── your-ec2-values.tfvars │ │ ├── ec2-variables.tf │ │ ├── ec2-outputs.tf │ │ └── ec2-main.tf │ ├── terraform-aws-vpc-eks-deployment.png │ ├── variables.tf │ ├── outputs.tf │ ├── main.tf │ └── README.md ├── hackweek-infrastructure │ ├── iam-permissions │ │ ├── variables.tf │ │ └── iam.tf │ ├── deployment-notes │ │ ├── ohw_users_over_time.png │ │ ├── ohw_memory_usage_comparison.png │ │ ├── icesat_2_hackweek_users_over_time.png │ │ ├── icesat_2_hackweek_memory_usage_comparison.png │ │ ├── oceanhackweek2020-notes.md │ │ └── icesat2-hackweek-2020-notes.md │ ├── s3-backend │ │ ├── variables.tf │ │ └── main.tf │ ├── infrastructure │ │ ├── mount_volume.sh │ │ ├── prometheus-values-min.yaml │ │ ├── your-cluster.tfvars.template │ │ ├── aws-node-termination-handler.tf │ │ ├── grafana-values-min.yaml │ │ ├── outputs.tf │ │ ├── variables.tf │ │ ├── bastion.tf │ │ ├── monitoring.tf │ │ ├── s3-data-bucket.tf │ │ ├── autoscaler.tf │ │ ├── efs.tf │ │ └── main.tf │ └── README.md └── README.md ├── aws ├── ecr.tf ├── cluster-autoscaler-values.yml ├── your-cluster.tfvars.template ├── file-output.tf ├── variables.tf ├── outputs.tf ├── efs.tf ├── autoscaler.tf ├── iam.tf └── main.tf ├── gcp-examples ├── gke-dask-gateway │ ├── your-cluster.tfvars │ ├── dask-gateway-test-env.yaml │ ├── variables.tf │ ├── dask-gateway-config.yaml │ ├── main.tf │ ├── dask-gateway-test.ipynb │ └── README.md └── vm_instance_example │ ├── your-values.tfvars │ ├── variables.tf │ ├── main.tf │ └── README.md ├── .gitignore ├── .github └── workflows │ └── terraform.yaml ├── LICENSE ├── README.md └── aws-creds └── iam.tf /aws-examples/blog-post/aws-creds/iam.tfvars: -------------------------------------------------------------------------------- 1 | # Put the profile you will use for awcli 2 | profile = "default" -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/your-cluster.tfvars: -------------------------------------------------------------------------------- 1 | region = "" 2 | 3 | profile = "" 4 | 5 | deployment_name = "" 6 | -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/ec2-intro-tutorial/your-ec2-values.tfvars: -------------------------------------------------------------------------------- 1 | region = "" 2 | 3 | profile = "" 4 | 5 | deployment_name = "" 6 | -------------------------------------------------------------------------------- /aws/ecr.tf: -------------------------------------------------------------------------------- 1 | # FIXME: Support multiple images here 2 | resource "aws_ecr_repository" "primary_user_image" { 3 | name = "${var.cluster_name}-user-image" 4 | } -------------------------------------------------------------------------------- /aws-examples/blog-post/aws-creds/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | default = "us-east-1" 3 | } 4 | 5 | variable "profile" { 6 | default = "default" 7 | } -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/your-cluster.tfvars: 
-------------------------------------------------------------------------------- 1 | credential_file = "" 2 | 3 | deployment_name = "" 4 | 5 | project = "" 6 | 7 | region = "" 8 | 9 | zone = "" 10 | -------------------------------------------------------------------------------- /gcp-examples/vm_instance_example/your-values.tfvars: -------------------------------------------------------------------------------- 1 | credential_file = "" 2 | 3 | deployment_name = "" 4 | 5 | project = "" 6 | 7 | region = "" 8 | 9 | zone = "" 10 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/iam-permissions/variables.tf: -------------------------------------------------------------------------------- 1 | variable "new-user-name" { 2 | description = "Name for the IAM user to be created." 3 | default = "terraform-bot" 4 | } 5 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/ohw_users_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pangeo-data/terraform-deploy/HEAD/aws-examples/hackweek-infrastructure/deployment-notes/ohw_users_over_time.png -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/terraform-aws-vpc-eks-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pangeo-data/terraform-deploy/HEAD/aws-examples/minimal-deployment-tutorial/terraform-aws-vpc-eks-deployment.png -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/ohw_memory_usage_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pangeo-data/terraform-deploy/HEAD/aws-examples/hackweek-infrastructure/deployment-notes/ohw_memory_usage_comparison.png -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/icesat_2_hackweek_users_over_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pangeo-data/terraform-deploy/HEAD/aws-examples/hackweek-infrastructure/deployment-notes/icesat_2_hackweek_users_over_time.png -------------------------------------------------------------------------------- /aws/cluster-autoscaler-values.yml: -------------------------------------------------------------------------------- 1 | rbac: 2 | create: true 3 | serviceAccountAnnotations: 4 | eks.amazonaws.com/role-arn: "arn:aws:iam::783380859522:role/cluster-autoscaler" 5 | 6 | cloudProvider: aws 7 | 8 | autoDiscovery: 9 | enabled: true -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | default = "us-west-2" 3 | } 4 | 5 | variable "profile" { 6 | default = "default" 7 | } 8 | 9 | variable "deployment_name" { 10 | default = "z2jh-test-" 11 | } 12 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/icesat_2_hackweek_memory_usage_comparison.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pangeo-data/terraform-deploy/HEAD/aws-examples/hackweek-infrastructure/deployment-notes/icesat_2_hackweek_memory_usage_comparison.png -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/s3-backend/variables.tf: -------------------------------------------------------------------------------- 1 | variable "profile" { 2 | default = "default" 3 | } 4 | 5 | variable "region" { 6 | default = "us-west-2" 7 | } 8 | 9 | variable "bucket_name" { 10 | default = "hackweek-terraform-state-bucket" 11 | } 12 | -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/ec2-intro-tutorial/ec2-variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | default = "us-west-2" 3 | } 4 | 5 | variable "profile" { 6 | default = "default" 7 | } 8 | 9 | variable "deployment_name" { 10 | default = "terraform-test-ec2" 11 | } 12 | -------------------------------------------------------------------------------- /aws-examples/blog-post/aws/your-cluster.tfvars: -------------------------------------------------------------------------------- 1 | # Put your cluster where your data is 2 | region = "us-east-1" 3 | 4 | # Put the profile you will use for awcli 5 | profile = "terraform-bot" 6 | 7 | # Name of your vpc 8 | vpc_name = "" 9 | 10 | # Name of your cluster 11 | cluster_name = "" -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/dask-gateway-test-env.yaml: -------------------------------------------------------------------------------- 1 | name: dask-gateway-test-env 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python 6 | - dask=2.21.0 7 | - distributed=2.21.0 8 | - dask-gateway=0.8.0 9 | - jupyterlab 10 | - nodejs 11 | - ipywidgets 12 | - cloudpickle=1.5.0 13 | - toolz=0.10.0 14 | - dask-labextension -------------------------------------------------------------------------------- /gcp-examples/vm_instance_example/variables.tf: -------------------------------------------------------------------------------- 1 | variable "credential_file" { 2 | default = "" 3 | } 4 | 5 | variable "deployment_name" { 6 | default = "test-vm" 7 | } 8 | 9 | variable "project" { 10 | default = "" 11 | } 12 | 13 | variable "region" { 14 | default = "us-west2" 15 | } 16 | 17 | variable "zone" { 18 | default = "us-west2-a" 19 | } 20 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/mount_volume.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | echo "Running start-up script as root" 3 | 4 | # Auto-Mount EFS Drive 5 | yum install -y amazon-efs-utils 6 | MOUNTPOINT=/mnt/efs 7 | mkdir $MOUNTPOINT 8 | 9 | # NEED TO CHANGE FS ID 10 | sudo mount -t efs fs-382b4792:/ $MOUNTPOINT 11 | echo "Ephemeral disk mounted to $MOUNTPOINT" -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/variables.tf: -------------------------------------------------------------------------------- 1 | variable "credential_file" { 2 | default = "" 3 | } 4 | 5 | variable "deployment_name" { 6 | default = "test-deployment" 7 | } 8 | 9 | variable "project" { 10 | default = "" 11 | } 12 | 13 | variable "region" { 14 | default = "us-west2" 15 | } 16 | 17 | variable "zone" { 18 | default = "us-west2-a" 19 | } 20 | -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/ec2-intro-tutorial/ec2-outputs.tf: -------------------------------------------------------------------------------- 1 | output "ami-id" { 2 | description = "ID of the Amazon Machine Image used for our EC2 instance" 3 | value = "${data.aws_ami.amazon-linux.id}" 4 | } 5 | 6 | output "name" { 7 | description = "Amazon Resource Name of our EC2 instance" 8 | value = "${aws_instance.test-ec2.arn}" 9 | } 10 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/prometheus-values-min.yaml: -------------------------------------------------------------------------------- 1 | alertmanager: 2 | persistentVolume: 3 | storageClass: prometheus 4 | server: 5 | persistentVolume: 6 | storageClass: prometheus 7 | # The following is the basic instructions 8 | # This is not recommended for a production environment 9 | service: 10 | type: NodePort 11 | nodePort: 30900 -------------------------------------------------------------------------------- /aws/your-cluster.tfvars.template: -------------------------------------------------------------------------------- 1 | # Put your cluster where your data is 2 | region = "us-east-1" 3 | 4 | # See https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html for 5 | # more information 6 | map_users = [{ 7 | userarn = "arn:aws:iam:::user/" 8 | username = "" 9 | groups = ["system:masters"] 10 | }] 11 | 12 | # Name of your cluster 13 | cluster_name = "" -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/your-cluster.tfvars.template: -------------------------------------------------------------------------------- 1 | # Put your cluster where your data is 2 | region = "us-east-1" 3 | 4 | # See https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html for 5 | # more information 6 | map_users = [{ 7 | userarn = "arn:aws:iam:::user/" 8 | username = "" 9 | groups = ["system:masters"] 10 | }] 11 | 12 | # Name of your cluster 13 | cluster_name = "" -------------------------------------------------------------------------------- /gcp-examples/vm_instance_example/main.tf: -------------------------------------------------------------------------------- 1 | provider "google" { 2 | credentials = file(var.credential_file) 3 | project = var.project 4 | region = var.region 5 | zone = var.zone 6 | version = "~>v3.39.0" 7 | } 8 | 9 | resource "google_compute_instance" "google-vm" { 10 | name = var.deployment_name 11 | machine_type = "e2-micro" 12 | 13 | boot_disk { 14 | initialize_params 
{ 15 | image = "debian-cloud/debian-9" 16 | } 17 | } 18 | 19 | network_interface { 20 | network = "default" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /aws/file-output.tf: -------------------------------------------------------------------------------- 1 | resource "local_file" "hubploy_yaml" { 2 | filename = "hubploy.yaml" 3 | content = <.tfvars`, and modify the placeholders there 48 | as appropriate. 49 | 50 | #### 4. Run terraform! 51 | 52 | Once this is all done, you should: 53 | 54 | a. `cd aws` 55 | b. Run `terraform init` to set up appropriate plugins 56 | c. Run `terraform apply -var-file=.tfvars`, referring to 57 | The `tfvars` file you made in step 3 58 | d. Type `yes` when prompted 59 | e. ![Wait for a while](https://imgs.xkcd.com/comics/compiling.png). 60 | This could take a while! 61 | 62 | Your cluster is now set up! There are no hubs on it yet though. You should 63 | make a copy of the [hubploy template](https://github.com/yuvipanda/hubploy-template) 64 | repo, and go from there. 65 | -------------------------------------------------------------------------------- /aws/autoscaler.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | # Create IAM role + automatically make it available to cluster autoscaler service account 6 | module "iam_assumable_role_admin" { 7 | source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" 8 | version = "~> v2.6.0" 9 | create_role = true 10 | role_name = "${module.eks.cluster_id}-cluster-autoscaler" 11 | provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "") 12 | role_policy_arns = [aws_iam_policy.cluster_autoscaler.arn] 13 | oidc_fully_qualified_subjects = ["system:serviceaccount:kube-system:cluster-autoscaler-aws-cluster-autoscaler"] 14 | 15 | tags = { 16 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 17 | AutoTag_Creator = data.aws_caller_identity.current.arn 18 | } 19 | } 20 | 21 | resource "aws_iam_policy" "cluster_autoscaler" { 22 | name_prefix = "cluster-autoscaler" 23 | description = "EKS cluster-autoscaler policy for cluster ${module.eks.cluster_id}" 24 | policy = data.aws_iam_policy_document.cluster_autoscaler.json 25 | } 26 | 27 | data "aws_iam_policy_document" "cluster_autoscaler" { 28 | statement { 29 | sid = "clusterAutoscalerAll" 30 | effect = "Allow" 31 | 32 | actions = [ 33 | "autoscaling:DescribeAutoScalingGroups", 34 | "autoscaling:DescribeAutoScalingInstances", 35 | "autoscaling:DescribeLaunchConfigurations", 36 | "autoscaling:DescribeTags", 37 | "ec2:DescribeLaunchTemplateVersions", 38 | ] 39 | 40 | resources = ["*"] 41 | } 42 | 43 | statement { 44 | sid = "clusterAutoscalerOwn" 45 | effect = "Allow" 46 | 47 | actions = [ 48 | "autoscaling:SetDesiredCapacity", 49 | "autoscaling:TerminateInstanceInAutoScalingGroup", 50 | "autoscaling:UpdateAutoScalingGroup", 51 | ] 52 | 53 | resources = ["*"] 54 | 55 | condition { 56 | test = "StringEquals" 57 | variable = "autoscaling:ResourceTag/kubernetes.io/cluster/${module.eks.cluster_id}" 58 | values = ["owned"] 59 | } 60 | 61 | condition { 62 | test = "StringEquals" 63 | variable = "autoscaling:ResourceTag/k8s.io/cluster-autoscaler/enabled" 64 | values = ["true"] 65 | } 66 | } 67 | } 68 | resource "helm_release" "cluster-autoscaler" { 69 | name = "cluster-autoscaler" 70 | # Check that this is good, kube-system should already exist 71 | namespace = "kube-system" 72 | repository = 
data.helm_repository.stable.metadata[0].name 73 | chart = "cluster-autoscaler" 74 | version = "7.2.0" 75 | 76 | values = [ 77 | file("cluster-autoscaler-values.yml") 78 | ] 79 | 80 | # Terraform keeps this in state, so we get it automatically! 81 | set{ 82 | name = "awsRegion" 83 | value = var.region 84 | } 85 | 86 | set{ 87 | name = "autoDiscovery.clusterName" 88 | value = module.eks.cluster_id 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /aws-examples/minimal-deployment-tutorial/main.tf: -------------------------------------------------------------------------------- 1 | # Providers 2 | terraform { 3 | required_version = ">= 0.12.6" 4 | } 5 | 6 | provider "aws" { 7 | version = ">= 2.57" 8 | region = var.region 9 | profile = var.profile 10 | } 11 | 12 | provider "random" { 13 | version = "~> 2.1" 14 | } 15 | 16 | provider "local" { 17 | version = "~> 1.4" 18 | } 19 | 20 | provider "null" { 21 | version = "~> 2.1" 22 | } 23 | 24 | provider "template" { 25 | version = "~> 2.1" 26 | } 27 | 28 | # VPC 29 | data "aws_availability_zones" "available" {} 30 | 31 | module "vpc" { 32 | source = "terraform-aws-modules/vpc/aws" 33 | version = "~>2.44" 34 | name = "${var.deployment_name}vpc" 35 | 36 | cidr = "10.0.0.0/16" 37 | azs = data.aws_availability_zones.available.names 38 | private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"] 39 | public_subnets = ["10.0.101.0/24", "10.0.102.0/24", "10.0.103.0/24"] 40 | 41 | enable_dns_hostnames = true 42 | enable_dns_support = true 43 | enable_nat_gateway = true 44 | single_nat_gateway = true 45 | 46 | tags = { 47 | "kubernetes.io/cluster/${var.deployment_name}cluster" = "shared" 48 | } 49 | } 50 | 51 | # Kubernetes 52 | # After this is created, will need to set kubeconfig to look at this. 
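# (e.g. something like: aws eks update-kubeconfig --name <cluster-name> --region <region> --profile <profile>)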
53 | data "aws_eks_cluster" "cluster" { 54 | name = module.eks.cluster_id 55 | } 56 | 57 | data "aws_eks_cluster_auth" "cluster" { 58 | name = module.eks.cluster_id 59 | } 60 | 61 | provider "kubernetes" { 62 | host = data.aws_eks_cluster.cluster.endpoint 63 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 64 | token = data.aws_eks_cluster_auth.cluster.token 65 | load_config_file = false 66 | version = "~> 1.11" 67 | } 68 | 69 | # EKS Cluster 70 | module "eks" { 71 | source = "terraform-aws-modules/eks/aws" 72 | cluster_name = "${var.deployment_name}cluster" 73 | cluster_version = "1.15" 74 | 75 | subnets = module.vpc.private_subnets 76 | vpc_id = module.vpc.vpc_id 77 | enable_irsa = true 78 | 79 | cluster_endpoint_private_access = true 80 | 81 | #tags = {} 82 | 83 | worker_groups = [ 84 | { 85 | name = "core" 86 | asg_max_size = 1 87 | asg_min_size = 1 88 | asg_desired_capacity = 1 89 | instance_type = "t3.xlarge" 90 | subnets = [module.vpc.private_subnets[0]] 91 | 92 | # Use this to set labels / taints 93 | kubelet_extra_args = "--node-labels=node-role.kubernetes.io/core=core,hub.jupyter.org/node-purpose=core" 94 | 95 | #tags = {} 96 | }, 97 | { 98 | name = "user" 99 | instance_type = "m5.2xlarge" 100 | 101 | # Use this to set labels / taints 102 | kubelet_extra_args = "--node-labels=node-role.kubernetes.io/user=user,hub.jupyter.org/node-purpose=user" 103 | } 104 | ] 105 | } 106 | -------------------------------------------------------------------------------- /aws-examples/blog-post/aws/autoscaler.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | # Create IAM role + automatically make it available to cluster autoscaler service account 6 | module "iam_assumable_role_admin" { 7 | source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" 8 | version = "~> v2.6.0" 9 | create_role = true 10 | role_name = "cluster-autoscaler-blogpost" 11 | provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "") 12 | role_policy_arns = [aws_iam_policy.cluster_autoscaler.arn] 13 | oidc_fully_qualified_subjects = ["system:serviceaccount:kube-system:cluster-autoscaler-aws-cluster-autoscaler"] 14 | 15 | tags = { 16 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 17 | AutoTag_Creator = data.aws_caller_identity.current.arn 18 | } 19 | } 20 | 21 | resource "aws_iam_policy" "cluster_autoscaler" { 22 | name_prefix = "cluster-autoscaler" 23 | description = "EKS cluster-autoscaler policy for cluster ${module.eks.cluster_id}" 24 | policy = data.aws_iam_policy_document.cluster_autoscaler.json 25 | } 26 | 27 | data "aws_iam_policy_document" "cluster_autoscaler" { 28 | statement { 29 | sid = "clusterAutoscalerAll" 30 | effect = "Allow" 31 | 32 | actions = [ 33 | "autoscaling:DescribeAutoScalingGroups", 34 | "autoscaling:DescribeAutoScalingInstances", 35 | "autoscaling:DescribeLaunchConfigurations", 36 | "autoscaling:DescribeTags", 37 | "ec2:DescribeLaunchTemplateVersions", 38 | ] 39 | 40 | resources = ["*"] 41 | } 42 | 43 | statement { 44 | sid = "clusterAutoscalerOwn" 45 | effect = "Allow" 46 | 47 | actions = [ 48 | "autoscaling:SetDesiredCapacity", 49 | "autoscaling:TerminateInstanceInAutoScalingGroup", 50 | "autoscaling:UpdateAutoScalingGroup", 51 | ] 52 | 53 | resources = ["*"] 54 | 55 | condition { 56 | test = "StringEquals" 57 | variable = "autoscaling:ResourceTag/kubernetes.io/cluster/${module.eks.cluster_id}" 58 | 
values = ["owned"] 59 | } 60 | 61 | condition { 62 | test = "StringEquals" 63 | variable = "autoscaling:ResourceTag/k8s.io/cluster-autoscaler/enabled" 64 | values = ["true"] 65 | } 66 | } 67 | } 68 | resource "helm_release" "cluster-autoscaler" { 69 | name = "cluster-autoscaler" 70 | # Check that this is good, kube-system should already exist 71 | namespace = "kube-system" 72 | repository = data.helm_repository.stable.metadata[0].name 73 | chart = "cluster-autoscaler" 74 | 75 | # Terraform keeps this in state, so we get it automatically! 76 | set{ 77 | name = "cloudProvder" 78 | value = "aws" 79 | } 80 | 81 | set{ 82 | name = "awsRegion" 83 | value = var.region 84 | } 85 | 86 | set{ 87 | name = "autoDiscovery.clusterName" 88 | value = module.eks.cluster_id 89 | } 90 | 91 | set{ 92 | name = "rbac.create" 93 | value = true 94 | } 95 | 96 | set{ 97 | name = "rbac.serviceAccountAnnotations.eks\\.amazonaws\\.com/role-arn" 98 | value = module.iam_assumable_role_admin.this_iam_role_arn 99 | } 100 | } -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/autoscaler.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | # Create IAM role + automatically make it available to cluster autoscaler service account 6 | module "iam_assumable_role_admin" { 7 | source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" 8 | version = "~> v2.6.0" 9 | create_role = true 10 | role_name = "${var.name_prefix}cluster-autoscaler" 11 | provider_url = replace(module.eks.cluster_oidc_issuer_url, "https://", "") 12 | role_policy_arns = [aws_iam_policy.cluster_autoscaler.arn] 13 | oidc_fully_qualified_subjects = ["system:serviceaccount:kube-system:cluster-autoscaler-aws-cluster-autoscaler"] 14 | 15 | tags = { 16 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 17 | AutoTag_Creator = data.aws_caller_identity.current.arn 18 | Project = "${var.name_prefix}project" 19 | } 20 | } 21 | 22 | resource "aws_iam_policy" "cluster_autoscaler" { 23 | name_prefix = "cluster-autoscaler" 24 | description = "EKS cluster-autoscaler policy for cluster ${module.eks.cluster_id}" 25 | policy = data.aws_iam_policy_document.cluster_autoscaler.json 26 | } 27 | 28 | data "aws_iam_policy_document" "cluster_autoscaler" { 29 | statement { 30 | sid = "clusterAutoscalerAll" 31 | effect = "Allow" 32 | 33 | actions = [ 34 | "autoscaling:DescribeAutoScalingGroups", 35 | "autoscaling:DescribeAutoScalingInstances", 36 | "autoscaling:DescribeLaunchConfigurations", 37 | "autoscaling:DescribeTags", 38 | "ec2:DescribeLaunchTemplateVersions", 39 | ] 40 | 41 | resources = ["*"] 42 | } 43 | 44 | statement { 45 | sid = "clusterAutoscalerOwn" 46 | effect = "Allow" 47 | 48 | actions = [ 49 | "autoscaling:SetDesiredCapacity", 50 | "autoscaling:TerminateInstanceInAutoScalingGroup", 51 | "autoscaling:UpdateAutoScalingGroup", 52 | ] 53 | 54 | resources = ["*"] 55 | 56 | condition { 57 | test = "StringEquals" 58 | variable = "autoscaling:ResourceTag/kubernetes.io/cluster/${module.eks.cluster_id}" 59 | values = ["owned"] 60 | } 61 | 62 | condition { 63 | test = "StringEquals" 64 | variable = "autoscaling:ResourceTag/k8s.io/cluster-autoscaler/enabled" 65 | values = ["true"] 66 | } 67 | } 68 | } 69 | resource "helm_release" "cluster-autoscaler" { 70 | name = "cluster-autoscaler" 71 | # Check that this is good, kube-system should already exist 72 | namespace = 
"kube-system" 73 | repository = data.helm_repository.stable.metadata[0].name 74 | chart = "cluster-autoscaler" 75 | 76 | # Terraform keeps this in state, so we get it automatically! 77 | set{ 78 | name = "awsRegion" 79 | value = var.region 80 | } 81 | 82 | set{ 83 | name = "autoDiscovery.clusterName" 84 | value = module.eks.cluster_id 85 | } 86 | 87 | set { 88 | name = "cloudProvider" 89 | value = "aws" 90 | } 91 | 92 | set { 93 | name = "rbac.create" 94 | value = true 95 | } 96 | 97 | set { 98 | name = "rbac.serviceAccountAnnotations.eks\\.amazonaws\\.com/role-arn" 99 | value = module.iam_assumable_role_admin.this_iam_role_arn 100 | } 101 | } -------------------------------------------------------------------------------- /aws/iam.tf: -------------------------------------------------------------------------------- 1 | # Attached to deployers group to let them assume the role we need 2 | # Attached to hubploy-deployer role as well 3 | data "aws_iam_policy_document" "hubploy_deployers" { 4 | statement { 5 | sid = "1" 6 | actions = [ 7 | "sts:AssumeRole", 8 | ] 9 | resources = [ 10 | aws_iam_role.hubploy_eks.arn, 11 | aws_iam_role.hubploy_ecr.arn 12 | ] 13 | } 14 | } 15 | 16 | # Attached to group 17 | data "aws_iam_policy_document" "hubploy_eks" { 18 | statement { 19 | sid = "1" 20 | actions = [ 21 | "eks:DescribeCluster" 22 | ] 23 | resources = [ 24 | module.eks.cluster_arn 25 | ] 26 | } 27 | } 28 | 29 | # https://stackoverflow.com/questions/34922920/how-can-i-allow-a-group-to-assume-a-role 30 | data "aws_iam_policy_document" "hubploy_assumptions" { 31 | statement { 32 | principals { 33 | type = "AWS" 34 | identifiers = [ 35 | # Very icky, but see https://stackoverflow.com/questions/34922920/how-can-i-allow-a-group-to-assume-a-role 36 | "arn:aws:iam::${data.aws_caller_identity.current.account_id}:root" 37 | ] 38 | } 39 | actions = [ 40 | "sts:AssumeRole" 41 | ] 42 | 43 | } 44 | } 45 | 46 | 47 | 48 | resource "aws_iam_role" "hubploy_eks" { 49 | name = "${var.cluster_name}-hubploy-eks" 50 | assume_role_policy = data.aws_iam_policy_document.hubploy_assumptions.json 51 | } 52 | 53 | resource "aws_iam_policy" "hubploy_eks" { 54 | name = "${var.cluster_name}-hubploy-eks" 55 | description = "Just enough access to get EKS credentials" 56 | 57 | policy = data.aws_iam_policy_document.hubploy_eks.json 58 | } 59 | 60 | resource "aws_iam_role_policy_attachment" "hubploy_eks" { 61 | role = aws_iam_role.hubploy_eks.name 62 | policy_arn = aws_iam_policy.hubploy_eks.arn 63 | } 64 | 65 | resource "aws_iam_policy" "hubploy_deployers" { 66 | name = "${var.cluster_name}-hubploy-deployers" 67 | 68 | policy = data.aws_iam_policy_document.hubploy_deployers.json 69 | } 70 | resource "aws_iam_group" "hubploy_deployers" { 71 | name = "${var.cluster_name}-hubploy-deployers" 72 | } 73 | resource "aws_iam_group_policy_attachment" "hubploy_deployers" { 74 | group = aws_iam_group.hubploy_deployers.name 75 | policy_arn = aws_iam_policy.hubploy_deployers.arn 76 | } 77 | 78 | resource "aws_iam_role" "hubploy_ecr" { 79 | name = "${var.cluster_name}-hubploy-ecr" 80 | assume_role_policy = data.aws_iam_policy_document.hubploy_assumptions.json 81 | } 82 | 83 | resource "aws_iam_role_policy_attachment" "hubploy_ecr_policy_attachment" { 84 | role = aws_iam_role.hubploy_ecr.name 85 | # FIXME: Restrict resources to the ECR repository we created 86 | policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryPowerUser" 87 | } 88 | 89 | 90 | data "aws_iam_policy_document" "hubploy_deployer_ec2_policy" { 91 | statement { 92 | sid = 
"1" 93 | actions = [ 94 | "sts:AssumeRole", 95 | ] 96 | principals { 97 | type = "Service" 98 | identifiers = [ 99 | "ec2.amazonaws.com" 100 | ] 101 | } 102 | } 103 | } 104 | 105 | resource "aws_iam_role" "hubploy_deployer" { 106 | name = "${var.cluster_name}-hubploy-deployer" 107 | assume_role_policy = data.aws_iam_policy_document.hubploy_deployer_ec2_policy.json 108 | } 109 | 110 | resource "aws_iam_policy" "hubploy_deployer" { 111 | name = "${var.cluster_name}-hubploy-deployer" 112 | policy = data.aws_iam_policy_document.hubploy_deployers.json 113 | } 114 | 115 | resource "aws_iam_role_policy_attachment" "hubploy_deployer" { 116 | role = aws_iam_role.hubploy_deployer.name 117 | policy_arn = aws_iam_policy.hubploy_deployer.arn 118 | } 119 | -------------------------------------------------------------------------------- /aws/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | provider "aws" { 6 | version = ">= 2.28.1" 7 | region = var.region 8 | } 9 | 10 | provider "template" { 11 | version = "~> 2.1" 12 | } 13 | 14 | data "aws_caller_identity" "current" {} 15 | 16 | data "aws_eks_cluster" "cluster" { 17 | name = module.eks.cluster_id 18 | } 19 | 20 | data "aws_eks_cluster_auth" "cluster" { 21 | name = module.eks.cluster_id 22 | } 23 | 24 | provider "kubernetes" { 25 | host = data.aws_eks_cluster.cluster.endpoint 26 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 27 | token = data.aws_eks_cluster_auth.cluster.token 28 | load_config_file = false 29 | version = "~> 1.11.1" 30 | } 31 | 32 | data "aws_availability_zones" "available" { 33 | } 34 | 35 | module "vpc" { 36 | source = "terraform-aws-modules/vpc/aws" 37 | version = "~> 2.6" 38 | 39 | name = "${var.cluster_name}-vpc" 40 | cidr = "172.16.0.0/16" 41 | azs = data.aws_availability_zones.available.names 42 | # We can use private subnets too once https://github.com/aws/containers-roadmap/issues/607 43 | # is fixed 44 | public_subnets = ["172.16.1.0/24", "172.16.2.0/24", "172.16.3.0/24"] 45 | enable_dns_hostnames = true 46 | enable_dns_support = true 47 | 48 | tags = { 49 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 50 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 51 | AutoTag_Creator = data.aws_caller_identity.current.arn 52 | } 53 | 54 | public_subnet_tags = { 55 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 56 | "kubernetes.io/role/elb" = "1" 57 | } 58 | 59 | private_subnet_tags = { 60 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 61 | "kubernetes.io/role/internal-elb" = "1" 62 | } 63 | } 64 | 65 | module "eks" { 66 | source = "terraform-aws-modules/eks/aws" 67 | cluster_name = var.cluster_name 68 | cluster_version = "1.15" 69 | # FIXME: We can use private subnets once https://github.com/aws/containers-roadmap/issues/607 70 | # is fixed 71 | subnets = module.vpc.public_subnets 72 | vpc_id = module.vpc.vpc_id 73 | enable_irsa = true 74 | 75 | tags = { 76 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 77 | AutoTag_Creator = data.aws_caller_identity.current.arn 78 | } 79 | 80 | node_groups_defaults = { 81 | ami_type = "AL2_x86_64" 82 | disk_size = 50 83 | } 84 | 85 | node_groups = { 86 | core = { 87 | desired_capacity = 1 88 | max_capacity = 3 89 | min_capacity = 1 90 | instance_type = "t3.micro" 91 | k8s_labels = { 92 | "hub.jupyter.org/node-purpose" = "core" 93 | } 94 | additional_tags = { 95 | } 96 | } 97 | notebook = { 
98 | desired_capacity = 1 99 | max_capacity = 10 100 | min_capacity = 1 101 | 102 | instance_type = "t3.medium" 103 | k8s_labels = { 104 | "hub.jupyter.org/node-purpose" = "user" 105 | } 106 | additional_tags = { 107 | } 108 | } 109 | } 110 | 111 | map_accounts = var.map_accounts 112 | map_users = var.map_users 113 | 114 | 115 | map_roles = concat([{ 116 | rolearn = aws_iam_role.hubploy_eks.arn 117 | username = aws_iam_role.hubploy_eks.name 118 | # FIXME: Narrow these permissions down? 119 | groups = ["system:masters"] 120 | }], var.map_roles) 121 | } 122 | 123 | 124 | provider "helm" { 125 | kubernetes { 126 | host = data.aws_eks_cluster.cluster.endpoint 127 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 128 | token = data.aws_eks_cluster_auth.cluster.token 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | provider "google-beta" { 6 | credentials = var.credential_file 7 | project = var.project 8 | region = var.region 9 | zone = var.zone 10 | version = "~>v3.39.0" 11 | } 12 | 13 | provider "google" { 14 | credentials = var.credential_file 15 | project = var.project 16 | region = var.region 17 | zone = var.zone 18 | } 19 | 20 | module "vpc" { 21 | source = "terraform-google-modules/network/google" 22 | version = "~>2.3" 23 | 24 | project_id = var.project 25 | network_name = "${var.deployment_name}-network" 26 | 27 | subnets = [ 28 | { 29 | subnet_name = "${var.deployment_name}-subnet-0", 30 | subnet_ip = "10.0.0.0/16", 31 | subnet_region = var.region 32 | } 33 | ] 34 | 35 | secondary_ranges = { 36 | "${var.deployment_name}-subnet-0" = [ 37 | { 38 | range_name = "us-west2-gke-pods" 39 | ip_cidr_range = "192.168.0.0/18" 40 | }, 41 | { 42 | range_name = "us-west2-gke-services" 43 | ip_cidr_range = "192.168.64.0/18" 44 | }, 45 | ] 46 | } 47 | } 48 | 49 | module "gke" { 50 | source = "terraform-google-modules/kubernetes-engine/google//modules/beta-public-cluster" 51 | project_id = var.project 52 | name = "${var.deployment_name}-cluster" 53 | region = var.region 54 | #zone = var.zone 55 | network = module.vpc.network_name 56 | subnetwork = module.vpc.subnets_names[0] 57 | ip_range_pods = "us-west2-gke-pods" 58 | ip_range_services = "us-west2-gke-services" 59 | create_service_account = false 60 | remove_default_node_pool = true 61 | disable_legacy_metadata_endpoints = false 62 | #cluster_autoscaling = var.cluster_autoscaling 63 | 64 | node_pools = [ 65 | { 66 | name = "scheduler-pool" 67 | machine_type = "n1-standard-8" 68 | min_count = 0 69 | max_count = 2 70 | #service_account = var.compute_engine_service_account 71 | preemptible = true 72 | }, 73 | { 74 | name = "worker-pool" 75 | machine_type = "n1-standard-8" 76 | min_count = 0 77 | max_count = 40 78 | #service_account = var.compute_engine_service_account 79 | preemptible = true 80 | }, 81 | { 82 | name = "gateway" 83 | machine_type = "n1-standard-8" 84 | auto_upgrade = true 85 | initial_node_count = 1 86 | preemptible = false 87 | } 88 | ] 89 | 90 | #node_pools_metadata = {} 91 | 92 | node_pools_labels = { 93 | all = { 94 | all-pools-example = true, 95 | Owner = "salvis", 96 | Project = "gke-terraform-test-cluster", 97 | } 98 | } 99 | 100 | node_pools_taints = { 101 | all = [ 102 | { 103 | key = "all-pools-example" 104 | value = true 105 | effect = 
"PREFER_NO_SCHEDULE" 106 | }, 107 | ] 108 | scheduler-pool = [ 109 | { 110 | key = "k8s.dask.org/dedicated" 111 | value = "scheduler" 112 | effect = "NO_SCHEDULE" 113 | }, 114 | ] 115 | worker-pool = [ 116 | { 117 | key = "k8s.dask.org/dedicated" 118 | value = "worker" 119 | effect = "NO_SCHEDULE" 120 | }, 121 | ] 122 | } 123 | } 124 | 125 | resource "kubernetes_cluster_role_binding" "example" { 126 | metadata { 127 | name = "terraform-clusterrole-binding" 128 | } 129 | 130 | role_ref { 131 | api_group = "rbac.authorization.k8s.io" 132 | kind = "ClusterRole" 133 | name = "cluster-adminm" 134 | } 135 | subject { 136 | kind = "User" 137 | name = "admin" 138 | api_group = "rbac.authorization.k8s.io" 139 | } 140 | subject { 141 | kind = "ServiceAccount" 142 | name = "default" 143 | namespace = "kube-system" 144 | } 145 | subject { 146 | kind = "Group" 147 | name = "system:masters" 148 | api_group = "rbac.authorization.k8s.io" 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/dask-gateway-test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dask-Gateway Connection Example\n", 8 | "\n", 9 | "In this notebook, we will connect to the gateway, create a cluster, spin up some workers, and run a trivial computation." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from dask_gateway import Gateway, BasicAuth\n", 19 | "import getpass\n", 20 | "import dask\n", 21 | "from distributed import Client" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Authenticate to the Gateway\n", 29 | "\n", 30 | "Let's create a basic auth object to connect to the cluster. There is no real authentication that needs to be done, so any password will work." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "password = getpass.getpass()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "auth = BasicAuth(\"pangeo\", password)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "The address you need in the quotes can be found with\n", 56 | "```\n", 57 | "kubectl get svc -n dask-gateway traefik-dask-gateway\n", 58 | "```\n", 59 | "The entry for `EXTERNAL-IP` is what you want after `http://`." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "gateway = Gateway(\"http://\", auth=auth)\n", 69 | "gateway.list_clusters()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# No cluster options are specified here, but this would show them if they existed.\n", 79 | "gateway.cluster_options()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Spin up Cluster\n", 87 | "\n", 88 | "Once the cluster is running, we can ask it to provision workers." 
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "cluster = gateway.new_cluster()\n", 98 | "client = Client(cluster, set_as_default=False)\n", 99 | "client" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "cluster.adapt(minimum=2,maximum=5)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "The cell below should show that the cluster now has workers. To inspect the cluster further, you can click the Dask symbol on the left sidebar (hovering your mouse over it should make the word \"Dask\" pop up). From there, copy the dashboard link from the Client into the search bar, click the magnifying glass search button, and then click on the orange window buttons beneath to see more." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Check that the cluster has scaled\n", 125 | "client" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Trivial Computation" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "import dask.array as da" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "a = da.random.normal(size=(1000,1000), chunks=(500,500))" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "a.mean().compute()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Shutdown the Cluster" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "client.close()\n", 176 | "cluster.close()" 177 | ] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.7.8" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/oceanhackweek2020-notes.md: -------------------------------------------------------------------------------- 1 | # Notes on JupyterHub Infrastructure for OceanHackWeek 2020 2 | 3 | Relevant Links: 4 | - [JupyterHub Configuration](https://github.com/oceanhackweek/jupyterhub) 5 | - [Infrastructure Configuration](https://github.com/oceanhackweek/ohw-terraform-deploy/tree/main) 6 | - [Computational Environment](https://github.com/oceanhackweek/jupyter-image) 7 | 8 | ### Versions: 9 | - Terraform version: [12.6](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/main.tf#L2) 10 | - Helm Version: 3. Minor version is not recalled or recorded. 
11 | - JupyterHub Helm chart version: [v0.9.0-beta.3.n030.h796379e](https://github.com/oceanhackweek/jupyterhub/blob/staging/hub/requirements.yaml) 12 | - Prometheus Helm chart version: [11.2.1](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/monitoring.tf#L25) 13 | - Grafana Helm chart version: [5.0.24](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/monitoring.tf#L44) 14 | - Cluster-autoscaler Helm chart version: [unspecified](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/autoscaler.tf#L74) 15 | - EFS-Provisioner Helm chart version: [0.11.0](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/efs.tf#L64) 16 | - AWS Node Termination Helm chart version: [unspecified](https://github.com/oceanhackweek/ohw-terraform-deploy/blob/main/aws/aws-node-termination-handler.tf) 17 | 18 | ### Stand up JupyterHub, Notes 19 | 20 | Make new `terraform-bot` as `ohw-terraform-bot`. Command to use 21 | default profile is 22 | 23 | ``` 24 | terraform apply -var-file=../../ohw.tfvars -var 'profile=default' 25 | ``` 26 | 27 | Make access keys for `ohw-terraform-bot`, test with 28 | 29 | ``` 30 | aws sts get-caller-identity --profile ohw-terraform-bot 31 | ``` 32 | 33 | Couldn't figure out permissions for version control, added 34 | permissions for S3 encryption. 35 | 36 | VPC tags references EKS cluster id and created a cycle bc the vpc 37 | should be created first. Referenced cluster_id in the tags via the 38 | same string interpolation used to create the eks cluster 39 | 40 | ``` 41 | terraform apply -var-file=../../ohw.tfvars -var-file=../../supprt/secrets.tfvars 42 | ``` 43 | 44 | Login to the bastion instance, run 45 | 46 | ```bash 47 | cd /mnt/efs 48 | sudo mkdir ocean.hackweek.io 49 | cd ocean.hackweek.io 50 | sudo mkdir tutorial-data 51 | sudo mkdir shared 52 | ``` 53 | 54 | Trying GitHub auth for Grafana. Works to get auth for anyone in the 55 | GitHub org. Can't get teams to be responsive yet. 56 | 57 | ### Secondary S3 Bucket 58 | 59 | Separate folder for terraform state since it needs a different aws 60 | provider w/ different region. 61 | 62 | Create bucket and equivalent access policy (like the first bucket). 63 | Need to attach this new policy to the role created in 64 | `s3-data-bucket.tf`, so this would generally be run after the main 65 | bunch of Terraform `apply`ing. 66 | 67 | ### Costs 68 | 69 | Large charge on Aug 12th, EC2-Other. API Operation for this is 70 | listed as NatGateway. Data transfer from somewhere? 71 | 72 | Interesting tiny discrepancy in cost explorer: total costs and EC2 73 | Other costs are 8 cents lower when filtering by 74 | "Project=ohw-project" after having filtered for 75 | "Owner=ohw-terraform-bot" 76 | 77 | ### Closing Thoughts 78 | 79 | Didn't have others hooked into infrastructure at all. No testing if 80 | they could actually alter it. Should have a process to set this up 81 | at the beginning. 82 | 83 | Grafana with GitHub login was configured and worked great! Docs for 84 | that are here; https://grafana.com/docs/grafana/latest/auth/github/ 85 | Others used Grafana with GitHub login just fine, didn't mind being 86 | Viewers and not Editors. 87 | 88 | I have been pushing directly to staging, which is easier for me but 89 | probably unprofessional. 90 | 91 | ### OHW Closing Thoughts Call: Infrastructure 92 | 93 | - Should have a deadline for the packages that will be used in the 94 | hub. 
95 | - Consider higher memory nodes (so participants have more memory) 96 | - Should have Grafana data saving as part of the shutdown process. 97 | 98 | ### Tear Down 99 | 100 | Shutdown commands, as I ran them, plus notes: 101 | 102 | ```bash 103 | helm delete ohw-hub-staging -n ohw-hub-staging 104 | helm delete ohw-hub-prod -n ohw-hub-prod 105 | cd cloud-infrastructure/aws/secondary-bucket 106 | terraform destroy --var-file ../../../secondary-s3-bucket.tfvars 107 | yes 108 | [1][2] 109 | cd .. 110 | terraform destroy --var-file ../../ohw.tfvars 111 | yes 112 | [3] 113 | cd ../s3-backend 114 | terraform destroy --var-file ../../ohw.tfvars 115 | [4] 116 | cd ../aws-creds 117 | terraform destroy 118 | [5] 119 | ``` 120 | 121 | [1] I can't delete the policy version, needed to add that to the IAM user. 122 | - Can't add that, delete the policy manually. 123 | 124 | [2] Bucket isn't empty, delete manually. 125 | 126 | [3] Bucket isn't empty, delete manually, then re-run the command. 127 | 128 | [4] Error deleting an object version, Access Denied, delete manually. 129 | 130 | [5] Need to delete access keys manually, then re-run command. 131 | 132 | Updated github with config for future reference. 133 | 134 | Removed DNS records 135 | 136 | ### Time Spent on this 137 | 138 | - (0.5 + 3 hours) infrastructure standup 139 | - (3 hours) JupyterHub standup 140 | - (2 hours) Image setup 141 | - (4 hours) maintenance 142 | - (0.5 hours) secondary S3 bucket 143 | - (2 hours) closing thoughts call 144 | - (1 hour) jupyterhub teardown 145 | 146 | ### Usage Plots 147 | 148 | ![Ocean Hack Week Users over Time](ohw_users_over_time.png) 149 | 150 | ![Ocean Hack Week Memory Usage over Time](ohw_memory_usage_comparison.png) 151 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/README.md: -------------------------------------------------------------------------------- 1 | # Hackweek Example Infrastructure 2 | 3 | This is an example infrastructure configuration that can support a 4 | JupyterHub for the purpose of a hackweek. Two hackweeks have been 5 | supported in this manner. Each of them has a repo for its infrastructure: 6 | - [ICESat-2 Hackweek 2020](https://github.com/ICESAT-2HackWeek/terraform-deploy/tree/master) 7 | - [OceanHackWeek 2020](https://github.com/oceanhackweek/ohw-terraform-deploy/tree/main) 8 | 9 | This repository has been used with the 10 | [hackweek-template](https://github.com/salvis2/hackweek-template) 11 | repository, which deploys the JupyterHub onto the cluster using 12 | [`hubploy`](https://github.com/yuvipanda/hubploy). The instructions for 13 | deploying the JupyterHub are present in the `hackweek-template` repo. 14 | 15 | This repo has three folders, each of which serves a specific purpose: 16 | - `s3-backend/`: Set up an S3 bucket to hold Terraform's configuration 17 | for the other two folders. 18 | - `iam-permissions/`: Set up an IAM user to have all of the permissions 19 | Terraform needs to deploy this infrastructure. 20 | - `infrastructure/`: The infrastructure's configuration. 21 | 22 | ## S3 Backend 23 | 24 | The backend is optional. If you are managing the infrastructure alone, 25 | you probably don't need a backend. If this is the case, you can delete 26 | the `backend` blocks in `infrastructure/main.tf` and 27 | `iam-permissions/iam.tf`. 28 | 29 | If you are working with multiple people to manage the infrastructure, 30 | it is highly recommended to have a backend. 
Terraform only understands 31 | what it has deployed because of its state file. The backend stores the 32 | state file encrypted on S3, allowing multiple people to work on the 33 | configuration without having to check Terraform state files into version 34 | control. Since the state file contains all secrets in plain text, 35 | checking it into version control would immediately release all your 36 | secrets to the internet. Don't do it. 37 | 38 | If you need to change some of the variables in `s3-backend/variables.tf`, 39 | you can make a `.tfvars` files with some new values. If you do this, 40 | put the `--var-file=.tfvars` flag after any 41 | `terraform plan`, `terraform apply`, or `terraform destroy` command. 42 | 43 | Build the backend with Terraform: 44 | 45 | ``` 46 | cd s3-backend 47 | terraform init 48 | terraform apply 49 | ``` 50 | 51 | ## IAM Permissions 52 | 53 | This section is also optional. If you have an IAM profile with `admin` 54 | privileges, you can use your own AWS profile. You will need to supply 55 | this profile name in `infrastructure/.tfvars`. 56 | 57 | If you changed the S3 backend bucket's name, change it in the `backend` 58 | block in `iam-permissions/iam.tf`. It must be hard-coded, so passing 59 | a variable to this key is not possible. 60 | 61 | Create the user with: 62 | 63 | ``` 64 | cd iam-permissions 65 | terraform init 66 | terraform apply 67 | ``` 68 | 69 | You will then have to manually create keys for this user in the 70 | [IAM section of the AWS Console](https://console.aws.amazon.com/iam/). 71 | Configure `awscli` to use these keys with 72 | `aws configure --profile=`. 73 | 74 | If you add infrastructure to the `infrastructure` folder, you may need 75 | to add permissions to this user in order for Terraform to run. You 76 | can rerun the above commands again to update the user's permissions. 77 | 78 | ## Infrastructure 79 | 80 | The infrastructure has the following pieces: 81 | - VPC configured to use private subnets behind public subnets and 82 | routing. 83 | - EKS Cluster with 3 nodegroups, configured for spot instances 84 | and autoscaling. 85 | - `cluster-autoscaler` release to enable autoscaling. 86 | - `aws-node-termination-handler` release for increased quality-of-life 87 | with spot instancing. 88 | - EFS for user home directory storage, hub shared storage, and 89 | read-only storage. 90 | - Bastion machine to connect to the EFS (especially for putting data 91 | into the read-only storage). 92 | - S3 data bucket for hub shared storage. 93 | - Monitoring deployment of Prometheus and Grafana. 94 | 95 | Replacing variable values is recommended, so 96 | `your-cluster.tfvars.template` is provided to show the options. You 97 | should copy the content of this file to a new file, 98 | `.tfvars`, and input your values. Putting more entries 99 | into `map_users` will allow multiple `aws` users to configure the 100 | cluster. 101 | 102 | Replacing the Grafana `adminUser` and `adminPassword` are 103 | highly recommended, but the `.tfvars` files with their values 104 | should **NOT** be checked into version control without encryption. 105 | 106 | If you have multiple `.tfvars` files, you can supply them to 107 | Terraform with extra `--var-file` flags, one per `.tfvars` file. 108 | 109 | Deploy the infrastructure with: 110 | 111 | ``` 112 | cd infrastructure 113 | terraform init 114 | terraform apply --var-file=.tfvars 115 | ``` 116 | 117 | This can take 10-15 minutes to create. Sometimes, `helm` resources can 118 | stall out. 
You should be able to run the `apply` command again and get 119 | it to succeed. You can now put other things on the cluster, like a 120 | JupyterHub! 121 | 122 | ## Tear-Down 123 | 124 | Remove the infrastructure in the reverse order that you created it. 125 | Before doing anything with Terraform, you must remove anything you put 126 | onto the cluster that you didn't put there with Terraform. 127 | 128 | Remove infrastructure: 129 | 130 | ``` 131 | cd infrastructure 132 | terraform destroy --var-file=.tfvars 133 | # wait 134 | 135 | cd iam-permissions 136 | terraform destroy 137 | 138 | cd s3-backend 139 | terraform destroy 140 | ``` 141 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/efs.tf: -------------------------------------------------------------------------------- 1 | resource "aws_efs_file_system" "home_dirs" { 2 | tags = { 3 | Name = "${var.name_prefix}home-dirs" 4 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 5 | AutoTag_Creator = data.aws_caller_identity.current.arn 6 | Project = "${var.name_prefix}project" 7 | } 8 | } 9 | 10 | 11 | resource "aws_security_group" "home_dirs_sg" { 12 | name = "${var.name_prefix}home_dirs_sg" 13 | vpc_id = module.vpc.vpc_id 14 | 15 | # NFS 16 | ingress { 17 | 18 | # FIXME: Is ther a way to do this without CIDR block copy/pasta 19 | cidr_blocks = [ "172.16.0.0/16"] 20 | # FIXME: Do we need this security_groups here along with cidr_blocks 21 | security_groups = [ module.eks.worker_security_group_id ] 22 | from_port = 2049 23 | to_port = 2049 24 | protocol = "tcp" 25 | } 26 | 27 | tags = { 28 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 29 | AutoTag_Creator = data.aws_caller_identity.current.arn 30 | Project = "${var.name_prefix}project" 31 | } 32 | } 33 | 34 | resource "aws_efs_mount_target" "home_dirs_targets" { 35 | count = length(module.vpc.private_subnets) 36 | file_system_id = aws_efs_file_system.home_dirs.id 37 | subnet_id = module.vpc.private_subnets[count.index] 38 | security_groups = [ aws_security_group.home_dirs_sg.id ] 39 | } 40 | 41 | data "helm_repository" "stable" { 42 | name = "stable" 43 | url = "https://kubernetes-charts.storage.googleapis.com" 44 | } 45 | 46 | resource "kubernetes_namespace" "support" { 47 | metadata { 48 | name = "support" 49 | } 50 | } 51 | 52 | resource "helm_release" "efs-provisioner" { 53 | name = "efs-provisioner" 54 | namespace = kubernetes_namespace.support.metadata.0.name 55 | repository = data.helm_repository.stable.metadata[0].name 56 | chart = "efs-provisioner" 57 | version = "0.11.0" 58 | 59 | set{ 60 | name = "efsProvisioner.efsFileSystemId" 61 | value = aws_efs_file_system.home_dirs.id 62 | } 63 | 64 | set { 65 | name = "efsProvisioner.awsRegion" 66 | value = var.region 67 | } 68 | 69 | set { 70 | # We don't entirely know the effects of dynamic gid allocation, 71 | # particularly on the ability to re-use EFS when we recreate 72 | # clusters. Turn it off for now. 
73 | name = "efsProvisioner.storageClass.gidAllocate.enabled" 74 | value = false 75 | } 76 | 77 | set { 78 | name = "efsProvisioner.path" 79 | value = "/home-directories" 80 | } 81 | 82 | set { 83 | name = "efsProvisioner.provisionerName" 84 | value = "aws.amazon.com/efs" 85 | } 86 | } 87 | 88 | resource "kubernetes_persistent_volume" "shared-efs-volume" { 89 | metadata { 90 | name = "${var.name_prefix}staging-shared-nfs" 91 | } 92 | 93 | spec { 94 | capacity = { 95 | storage = "1Mi" 96 | } 97 | access_modes = ["ReadWriteMany"] 98 | persistent_volume_source { 99 | nfs { 100 | server = aws_efs_file_system.home_dirs.dns_name 101 | path = "/icesat-2.hackweek.io/shared/" 102 | } 103 | } 104 | storage_class_name = "manual-sc" 105 | } 106 | } 107 | 108 | resource "kubernetes_persistent_volume_claim" "shared-efs-claim" { 109 | metadata { 110 | name = "shared-nfs" 111 | namespace = "hackweek-hub-staging" 112 | } 113 | 114 | spec { 115 | access_modes = ["ReadWriteMany"] 116 | resources { 117 | requests = { 118 | storage = "1Mi" 119 | } 120 | } 121 | volume_name = kubernetes_persistent_volume.shared-efs-volume.metadata.0.name 122 | storage_class_name = "manual-sc" 123 | } 124 | 125 | wait_until_bound = false 126 | } 127 | 128 | resource "kubernetes_persistent_volume" "shared-efs-volume-prod" { 129 | metadata { 130 | name = "${var.name_prefix}prod-shared-nfs" 131 | } 132 | 133 | spec { 134 | capacity = { 135 | storage = "1Mi" 136 | } 137 | access_modes = ["ReadWriteMany"] 138 | persistent_volume_source { 139 | nfs { 140 | server = aws_efs_file_system.home_dirs.dns_name 141 | path = "/icesat-2.hackweek.io/shared/" 142 | } 143 | } 144 | storage_class_name = "manual-sc" 145 | } 146 | } 147 | 148 | resource "kubernetes_persistent_volume_claim" "shared-efs-claim-prod" { 149 | metadata { 150 | name = "shared-nfs" 151 | namespace = "hackweek-hub-prod" 152 | } 153 | 154 | spec { 155 | access_modes = ["ReadWriteMany"] 156 | resources { 157 | requests = { 158 | storage = "1Mi" 159 | } 160 | } 161 | volume_name = kubernetes_persistent_volume.shared-efs-volume-prod.metadata.0.name 162 | storage_class_name = "manual-sc" 163 | } 164 | 165 | wait_until_bound = false 166 | } 167 | 168 | resource "kubernetes_persistent_volume" "tutorial-data-volume" { 169 | metadata { 170 | name = "${var.name_prefix}tutorial-data-volume" 171 | } 172 | 173 | spec { 174 | capacity = { 175 | storage = "1Mi" 176 | } 177 | access_modes = ["ReadOnlyMany"] 178 | 179 | persistent_volume_source { 180 | nfs { 181 | server = aws_efs_file_system.home_dirs.dns_name 182 | path = "/icesat-2.hackweek.io/tutorial-data/" 183 | read_only = true 184 | } 185 | } 186 | storage_class_name = "manual-sc" 187 | } 188 | } 189 | 190 | resource "kubernetes_persistent_volume_claim" "tutorial-data-claim" { 191 | metadata { 192 | name = "${var.name_prefix}tutorial-data-claim" 193 | namespace = "hackweek-hub-prod" 194 | } 195 | 196 | spec { 197 | access_modes = ["ReadOnlyMany"] 198 | 199 | resources { 200 | requests = { 201 | storage = "1Mi" 202 | } 203 | } 204 | volume_name = kubernetes_persistent_volume.tutorial-data-volume.metadata.0.name 205 | storage_class_name = "manual-sc" 206 | } 207 | 208 | wait_until_bound = false 209 | } -------------------------------------------------------------------------------- /gcp-examples/gke-dask-gateway/README.md: -------------------------------------------------------------------------------- 1 | # Google Cloud VM Example 2 | 3 | Deploy a network and a GKE cluster on Google Cloud via 4 | Terraform. 
This infrastructure will host a deployment of 5 | [`dask-gateway`](https://gateway.dask.org/). 6 | 7 | ## Setup 8 | 9 | Download / Configure the following 10 | 11 | - [Terraform](https://www.terraform.io/downloads.html) 12 | - gcloud: 13 | - [Install](https://cloud.google.com/sdk/docs/install) 14 | - [Configure](https://cloud.google.com/sdk/docs/initializing) 15 | - Create and download a 16 | [service account key](https://cloud.google.com/iam/docs/creating-managing-service-account-keys) 17 | - [Helm](https://helm.sh/docs/intro/quickstart/) 18 | 19 | If you want to run the example notebook to connect to the 20 | gateway, you will also need: 21 | 22 | - [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) 23 | 24 | You will also need to add the `conda-forge` channel. 25 | 26 | ``` 27 | conda config --add channels conda-forge 28 | ``` 29 | 30 | ## Deployment 31 | 32 | Input variables into `your-cluster.tfvars` if you want 33 | options that differ from the defaults in `variables.tf`. 34 | Using these variables will require adding the following flag 35 | to all `terraform` commands: `--var-file=your-cluster.tfvars`. 36 | 37 | In particular, you will need to supply the filepath of your 38 | service account key to the `credential_file` input variable. 39 | 40 | And yes, you need both the `google` and the `google-beta` 41 | providers. 42 | 43 | Once you are ready to deploy, you can look at the plan with: 44 | 45 | ``` 46 | terraform plan --var-file your-cluster.tfvars 47 | ``` 48 | 49 | Deploy the network and cluster with: 50 | 51 | ``` 52 | terraform apply --var-file your-cluster.tfvars 53 | ``` 54 | 55 | ## Install `dask-gateway` 56 | 57 | To install `dask-gateway` onto the cluster, there are a few 58 | steps: 59 | - Make sure Helm is installed (listed above) 60 | - Update your `kubeconfig` file with: 61 | ``` 62 | gcloud container clusters get-credentials --region 63 | ``` 64 | - Add your `gcloud` credentials to kubernetes as a secret. 65 | This is the same key referenced above in "create and download 66 | a service account key." 67 | ``` 68 | kubectl -n dask-gateway create secret generic dask-worker-sa-key --from-file <~/path/to/key/file.json> 69 | ``` 70 | - Install `dask-gateway` 71 | ``` 72 | kubectl create ns dask-gateway 73 | helm repo add daskgateway https://dask.org/dask-gateway-helm-repo/ 74 | helm repo update 75 | helm upgrade --install -n dask-gateway --version 0.8.0 --values dask-gateway-config.yaml dask-gateway daskgateway/dask-gateway 76 | ``` 77 | 78 | ## Connecting to the Gateway 79 | 80 | As an example to connect to and use the gateway, there is 81 | a conda environment and a Jupyter notebook. 82 | 83 | A conda environment file is provided: 84 | `dask-gateway-test-env.yaml`. We use it to match the package 85 | versions that will be present on the dask cluster. To 86 | activate this environment, use 87 | 88 | ``` 89 | conda env create -f dask-gateway-test-env.yaml 90 | conda activate dask-gateway-test-env 91 | jupyter labextension install @jupyter-widgets/jupyterlab-manager 92 | jupyter labextension install dask-labextension 93 | jupyter serverextension enable dask_labextension 94 | ``` 95 | 96 | There is a Jupyter Notebook provided as an example of how 97 | to connect to the gateway. 
Launch it with 98 | 99 | ``` 100 | jupyter lab 101 | ``` 102 | 103 | Once you are done with the notebook, you should run the 104 | last cell that has 105 | 106 | ``` 107 | client.close() 108 | cluster.close() 109 | ``` 110 | 111 | to shut down your `dask-gateway` cluster. Then shut down 112 | the JupyterLab session at File > Shut Down. 113 | 114 | To remove this environment, run 115 | 116 | ``` 117 | conda deactivate 118 | conda remove --name dask-gateway-test-env --all 119 | ``` 120 | 121 | More general information on connecting to the gateway is in 122 | the `dask` documentation: 123 | https://gateway.dask.org/install-kube.html#connecting-to-the-gateway 124 | 125 | Other information for using the gateway for computations 126 | is in the `dask` documentation as well: 127 | https://gateway.dask.org/usage.html 128 | 129 | ### Notes on this `dask-gateway` Deployment 130 | 131 | This deployment is not secure. Anyone with the IP address 132 | of the dask cluster can get into it and run computations. 133 | 134 | ## Tear Down `dask-gateway` 135 | 136 | This must be performed before trying to tear down the 137 | infrastructure, otherwise the `terraform destroy` command 138 | will fail. 139 | 140 | ``` 141 | helm delete dask-gateway -n dask-gateway 142 | ``` 143 | 144 | It may take a few moments after the command returns for 145 | 146 | ``` 147 | kubectl get svc -n dask-gateway 148 | ``` 149 | 150 | to show that all the `LoadBalancer`s are cleaned up. Do not 151 | move on until the above comand tells you 152 | 153 | ``` 154 | No resources found in dask-gateway namespace. 155 | ``` 156 | 157 | ## Tear Down the Infrastructure 158 | 159 | Remove the network and cluster with: 160 | 161 | ``` 162 | terraform destroy --var-file your-cluster.tfvars 163 | ``` 164 | 165 | Your `kubeconfig` file will still have the information for the 166 | cluster until you manually delete it. You can remove it as 167 | follows: 168 | 169 | ``` 170 | kubectl config delete-cluster 171 | kubectl config delete-context 172 | kubectl config unset users. 173 | ``` 174 | 175 | You can get those variables with the corresponding commands: 176 | 177 | - `your-cluster-arn`: `kubectl config get-clusters` 178 | - `your-cluster-context`: `kubectl config get-contexts` 179 | - `user-name`: `kubectl config view`, the name you want will 180 | look something like 181 | `arn:aws:eks:us-west-2:############:cluster/`. 
182 | 183 | If you had a previous kubectl context set, you may also want 184 | to set it to be something else with 185 | 186 | ``` 187 | kubectl config use-context 188 | ``` 189 | -------------------------------------------------------------------------------- /aws-creds/iam.tf: -------------------------------------------------------------------------------- 1 | # There are several ways that we could implement a minimal policy set 2 | # The minimal policy set is found here: 3 | # https://github.com/terraform-aws-modules/terraform-aws-eks/blob/master/docs/iam-permissions.md 4 | # Here are a few implementations 5 | 6 | 7 | # Create a new user named terraform-bot 8 | # Create policy in IAM and attach to terraform-bot 9 | # TODO: You will need to manually generate access keys for this user 10 | # TODO: You will need to manually configure awscli to use these access keys 11 | # This is what I do for pangeo 12 | 13 | #resource "aws_iam_user" "user" { 14 | # name = "terraform-bot" 15 | #} 16 | 17 | #resource "aws_iam_policy" "terraform_iam_policy" { 18 | # name = "terraform-policy" 19 | # path = "/" 20 | # description = "Permissions for Terraform-controlled EKS cluster creation and management" 21 | # policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 22 | #} 23 | 24 | #resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 25 | # user = aws_iam_user.user.name 26 | # policy_arn = aws_iam_policy.terraform_iam_policy.arn 27 | #} 28 | 29 | 30 | # Create a role with the policy json 31 | # Allow a user to assume this role 32 | # TODO: You will need to manually allow a user to assume this role 33 | # Probably want to make a standalone user like above 34 | # Probably not recommended 35 | 36 | #resource "aws_iam_role" "terraform_role" { 37 | # name = "terraform-deployment-role" 38 | # path = "/" 39 | # assume_role_policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 40 | #} 41 | 42 | 43 | # Create the policy in IAM 44 | # Attach the policy to the default awscli configuration profile 45 | # Will leave the policy on the user EVEN AFTER finishing the terraform configuration 46 | # For this reason, I think this is not recommended 47 | 48 | #resource "aws_iam_policy" "terraform_iam_policy" { 49 | # name = "terraform-policy" 50 | # path = "/" 51 | # description = "Permissions for Terraform-controlled EKS cluster creation and management" 52 | # policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 53 | #} 54 | 55 | #resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 56 | # user = split("/", data.aws_caller_identity.current.arn)[1] 57 | # policy_arn = aws_iam_policy.terraform_iam_policy.arn 58 | #} 59 | 60 | #data "aws_caller_identity" "current" {} 61 | 62 | 63 | # This is the data for the policy needed to run terraform to create an eks cluster 64 | data "aws_iam_policy_document" "terraform_iam_policy_source" { 65 | version = "2012-10-17" 66 | 67 | statement { 68 | sid = "VisualEditor0" 69 | 70 | effect = "Allow" 71 | 72 | actions = [ 73 | "autoscaling:AttachInstances", 74 | "autoscaling:CreateAutoScalingGroup", 75 | "autoscaling:CreateLaunchConfiguration", 76 | "autoscaling:CreateOrUpdateTags", 77 | "autoscaling:DeleteAutoScalingGroup", 78 | "autoscaling:DeleteLaunchConfiguration", 79 | "autoscaling:DeleteTags", 80 | "autoscaling:Describe*", 81 | "autoscaling:DetachInstances", 82 | "autoscaling:SetDesiredCapacity", 83 | "autoscaling:UpdateAutoScalingGroup", 84 | "autoscaling:SuspendProcesses", 85 | 
"ec2:AllocateAddress", 86 | "ec2:AssignPrivateIpAddresses", 87 | "ec2:Associate*", 88 | "ec2:AttachInternetGateway", 89 | "ec2:AttachNetworkInterface", 90 | "ec2:AuthorizeSecurityGroupEgress", 91 | "ec2:AuthorizeSecurityGroupIngress", 92 | "ec2:CreateDefaultSubnet", 93 | "ec2:CreateDhcpOptions", 94 | "ec2:CreateEgressOnlyInternetGateway", 95 | "ec2:CreateInternetGateway", 96 | "ec2:CreateNatGateway", 97 | "ec2:CreateNetworkInterface", 98 | "ec2:CreateRoute", 99 | "ec2:CreateRouteTable", 100 | "ec2:CreateSecurityGroup", 101 | "ec2:CreateSubnet", 102 | "ec2:CreateTags", 103 | "ec2:CreateVolume", 104 | "ec2:CreateVpc", 105 | "ec2:DeleteDhcpOptions", 106 | "ec2:DeleteEgressOnlyInternetGateway", 107 | "ec2:DeleteInternetGateway", 108 | "ec2:DeleteNatGateway", 109 | "ec2:DeleteNetworkInterface", 110 | "ec2:DeleteRoute", 111 | "ec2:DeleteRouteTable", 112 | "ec2:DeleteSecurityGroup", 113 | "ec2:DeleteSubnet", 114 | "ec2:DeleteTags", 115 | "ec2:DeleteVolume", 116 | "ec2:DeleteVpc", 117 | "ec2:DeleteVpnGateway", 118 | "ec2:Describe*", 119 | "ec2:DetachInternetGateway", 120 | "ec2:DetachNetworkInterface", 121 | "ec2:DetachVolume", 122 | "ec2:Disassociate*", 123 | "ec2:ModifySubnetAttribute", 124 | "ec2:ModifyVpcAttribute", 125 | "ec2:ModifyVpcEndpoint", 126 | "ec2:ReleaseAddress", 127 | "ec2:RevokeSecurityGroupEgress", 128 | "ec2:RevokeSecurityGroupIngress", 129 | "ec2:UpdateSecurityGroupRuleDescriptionsEgress", 130 | "ec2:UpdateSecurityGroupRuleDescriptionsIngress", 131 | "ec2:CreateLaunchTemplate", 132 | "ec2:CreateLaunchTemplateVersion", 133 | "ec2:DeleteLaunchTemplate", 134 | "ec2:DeleteLaunchTemplateVersions", 135 | "ec2:DescribeLaunchTemplates", 136 | "ec2:DescribeLaunchTemplateVersions", 137 | "ec2:GetLaunchTemplateData", 138 | "ec2:ModifyLaunchTemplate", 139 | "ec2:RunInstances", 140 | "eks:CreateCluster", 141 | "eks:DeleteCluster", 142 | "eks:DescribeCluster", 143 | "eks:ListClusters", 144 | "eks:UpdateClusterConfig", 145 | "eks:DescribeUpdate", 146 | "iam:AddRoleToInstanceProfile", 147 | "iam:AttachRolePolicy", 148 | "iam:CreateInstanceProfile", 149 | "iam:CreateOpenIDConnectProvider", 150 | "iam:CreateServiceLinkedRole", 151 | "iam:CreatePolicy", 152 | "iam:CreatePolicyVersion", 153 | "iam:CreateRole", 154 | "iam:DeleteInstanceProfile", 155 | "iam:DeleteOpenIDConnectProvider", 156 | "iam:DeletePolicy", 157 | "iam:DeleteRole", 158 | "iam:DeleteRolePolicy", 159 | "iam:DeleteServiceLinkedRole", 160 | "iam:DetachRolePolicy", 161 | "iam:GetInstanceProfile", 162 | "iam:GetOpenIDConnectProvider", 163 | "iam:GetPolicy", 164 | "iam:GetPolicyVersion", 165 | "iam:GetRole", 166 | "iam:GetRolePolicy", 167 | "iam:List*", 168 | "iam:PassRole", 169 | "iam:PutRolePolicy", 170 | "iam:RemoveRoleFromInstanceProfile", 171 | "iam:TagRole", 172 | "iam:UpdateAssumeRolePolicy" 173 | ] 174 | 175 | resources = ["*"] 176 | } 177 | } -------------------------------------------------------------------------------- /aws-examples/blog-post/aws/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | } 4 | 5 | provider "aws" { 6 | version = ">= 2.28.1" 7 | region = var.region 8 | profile = var.profile 9 | } 10 | 11 | provider "template" { 12 | version = "~> 2.1" 13 | } 14 | 15 | data "aws_caller_identity" "current" {} 16 | 17 | data "aws_eks_cluster" "cluster" { 18 | name = module.eks.cluster_id 19 | } 20 | 21 | data "aws_eks_cluster_auth" "cluster" { 22 | name = module.eks.cluster_id 23 | } 24 | 25 | provider "kubernetes" { 26 | host 
= data.aws_eks_cluster.cluster.endpoint 27 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 28 | token = data.aws_eks_cluster_auth.cluster.token 29 | load_config_file = false 30 | version = "~> 1.11.1" 31 | } 32 | 33 | data "aws_availability_zones" "available" { 34 | } 35 | 36 | module "vpc" { 37 | source = "terraform-aws-modules/vpc/aws" 38 | version = "~> 2.6" 39 | 40 | name = var.vpc_name 41 | cidr = "172.16.0.0/16" 42 | azs = data.aws_availability_zones.available.names 43 | 44 | public_subnets = ["172.16.1.0/24", "172.16.2.0/24", "172.16.3.0/24"] 45 | private_subnets = ["172.16.4.0/24", "172.16.5.0/24", "172.16.6.0/24"] 46 | enable_dns_hostnames = true 47 | enable_dns_support = true 48 | enable_nat_gateway = true 49 | single_nat_gateway = true 50 | 51 | tags = { 52 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 53 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 54 | AutoTag_Creator = data.aws_caller_identity.current.arn 55 | } 56 | 57 | public_subnet_tags = { 58 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 59 | "kubernetes.io/role/elb" = "1" 60 | } 61 | 62 | private_subnet_tags = { 63 | "kubernetes.io/cluster/${var.cluster_name}" = "shared" 64 | "kubernetes.io/role/internal-elb" = "1" 65 | } 66 | } 67 | 68 | module "eks" { 69 | source = "terraform-aws-modules/eks/aws" 70 | cluster_name = var.cluster_name 71 | cluster_version = "1.14" 72 | 73 | subnets = module.vpc.private_subnets 74 | vpc_id = module.vpc.vpc_id 75 | enable_irsa = true 76 | 77 | cluster_endpoint_private_access = true 78 | 79 | tags = { 80 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 81 | AutoTag_Creator = data.aws_caller_identity.current.arn 82 | } 83 | 84 | node_groups_defaults = { 85 | ami_type = "AL2_x86_64" 86 | disk_size = 50 87 | } 88 | 89 | worker_groups = [ 90 | { 91 | name = "core" 92 | asg_max_size = 1 93 | asg_min_size = 1 94 | asg_desired_capacity = 1 95 | instance_type = "t3a.medium" 96 | subnets = [module.vpc.private_subnets[0]] 97 | 98 | # Use this to set labels / taints 99 | kubelet_extra_args = "--node-labels=node-role.kubernetes.io/core=core,hub.jupyter.org/node-purpose=core" 100 | 101 | tags = [ 102 | { 103 | "key" = "k8s.io/cluster-autoscaler/enabled" 104 | "propagate_at_launch" = "false" 105 | "value" = "true" 106 | }, 107 | { 108 | "key" = "k8s.io/cluster-autoscaler/${var.cluster_name}" 109 | "propagate_at_launch" = "false" 110 | "value" = "true" 111 | } 112 | ] 113 | } 114 | ] 115 | 116 | worker_groups_launch_template = [ 117 | { 118 | name = "user-spot" 119 | override_instance_types = ["m5.2xlarge", "m4.2xlarge"] 120 | spot_instance_pools = 2 121 | asg_max_size = 100 122 | asg_min_size = 0 123 | asg_desired_capacity = 0 124 | 125 | # Use this to set labels / taints 126 | kubelet_extra_args = "--node-labels=node-role.kubernetes.io/user=user,hub.jupyter.org/node-purpose=user --register-with-taints hub.jupyter.org/dedicated=user:NoSchedule" 127 | 128 | tags = [ 129 | { 130 | "key" = "k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose" 131 | "propagate_at_launch" = "false" 132 | "value" = "user" 133 | }, 134 | { 135 | "key" = "k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org/dedicated" 136 | "propagate_at_launch" = "false" 137 | "value" = "user:NoSchedule" 138 | }, 139 | { 140 | "key" = "k8s.io/cluster-autoscaler/enabled" 141 | "propagate_at_launch" = "false" 142 | "value" = "true" 143 | }, 144 | { 145 | "key" = "k8s.io/cluster-autoscaler/${var.cluster_name}" 146 
| "propagate_at_launch" = "false" 147 | "value" = "true" 148 | } 149 | ] 150 | }, 151 | { 152 | name = "worker-spot" 153 | override_instance_types = ["r5.2xlarge", "r4.2xlarge"] 154 | spot_instance_pools = 2 155 | asg_max_size = 100 156 | asg_min_size = 0 157 | asg_desired_capacity = 0 158 | 159 | # Use this to set labels / taints 160 | kubelet_extra_args = "--node-labels node-role.kubernetes.io/worker=worker,k8s.dask.org/node-purpose=worker --register-with-taints k8s.dask.org/dedicated=worker:NoSchedule" 161 | 162 | tags = [ 163 | { 164 | "key" = "k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose" 165 | "propagate_at_launch" = "false" 166 | "value" = "worker" 167 | }, 168 | { 169 | "key" = "k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org/dedicated" 170 | "propagate_at_launch" = "false" 171 | "value" = "worker:NoSchedule" 172 | }, 173 | { 174 | "key" = "k8s.io/cluster-autoscaler/enabled" 175 | "propagate_at_launch" = "false" 176 | "value" = "true" 177 | }, 178 | { 179 | "key" = "k8s.io/cluster-autoscaler/${var.cluster_name}" 180 | "propagate_at_launch" = "false" 181 | "value" = "true" 182 | } 183 | ] 184 | } 185 | ] 186 | 187 | map_roles = var.map_roles 188 | map_users = var.map_users 189 | map_accounts = var.map_accounts 190 | } 191 | 192 | 193 | provider "helm" { 194 | kubernetes { 195 | host = data.aws_eks_cluster.cluster.endpoint 196 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 197 | token = data.aws_eks_cluster_auth.cluster.token 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /aws-examples/blog-post/aws-creds/iam.tf: -------------------------------------------------------------------------------- 1 | # There are several ways that we could implement a minimal policy set 2 | # The minimal policy set is found here: 3 | # https://github.com/terraform-aws-modules/terraform-aws-eks/blob/master/docs/iam-permissions.md 4 | 5 | terraform { 6 | required_version = ">= 0.12.6" 7 | } 8 | 9 | provider "aws" { 10 | version = ">= 2.28.1" 11 | region = var.region 12 | profile = var.profile 13 | } 14 | 15 | # Here are a few implementations 16 | 17 | # Create a new user named terraform-bot 18 | # Create policy in IAM and attach to terraform-bot 19 | # TODO: You will need to manually generate access keys for this user 20 | # TODO: You will need to manually configure awscli to use these access keys 21 | # This is what I do for pangeo 22 | 23 | resource "aws_iam_user" "user" { 24 | name = "terraform-bot" 25 | } 26 | 27 | resource "aws_iam_policy" "terraform_iam_policy" { 28 | name = "terraform-policy" 29 | path = "/" 30 | description = "Permissions for Terraform-controlled EKS cluster creation and management" 31 | policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 32 | } 33 | 34 | resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 35 | user = aws_iam_user.user.name 36 | policy_arn = aws_iam_policy.terraform_iam_policy.arn 37 | } 38 | 39 | resource "aws_iam_user_policy_attachment" "attach-efs-policies" { 40 | user = aws_iam_user.user.name 41 | policy_arn = "arn:aws:iam::aws:policy/AmazonElasticFileSystemFullAccess" 42 | } 43 | 44 | 45 | # Create a role with the policy json 46 | # Allow a user to assume this role 47 | # TODO: You will need to manually allow a user to assume this role 48 | # Probably want to make a standalone user like above 49 | # Probably not recommended 50 | 51 | #resource "aws_iam_role" "terraform_role" { 
52 | # name = "terraform-deployment-role" 53 | # path = "/" 54 | # assume_role_policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 55 | #} 56 | 57 | 58 | # Create the policy in IAM 59 | # Attach the policy to the default awscli configuration profile 60 | # Will leave the policy on the user EVEN AFTER finishing the terraform configuration 61 | # For this reason, I think this is not recommended 62 | 63 | #resource "aws_iam_policy" "terraform_iam_policy" { 64 | # name = "terraform-policy" 65 | # path = "/" 66 | # description = "Permissions for Terraform-controlled EKS cluster creation and management" 67 | # policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 68 | #} 69 | 70 | #resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 71 | # user = split("/", data.aws_caller_identity.current.arn)[1] 72 | # policy_arn = aws_iam_policy.terraform_iam_policy.arn 73 | #} 74 | 75 | #data "aws_caller_identity" "current" {} 76 | 77 | 78 | # This is the data for the policy needed to run terraform to create an eks cluster 79 | data "aws_iam_policy_document" "terraform_iam_policy_source" { 80 | version = "2012-10-17" 81 | 82 | statement { 83 | sid = "VisualEditor0" 84 | 85 | effect = "Allow" 86 | 87 | actions = [ 88 | "autoscaling:AttachInstances", 89 | "autoscaling:CreateAutoScalingGroup", 90 | "autoscaling:CreateLaunchConfiguration", 91 | "autoscaling:CreateOrUpdateTags", 92 | "autoscaling:DeleteAutoScalingGroup", 93 | "autoscaling:DeleteLaunchConfiguration", 94 | "autoscaling:DeleteTags", 95 | "autoscaling:Describe*", 96 | "autoscaling:DetachInstances", 97 | "autoscaling:SetDesiredCapacity", 98 | "autoscaling:UpdateAutoScalingGroup", 99 | "autoscaling:SuspendProcesses", 100 | "ec2:AllocateAddress", 101 | "ec2:AssignPrivateIpAddresses", 102 | "ec2:Associate*", 103 | "ec2:AttachInternetGateway", 104 | "ec2:AttachNetworkInterface", 105 | "ec2:AuthorizeSecurityGroupEgress", 106 | "ec2:AuthorizeSecurityGroupIngress", 107 | "ec2:CreateDefaultSubnet", 108 | "ec2:CreateDhcpOptions", 109 | "ec2:CreateEgressOnlyInternetGateway", 110 | "ec2:CreateInternetGateway", 111 | "ec2:CreateNatGateway", 112 | "ec2:CreateNetworkInterface", 113 | "ec2:CreateRoute", 114 | "ec2:CreateRouteTable", 115 | "ec2:CreateSecurityGroup", 116 | "ec2:CreateSubnet", 117 | "ec2:CreateTags", 118 | "ec2:CreateVolume", 119 | "ec2:CreateVpc", 120 | "ec2:DeleteDhcpOptions", 121 | "ec2:DeleteEgressOnlyInternetGateway", 122 | "ec2:DeleteInternetGateway", 123 | "ec2:DeleteNatGateway", 124 | "ec2:DeleteNetworkInterface", 125 | "ec2:DeleteRoute", 126 | "ec2:DeleteRouteTable", 127 | "ec2:DeleteSecurityGroup", 128 | "ec2:DeleteSubnet", 129 | "ec2:DeleteTags", 130 | "ec2:DeleteVolume", 131 | "ec2:DeleteVpc", 132 | "ec2:DeleteVpnGateway", 133 | "ec2:Describe*", 134 | "ec2:DetachInternetGateway", 135 | "ec2:DetachNetworkInterface", 136 | "ec2:DetachVolume", 137 | "ec2:Disassociate*", 138 | "ec2:ModifySubnetAttribute", 139 | "ec2:ModifyVpcAttribute", 140 | "ec2:ModifyVpcEndpoint", 141 | "ec2:ReleaseAddress", 142 | "ec2:RevokeSecurityGroupEgress", 143 | "ec2:RevokeSecurityGroupIngress", 144 | "ec2:UpdateSecurityGroupRuleDescriptionsEgress", 145 | "ec2:UpdateSecurityGroupRuleDescriptionsIngress", 146 | "ec2:CreateLaunchTemplate", 147 | "ec2:CreateLaunchTemplateVersion", 148 | "ec2:DeleteLaunchTemplate", 149 | "ec2:DeleteLaunchTemplateVersions", 150 | "ec2:DescribeLaunchTemplates", 151 | "ec2:DescribeLaunchTemplateVersions", 152 | "ec2:GetLaunchTemplateData", 153 | "ec2:ModifyLaunchTemplate", 154 | 
"ec2:RunInstances", 155 | "eks:CreateCluster", 156 | "eks:DeleteCluster", 157 | "eks:DescribeCluster", 158 | "eks:ListClusters", 159 | "eks:ListFargateProfiles", 160 | "eks:ListNodegroups", 161 | "eks:ListTagsForResource", 162 | "eks:TagResource", 163 | "eks:UntagResource", 164 | "eks:UpdateClusterConfig", 165 | "iam:AddRoleToInstanceProfile", 166 | "iam:AttachRolePolicy", 167 | "iam:CreateInstanceProfile", 168 | "iam:CreateOpenIDConnectProvider", 169 | "iam:CreateServiceLinkedRole", 170 | "iam:CreatePolicy", 171 | "iam:CreatePolicyVersion", 172 | "iam:CreateRole", 173 | "iam:DeleteInstanceProfile", 174 | "iam:DeleteOpenIDConnectProvider", 175 | "iam:DeletePolicy", 176 | "iam:DeleteRole", 177 | "iam:DeleteRolePolicy", 178 | "iam:DeleteServiceLinkedRole", 179 | "iam:DetachRolePolicy", 180 | "iam:GetInstanceProfile", 181 | "iam:GetOpenIDConnectProvider", 182 | "iam:GetPolicy", 183 | "iam:GetPolicyVersion", 184 | "iam:GetRole", 185 | "iam:GetRolePolicy", 186 | "iam:List*", 187 | "iam:PassRole", 188 | "iam:PutRolePolicy", 189 | "iam:RemoveRoleFromInstanceProfile", 190 | "iam:TagRole", 191 | "iam:UpdateAssumeRolePolicy" 192 | ] 193 | 194 | resources = ["*"] 195 | } 196 | } -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/deployment-notes/icesat2-hackweek-2020-notes.md: -------------------------------------------------------------------------------- 1 | # Notes on JupyterHub infrastructure for the IceSat-2 2020 Hackweek 2 | 3 | Relevant links: 4 | - [JupyterHub Configuration](https://github.com/ICESAT-2HackWeek/jupyterhub-2020) 5 | - [Infrastructure Configuration](https://github.com/ICESAT-2HackWeek/terraform-deploy/tree/master) 6 | - [Computational Environment](https://github.com/ICESAT-2HackWeek/jupyter-image-2020 ) 7 | 8 | ### Versions: 9 | - Terraform version: [12.6](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/main.tf#L2) 10 | - Helm Version: 3. Minor version is not recalled or recorded. 11 | - JupyterHub Helm chart version: [v0.9.0-beta.3.n030.h796379e](https://github.com/ICESAT-2HackWeek/jupyterhub-2020/blob/staging/hub/requirements.yaml) 12 | - Prometheus Helm chart version: [11.2.1](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/monitoring.tf#L25) 13 | - Grafana Helm chart version: [5.0.24](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/monitoring.tf#L44) 14 | - Cluster-autoscaler Helm chart version: [unspecified](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/autoscaler.tf#L68) 15 | - EFS-Provisioner Helm chart version: [0.11.0](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/efs.tf#L53) 16 | - AWS Node Termination Helm chart version: [unspecified](https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/aws-node-termination-handler.tf#L10) 17 | 18 | Resource requests per-user: 19 | ``` 20 | cpu: 21 | limit: 4 22 | guarantee: 1.75 23 | memory: 24 | limit: 8G 25 | guarantee: 7G 26 | ``` 27 | 28 | Note m5 and m4 2xlarge nodes, so 4 people per node. 100Gb local disk 29 | + EFS drive with default settings. CPUs are logical cores, so the 30 | limit of 4 is 2 physical cpu cores with each with 2 independent 31 | threads. 32 | 33 | ### Successes 34 | - Generally very stable for 80 simultaneous users! 35 | - Scale-up time less then 5 minutes per person 36 | 37 | ### Issues 38 | - Scale down an issue when people do not explicitly log out. Common 39 | to wake up to 5-10 nodes running for over 12 hours. 
40 | - Some complaints regarding performance when tutorials read the same 41 | data on `/srv/shared` network storage location 42 | - The `aws-node-termination-handler` will correctly replace user 43 | nodes if the nodes need replacing but users will not know why this is 44 | happening nor be able to do anything about it. 45 | 46 | ### Lessons Learned / Future Hackweeks 47 | 48 | - Having prometheus / grafana is great to understand how CPU, RAM, 49 | network activity changes over time, and can be used to diagnose 50 | performance issues for individual users or nodes. Could be helpful to 51 | make these visualizations public and come up with custom dashboards 52 | like instantaneous CPU % versus default averages over 10 minutes) 53 | - Grafana probably needs a separate login per person who wants to 54 | login simultaneously 55 | - More practice with Grafana and more tailored dashboarding will 56 | help admins to utilize Grafana usefully. I’ve spent a lot of time 57 | just wondering how to do things in Grafana. 58 | - I found a dashboard that was simple and good to work with: 59 | [ID 10000](https://grafana.com/grafana/dashboards/10000). 60 | Configuration to use this is present in 61 | https://github.com/ICESAT-2HackWeek/terraform-deploy/blob/master/aws/grafana-values-min.yaml 62 | - Grafana data was extracted for use with pandas and yielded 63 | plots of JupyterHub usage. Some can be seen below. 64 | - Multi-user load testing would be nice. 65 | - logins https://github.com/yuvipanda/hubtraf not being updated :( 66 | - Would be nice to test that the cluster can autoscale correctly 67 | with many users logging in prior to the event. 68 | - run tutorial content with multiple users simultaneously *instead* 69 | of just tutorial lead. Then you understand typical multi-user 70 | performance instead of single user performance. 71 | - Not sure of other options. 72 | - Could consider a “high performance hackweek” option, where each 73 | person gets their own dedicated node (e.g. 8 cores, 32 Gb ram, 100Gb 74 | scratch drive, GPU? ) and the shared drive is AWS FSX Lustre instead 75 | of EFS). 76 | - The vast majority of costs for the hackweeks are from compute 77 | instances: IceSat-2's hackweek had these costs as ~90% of total 78 | costs for the event itself, decreasing to ~80% including pre- and 79 | post-event time when usage was lower but data storage was the 80 | same. 81 | - Similar percentages represent the cost breakdowns for 82 | OceanHackWeek, 95% during the event and 80% with pre- and 83 | post-event time. 84 | - A more granular cost analysis was done for OceanHackweek, from 85 | that ~10-15% of costs are from machines that host user pods. 86 | The resources for each user were such that four users would share 87 | a node. Other compute costs cover the hub's main compute instance 88 | and data transfers. 89 | - Implementing the option above where users get their own node 90 | would thus quadruple these costs for those users. This does not 91 | cover issues like data transfer, core node running costs, etc. 92 | - Image updating could use some instructions. `conda-lock` seems 93 | essential for our use-case. Instructions on the order of github 94 | actions would be nice, I (Sebastian) messed that up once. 95 | - Image building was done in this repo: 96 | https://github.com/ICESAT-2HackWeek/jupyter-image-2020 97 | - EFS permissions are configured before JupyterHub is launched. 
This 98 | makes it impossible to allow tutorial leads to manage data at a more 99 | permissive level than hackweek participants and puts some work onto 100 | admins to move data through a different instance so that it is 101 | write-protected. 102 | - Long JupyterLab session memory issues. I sent these messages out 103 | to participants: 104 | 105 | > Hi all, after helping someone diagnose some crashing kernel 106 | problems, I think everyone should know that closing a notebook tab in 107 | JupyterLab interface doesn't actually shut down the kernel. If you 108 | are on the Hub all day and opening / running many notebooks, you may 109 | have some performance issues late in the day. 110 | > Along the left sidebar is the "Running Terminals and Kernels" 111 | button (it looks like a square inside a circle). It will show you all 112 | of the kernels (probably one per notebook that you've opened) that 113 | are running. Shutting those down is recommended after you are done 114 | working with a notebook. 115 | > Aside from that, don't forget to log out of the Hub when you are 116 | done for the day! It shuts down all your resources and saves us money 117 | and allows your server to have a clean slate every time you log in, 118 | which can save you some headache. 119 | - Should there be a policy where we tell people that we will spin 120 | their servers down at night unless they specifically ask us not to? 121 | 122 | ### Usage Plots 123 | 124 | ![IceSat-2 Hackweek Users over Time](icesat_2_hackweek_users_over_time.png) 125 | 126 | ![IceSat-2 Hackweek Memory Usage over Time](icesat_2_hackweek_memory_usage_comparison.png) 127 | -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/iam-permissions/iam.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | 4 | backend "s3" { 5 | #bucket = "hackweek-terraform-state-bucket" 6 | key = "hackweek-iam-user-config.tfstate" 7 | region = "us-west-2" 8 | encrypt = true 9 | } 10 | } 11 | 12 | # There are several ways that we could implement a minimal policy set 13 | # The minimal policy set is found here: 14 | # https://github.com/terraform-aws-modules/terraform-aws-eks/blob/master/docs/iam-permissions.md 15 | # Here are a few implementations 16 | 17 | 18 | # Create a new user named terraform-bot 19 | # Create policy in IAM and attach to terraform-bot 20 | # TODO: You will need to manually generate access keys for this user 21 | # TODO: You will need to manually configure awscli to use these access keys 22 | # This is what I do for pangeo 23 | 24 | resource "aws_iam_user" "user" { 25 | name = var.new-user-name 26 | } 27 | 28 | resource "aws_iam_policy" "terraform_iam_policy" { 29 | name = "terraform-policy" 30 | path = "/" 31 | description = "Permissions for Terraform-controlled EKS cluster creation and management" 32 | policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 33 | } 34 | 35 | resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 36 | user = aws_iam_user.user.name 37 | policy_arn = aws_iam_policy.terraform_iam_policy.arn 38 | } 39 | 40 | resource "aws_iam_user_policy_attachment" "attach-efs-policies" { 41 | user = aws_iam_user.user.name 42 | policy_arn = "arn:aws:iam::aws:policy/AmazonElasticFileSystemFullAccess" 43 | } 44 | 45 | # Create a role with the policy json 46 | # Allow a user to assume this role 47 | # TODO: You will need to manually allow a user to 
assume this role 48 | # Probably want to make a standalone user like above 49 | # Probably not recommended 50 | 51 | #resource "aws_iam_role" "terraform_role" { 52 | # name = "terraform-deployment-role" 53 | # path = "/" 54 | # assume_role_policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 55 | #} 56 | 57 | 58 | # Create the policy in IAM 59 | # Attach the policy to the default awscli configuration profile 60 | # Will leave the policy on the user EVEN AFTER finishing the terraform configuration 61 | # For this reason, I think this is not recommended 62 | 63 | #resource "aws_iam_policy" "terraform_iam_policy" { 64 | # name = "terraform-policy" 65 | # path = "/" 66 | # description = "Permissions for Terraform-controlled EKS cluster creation and management" 67 | # policy = data.aws_iam_policy_document.terraform_iam_policy_source.json 68 | #} 69 | 70 | #resource "aws_iam_user_policy_attachment" "attach-terraform-permissions" { 71 | # user = split("/", data.aws_caller_identity.current.arn)[1] 72 | # policy_arn = aws_iam_policy.terraform_iam_policy.arn 73 | #} 74 | 75 | #data "aws_caller_identity" "current" {} 76 | 77 | 78 | # This is the data for the policy needed to run terraform to create an eks cluster 79 | data "aws_iam_policy_document" "terraform_iam_policy_source" { 80 | version = "2012-10-17" 81 | 82 | statement { 83 | sid = "VisualEditor0" 84 | 85 | effect = "Allow" 86 | 87 | actions = [ 88 | "autoscaling:AttachInstances", 89 | "autoscaling:CreateAutoScalingGroup", 90 | "autoscaling:CreateLaunchConfiguration", 91 | "autoscaling:CreateOrUpdateTags", 92 | "autoscaling:DeleteAutoScalingGroup", 93 | "autoscaling:DeleteLaunchConfiguration", 94 | "autoscaling:DeleteTags", 95 | "autoscaling:Describe*", 96 | "autoscaling:DetachInstances", 97 | "autoscaling:SetDesiredCapacity", 98 | "autoscaling:UpdateAutoScalingGroup", 99 | "autoscaling:SuspendProcesses", 100 | "ec2:AllocateAddress", 101 | "ec2:AssignPrivateIpAddresses", 102 | "ec2:Associate*", 103 | "ec2:AttachInternetGateway", 104 | "ec2:AttachNetworkInterface", 105 | "ec2:AuthorizeSecurityGroupEgress", 106 | "ec2:AuthorizeSecurityGroupIngress", 107 | "ec2:CreateDefaultSubnet", 108 | "ec2:CreateDhcpOptions", 109 | "ec2:CreateEgressOnlyInternetGateway", 110 | "ec2:CreateInternetGateway", 111 | "ec2:CreateNatGateway", 112 | "ec2:CreateNetworkInterface", 113 | "ec2:CreateRoute", 114 | "ec2:CreateRouteTable", 115 | "ec2:CreateSecurityGroup", 116 | "ec2:CreateSubnet", 117 | "ec2:CreateTags", 118 | "ec2:CreateVolume", 119 | "ec2:CreateVpc", 120 | "ec2:DeleteDhcpOptions", 121 | "ec2:DeleteEgressOnlyInternetGateway", 122 | "ec2:DeleteInternetGateway", 123 | "ec2:DeleteNatGateway", 124 | "ec2:DeleteNetworkInterface", 125 | "ec2:DeleteRoute", 126 | "ec2:DeleteRouteTable", 127 | "ec2:DeleteSecurityGroup", 128 | "ec2:DeleteSubnet", 129 | "ec2:DeleteTags", 130 | "ec2:DeleteVolume", 131 | "ec2:DeleteVpc", 132 | "ec2:DeleteVpnGateway", 133 | "ec2:Describe*", 134 | "ec2:DetachInternetGateway", 135 | "ec2:DetachNetworkInterface", 136 | "ec2:DetachVolume", 137 | "ec2:Disassociate*", 138 | "ec2:ModifySubnetAttribute", 139 | "ec2:ModifyVpcAttribute", 140 | "ec2:ModifyVpcEndpoint", 141 | "ec2:ReleaseAddress", 142 | "ec2:RevokeSecurityGroupEgress", 143 | "ec2:RevokeSecurityGroupIngress", 144 | "ec2:UpdateSecurityGroupRuleDescriptionsEgress", 145 | "ec2:UpdateSecurityGroupRuleDescriptionsIngress", 146 | "ec2:CreateLaunchTemplate", 147 | "ec2:CreateLaunchTemplateVersion", 148 | "ec2:DeleteLaunchTemplate", 149 | "ec2:DeleteLaunchTemplateVersions", 
150 | "ec2:DescribeLaunchTemplates", 151 | "ec2:DescribeLaunchTemplateVersions", 152 | "ec2:GetLaunchTemplateData", 153 | "ec2:ModifyLaunchTemplate", 154 | "ec2:RunInstances", 155 | "ec2:TerminateInstances", 156 | "eks:CreateCluster", 157 | "eks:DeleteCluster", 158 | "eks:DescribeCluster", 159 | "eks:ListClusters", 160 | "eks:TagResource", 161 | "eks:UntagResource", 162 | "eks:UpdateClusterConfig", 163 | "eks:UpdateClusterVersion", 164 | "eks:DescribeUpdate", 165 | "iam:AddRoleToInstanceProfile", 166 | "iam:AttachRolePolicy", 167 | "iam:CreateInstanceProfile", 168 | "iam:CreateOpenIDConnectProvider", 169 | "iam:CreateServiceLinkedRole", 170 | "iam:CreatePolicy", 171 | "iam:CreatePolicyVersion", 172 | "iam:CreateRole", 173 | "iam:DeleteInstanceProfile", 174 | "iam:DeleteOpenIDConnectProvider", 175 | "iam:DeletePolicy", 176 | "iam:DeleteRole", 177 | "iam:DeleteRolePolicy", 178 | "iam:DeleteServiceLinkedRole", 179 | "iam:DetachRolePolicy", 180 | "iam:GetInstanceProfile", 181 | "iam:GetOpenIDConnectProvider", 182 | "iam:GetPolicy", 183 | "iam:GetPolicyVersion", 184 | "iam:GetRole", 185 | "iam:GetRolePolicy", 186 | "iam:List*", 187 | "iam:PassRole", 188 | "iam:PutRolePolicy", 189 | "iam:RemoveRoleFromInstanceProfile", 190 | "iam:TagRole", 191 | "iam:UpdateAssumeRolePolicy", 192 | "s3:CreateBucket", 193 | "s3:DeleteBucket", 194 | "s3:DescribeJob", 195 | "s3:Get*", 196 | "s3:HeadBucket", 197 | "s3:List*", 198 | "s3:PutBucketAcl", 199 | "s3:PutBucketTagging" 200 | ] 201 | 202 | resources = ["*"] 203 | } 204 | } -------------------------------------------------------------------------------- /aws-examples/hackweek-infrastructure/infrastructure/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 0.12.6" 3 | 4 | backend "s3" { 5 | #bucket = "hackweek-terraform-state-bucket" 6 | key = "hackweek-cluster-config.tfstate" 7 | region = "us-west-2" 8 | encrypt = true 9 | } 10 | } 11 | 12 | provider "aws" { 13 | version = "2.59.0" 14 | region = var.region 15 | profile = var.profile 16 | } 17 | 18 | provider "template" { 19 | version = "~> 2.1" 20 | } 21 | 22 | data "aws_caller_identity" "current" {} 23 | 24 | data "aws_eks_cluster" "cluster" { 25 | name = module.eks.cluster_id 26 | } 27 | 28 | data "aws_eks_cluster_auth" "cluster" { 29 | name = module.eks.cluster_id 30 | } 31 | 32 | provider "kubernetes" { 33 | host = data.aws_eks_cluster.cluster.endpoint 34 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 35 | token = data.aws_eks_cluster_auth.cluster.token 36 | load_config_file = false 37 | version = "~> 1.11.1" 38 | } 39 | 40 | data "aws_availability_zones" "available" { 41 | } 42 | 43 | module "vpc" { 44 | source = "terraform-aws-modules/vpc/aws" 45 | version = "~> 2.6" 46 | 47 | name = "${var.name_prefix}vpc" 48 | cidr = "172.16.0.0/16" 49 | azs = data.aws_availability_zones.available.names 50 | 51 | public_subnets = ["172.16.1.0/24", "172.16.2.0/24", "172.16.3.0/24"] 52 | private_subnets = ["172.16.4.0/24", "172.16.5.0/24", "172.16.6.0/24"] 53 | enable_dns_hostnames = true 54 | enable_dns_support = true 55 | enable_nat_gateway = true 56 | single_nat_gateway = true 57 | 58 | tags = { 59 | "kubernetes.io/cluster/${module.eks.cluster_id}" = "shared" 60 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 61 | AutoTag_Creator = data.aws_caller_identity.current.arn 62 | Project = "${var.name_prefix}project" 63 | } 64 | 65 | public_subnet_tags = { 66 | 
"kubernetes.io/cluster/${module.eks.cluster_id}" = "shared" 67 | "kubernetes.io/role/elb" = "1" 68 | } 69 | 70 | private_subnet_tags = { 71 | "kubernetes.io/cluster/${module.eks.cluster_id}" = "shared" 72 | "kubernetes.io/role/internal-elb" = "1" 73 | } 74 | } 75 | 76 | module "eks" { 77 | source = "terraform-aws-modules/eks/aws" 78 | cluster_name = "${var.name_prefix}cluster" 79 | cluster_version = "1.15" 80 | version = "11.1.0" 81 | 82 | subnets = module.vpc.private_subnets 83 | vpc_id = module.vpc.vpc_id 84 | enable_irsa = true 85 | 86 | cluster_endpoint_private_access = true 87 | 88 | tags = { 89 | Owner = split("/", data.aws_caller_identity.current.arn)[1] 90 | AutoTag_Creator = data.aws_caller_identity.current.arn 91 | Project = "${var.name_prefix}project" 92 | } 93 | 94 | workers_group_defaults = { 95 | ami_id = "ami-065418523a44331e5" 96 | } 97 | 98 | worker_groups = [ 99 | 100 | ] 101 | 102 | worker_groups_launch_template = [ 103 | { 104 | name = "core-spot" 105 | asg_max_size = 1 106 | asg_min_size = 1 107 | asg_desired_capacity = 1 108 | instance_type = ["t3a.xlarge", "t3a.xlarge"] 109 | spot_instance_pools = 2 110 | subnets = [module.vpc.private_subnets[0]] 111 | 112 | # Use this to set labels / taints 113 | kubelet_extra_args = "--node-labels=role=core,hub.jupyter.org/node-purpose=core" 114 | 115 | tags = [ 116 | { 117 | "key" = "k8s.io/cluster-autoscaler/enabled" 118 | "propagate_at_launch" = "false" 119 | "value" = "true" 120 | }, 121 | { 122 | "key" = "k8s.io/cluster-autoscaler/${module.eks.cluster_id}" 123 | "propagate_at_launch" = "false" 124 | "value" = "true" 125 | } 126 | ] 127 | }, 128 | { 129 | name = "user-spot" 130 | override_instance_types = ["m5.2xlarge", "m4.2xlarge", "m5a.2xlarge"] 131 | spot_instance_pools = 3 132 | asg_max_size = 100 133 | asg_min_size = 0 134 | asg_desired_capacity = 0 135 | 136 | # Use this to set labels / taints 137 | kubelet_extra_args = "--node-labels=role=user,hub.jupyter.org/node-purpose=user --register-with-taints hub.jupyter.org/dedicated=user:NoSchedule" 138 | 139 | tags = [ 140 | { 141 | "key" = "k8s.io/cluster-autoscaler/node-template/label/hub.jupyter.org/node-purpose" 142 | "propagate_at_launch" = "false" 143 | "value" = "user" 144 | }, 145 | { 146 | "key" = "k8s.io/cluster-autoscaler/node-template/taint/hub.jupyter.org/dedicated" 147 | "propagate_at_launch" = "false" 148 | "value" = "user:NoSchedule" 149 | }, 150 | { 151 | "key" = "k8s.io/cluster-autoscaler/enabled" 152 | "propagate_at_launch" = "false" 153 | "value" = "true" 154 | }, 155 | { 156 | "key" = "k8s.io/cluster-autoscaler/${module.eks.cluster_id}" 157 | "propagate_at_launch" = "false" 158 | "value" = "true" 159 | } 160 | ] 161 | }, 162 | { 163 | name = "worker-spot" 164 | override_instance_types = ["r5.2xlarge", "r4.2xlarge"] 165 | spot_instance_pools = 2 166 | asg_max_size = 100 167 | asg_min_size = 0 168 | asg_desired_capacity = 0 169 | 170 | # Use this to set labels / taints 171 | kubelet_extra_args = "--node-labels=role=worker,k8s.dask.org/node-purpose=worker --register-with-taints k8s.dask.org/dedicated=worker:NoSchedule" 172 | 173 | tags = [ 174 | { 175 | "key" = "k8s.io/cluster-autoscaler/node-template/label/k8s.dask.org/node-purpose" 176 | "propagate_at_launch" = "false" 177 | "value" = "worker" 178 | }, 179 | { 180 | "key" = "k8s.io/cluster-autoscaler/node-template/taint/k8s.dask.org/dedicated" 181 | "propagate_at_launch" = "false" 182 | "value" = "worker:NoSchedule" 183 | }, 184 | { 185 | "key" = "k8s.io/cluster-autoscaler/enabled" 186 | 
"propagate_at_launch" = "false" 187 | "value" = "true" 188 | }, 189 | { 190 | "key" = "k8s.io/cluster-autoscaler/${module.eks.cluster_id}" 191 | "propagate_at_launch" = "false" 192 | "value" = "true" 193 | } 194 | ] 195 | } 196 | ] 197 | 198 | map_roles = var.map_roles 199 | map_users = var.map_users 200 | map_accounts = var.map_accounts 201 | } 202 | 203 | 204 | provider "helm" { 205 | version = "~> 1.1" 206 | kubernetes { 207 | host = data.aws_eks_cluster.cluster.endpoint 208 | cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) 209 | token = data.aws_eks_cluster_auth.cluster.token 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /aws-examples/blog-post/README.md: -------------------------------------------------------------------------------- 1 | # PANGEO Terraform Deploy 2 | 3 | ## Introduction 4 | 5 | This repo houses an opinionated deployment of PANGEO-style 6 | JupyterHub-ready infrastructure with 7 | [Terraform](https://www.terraform.io/). 8 | 9 | This particular branch is presented for use with the Medium blog post 10 | [Deploying JupyterHub-Ready Infrastructure with Terraform on AWS](https://medium.com/pangeo/terraform-jupyterhub-aws-34f2b725f4fd). 11 | The guide to deploy this JupyterHub-ready infrastructure can be 12 | summarized as: 13 | - Download Terraform, its dependencies, and the repo 14 | - Configure a few settings for the infrastructure and for the AWS CLI 15 | - Deploy the infrastructure using Terraform commands 16 | 17 | 18 | ## Deployment Instructions 19 | 20 | ### Install Terraform, dependencies, and this GitHub repo 21 | 22 | In order to deploy the configuration in this repo, you'll need the 23 | following tools installed: 24 | 25 | - [Terraform](https://www.terraform.io/downloads.html) 26 | - [AWS CLI](https://aws.amazon.com/cli/) 27 | - [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) 28 | - [Helm](https://helm.sh/docs/intro/install/) 29 | 30 | You will also need this repo. You can get it with: 31 | 32 | ``` 33 | git clone https://github.com/pangeo-data/terraform-deploy.git 34 | cd terraform-deploy/aws-examples/blog-post/ 35 | ``` 36 | 37 | You will notice there are two folders here, `aws` and `aws-creds`. 38 | Terraform will interact with each directory separately. We can now set 39 | up some credentials before we deploy the infrastructure. 40 | 41 | ### Configuration 42 | 43 | #### Configure the AWS CLI 44 | 45 | You need to have the `aws` CLI configured to run correctly from your 46 | local machine - terraform will just read from the same source. The 47 | [documentation on configuring AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) 48 | should help. 49 | 50 | This repo provides the `aws-creds` folder in case you do not have 51 | admin permissions or want to follow the principle of least privilege. 52 | In order to run the Terraform commands in the `aws` folder, we will 53 | use the minimal policy set defined at the bottom of `iam.tf`. By 54 | default (as in, what is uncommented), the folder gives you a new user 55 | named `terraform-bot` with policy attachments for the minimal policy 56 | set and EFS permissions. 57 | 58 | If you want to experiment with other ways to enable the policies you 59 | can try! Some of them are present (but commented out) in the same file. 
60 | 61 | If you want to create this user, go into `aws-creds/iam.tfvars` and 62 | make sure the value of `profile` is the correct awscli profile you 63 | want to use. Then, run the following: 64 | 65 | ``` 66 | cd aws-creds 67 | terraform init 68 | terraform apply -var-file=iam.tfvars 69 | ``` 70 | 71 | Terraform will show the plan to create the IAM policy, an IAM user, 72 | and the attachment of two policies onto the user. Confirm the apply 73 | command and Terraform will let you know when it's finished. 74 | 75 | You will then have to configure `terraform-bot`'s credentials in the 76 | AWS Console. Go and generate access keys for the user, then put them 77 | into your command line with 78 | 79 | ``` 80 | aws configure --profile terraform-bot 81 | ``` 82 | 83 | Later, you will tell Terraform to use this profile when running 84 | commands so that it has only the permissions it needs when deploying 85 | the infrastructure. 86 | 87 | #### Configure your Infrastructure 88 | 89 | The terraform deployment needs several variable names set before it 90 | can start. If you look in `aws/your-cluster.tfvars`, there are four 91 | variables present. You should input cluster and vpc names. You only 92 | have to change the region if you want to create resources in a 93 | different region. Similarly, the profile only needs to be changed if 94 | you are not using the `terraform-bot` user from the last step. 95 | 96 | You can change the name of this file if you want. Just keep in mind 97 | that the instructions will list it as `.tfvars` and you 98 | will have to type in the new filename that you set. A professional 99 | deployment should have a more descriptive name, but it isn't necessary 100 | here. 101 | 102 | There are additional variables you can specify in your `.tfvars` file 103 | if you wish. The other variables are present in `aws/variables.tf`. 104 | 105 | To force Terraform to use the values provided, we will add the flag 106 | `-var-file=.tfvars` with every Terraform command. 107 | 108 | The final bit of Terraform configuration is run with `terraform init`. 109 | This makes Terraform check all of the files in the working directory 110 | and see if it needs to download anything in order to work properly. 111 | Here, these downloads are module and provider blocks. If you attempt 112 | to run other commands before this, Terraform will prompt you to 113 | initialize the working directory. 114 | 115 | Make sure you are in the `aws` folder, then run 116 | 117 | ``` 118 | terraform init 119 | ``` 120 | 121 | ### Infrastructure Deployment 122 | 123 | NOTE: Creating these resources will cost your AWS account money. This 124 | cluster configuration has cost me under $5 per day running the 125 | cluster, vpc, and core node. 126 | 127 | #### First-Time Deployment 128 | 129 | If you have configured the `awscli` profile you want to use and input 130 | the values you like into your `.tfvars` file, then you are ready to 131 | deploy the infrastructure! Running the command below will first 132 | generate a plan as Terraform validates the configuration. The plan is 133 | a list of the lowest-level resources it can give you. We use the 134 | modules so we don't have to look at all the low-level resources all 135 | the time (that's a lot to look at), but it is good to look at them at 136 | least once to make sure you understand everything you are creating. 
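Much of that count comes from the two module calls in `aws/main.tf`. A block only a few lines long, like this trimmed-down sketch of the VPC module call, is what fans out into subnets, route tables, and gateways in the plan:

```
# Trimmed-down sketch of the VPC module call in aws/main.tf: these few
# lines alone expand into a VPC, six subnets, route tables, an internet
# gateway, and a single NAT gateway in the generated plan.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 2.6"

  name            = var.vpc_name
  cidr            = "172.16.0.0/16"
  azs             = data.aws_availability_zones.available.names
  public_subnets  = ["172.16.1.0/24", "172.16.2.0/24", "172.16.3.0/24"]
  private_subnets = ["172.16.4.0/24", "172.16.5.0/24", "172.16.6.0/24"]

  enable_nat_gateway = true
  single_nat_gateway = true
}
```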
137 | 138 | You can take a look at the 63 resources if you like, but at the end of 139 | the day, all you need to do to start deploying infrastructure is type 140 | `yes` when Terraform prompts you. 141 | 142 | ``` 143 | terraform apply -var-file=.tfvars 144 | ``` 145 | 146 | The infrastructure can take 15 minutes or more to create (the EKS 147 | cluster takes 9-12 minutes alone). 148 | 149 | While watching Terraform deploy everything, you may notice that 150 | sometimes many resources are created at the same time, but other times 151 | only one resource is being created. Terraform takes into account 152 | resource dependencies and will make sure independent resources are 153 | created before dependent ones. It will try to deploy as many things at 154 | once as possible but will have to wait for certain resources to finish 155 | before it can move on. 156 | 157 | NOTE: Sometimes you will get an error saying the Kubernetes cluster is 158 | unreachable. This is usually resolved by running the `terraform apply 159 | ...` command again. 160 | 161 | Tons of green output means the deployment was successful! 162 | Congratulations! 163 | 164 | #### Inspecting the Infrastructure 165 | 166 | If you want to take a peek at your cluster, you will need to tell 167 | `kubectl` and `Helm` where your cluster is, since Terraform doesn't 168 | modify them by default. Do this with the following command, filling in 169 | values for your deployment. 170 | 171 | ``` 172 | aws eks update-kubeconfig --name= --region= 173 | --profile= 174 | ``` 175 | 176 | Now you should be able to run local commands to inspect the cluster! 177 | Try the following: 178 | 179 | ``` 180 | aws eks list-clusters --profile= 181 | aws eks describe-cluster --name= --profile=terraform-bot 182 | kubectl get pods -A 183 | kubectl get nodes -A 184 | helm list -A 185 | ``` 186 | 187 | You should be able to see 188 | - A list of clusters on your account, including the one you just made 189 | - Information about the cluster you just made 190 | - All of the pods (individual software) present on machines in the 191 | cluster 192 | - All of the nodes (actual machines) in the cluster, which should just 193 | be one core node 194 | - All of the Helm releases on the cluster, which should be the 195 | `efs-provisioner` and the `cluster-autoscaler`. 196 | 197 | If there were problems with deployment, these commands might fail or 198 | give you insight into the problems. 199 | 200 | #### Modifying the Infrastructure 201 | 202 | NOTE: Do not modify AWS resources with the console if you created them 203 | with Terraform. This can cause unintended problems for Terraform 204 | because it can't see the resource changes you made. 205 | 206 | If you want to change some of the values or infrastructure, you can 207 | fiddle with the `.tf` files and then run `terraform apply 208 | -var-file=.tfvars` again. Terraform will compare the new 209 | plan to the old plan that you already deployed and see what is has to 210 | do to get from one to the other. For individual resources, this may be 211 | an easy in-place modification, others may have to be destroyed and 212 | re-created, and others still may just be different resources, so you 213 | delete them and make the replacements. Terraform takes care of all of 214 | this for you but will show you what it intends to do in the plan it 215 | outputs. 
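For example, a small, hypothetical edit like lowering the ceiling of the `user-spot` worker group in `aws/main.tf` is typically an in-place change; the plan will show the autoscaling group being updated rather than destroyed and recreated:

```
# Hypothetical edit to the user-spot entry in worker_groups_launch_template
# (aws/main.tf): only asg_max_size changes, so the plan shows an in-place
# update of the autoscaling group rather than a destroy/recreate.
{
  name                    = "user-spot"
  override_instance_types = ["m5.2xlarge", "m4.2xlarge"]
  spot_instance_pools     = 2
  asg_max_size            = 50   # was 100
  asg_min_size            = 0
  asg_desired_capacity    = 0
  # labels, taints, and tags unchanged ...
}
```

As always, read the generated plan before confirming with `yes`.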
216 | 
217 | NOTE: If you change the worker group templates and there are existing
218 | nodes when you run `terraform apply ...`, it will not apply the changes
219 | to existing nodes. You will have to manually drain the nodes by setting
220 | the desired number of nodes to 0 in the AWS Console, wait for the
221 | nodes to disappear, then set the desired number of nodes to 1 once
222 | `terraform apply ...` has finished.
223 | 
224 | NOTE: Changing the desired number of nodes after the worker group
225 | template has been created will not work unless you do so in the AWS
226 | Console. Terraform doesn't manage that value after the worker group template
227 | has been created.
228 | 
229 | #### Tear Down
230 | 
231 | If you don't want these resources on your account forever (since they
232 | cost you money), you can tear them all down with one command per
233 | directory.
234 | 
235 | Terraform remembers everything it has currently built, so as long as
236 | you provide the `.tfvars` file, it will find the resources correctly
237 | and remove them in the reverse order that they were built!
238 | 
239 | Running `terraform destroy ...` will generate a plan similar to
240 | `terraform apply ...`, but it will indicate that it is deleting
241 | resources, not deploying them. Again, you will be prompted to confirm
242 | the plan by typing `yes`.
243 | 
244 | ```
245 | terraform destroy -var-file=your-cluster.tfvars
246 | ```
247 | 
248 | The `destroy` command can time out trying to destroy some of the
249 | Kubernetes resources, but re-running it usually solves the issue. If
250 | you put anything on your cluster (besides the `efs-provisioner` and
251 | the `cluster-autoscaler`), you should remove it before running
252 | `terraform destroy ...`. Since Terraform doesn't detect what software
253 | is on your cluster (it only knows what it put on the cluster), it
254 | doesn't know how to remove it, and that can lead to issues.
255 | 
256 | Removing the `terraform-bot` user will require you to manually delete the
257 | access keys in the AWS Console. Then, you can delete the Terraform-managed
258 | entries.
259 | 
260 | ```
261 | cd ../aws-creds/
262 | terraform destroy -var-file=iam.tfvars
263 | ```
264 | 
265 | If you set your local kubeconfig to point to this cluster, you can
266 | remove that with the following:
267 | 
268 | ```
269 | kubectl config delete-cluster <your-cluster-arn>
270 | kubectl config delete-context <your-cluster-context>
271 | kubectl config unset users.<user-name>
272 | ```
273 | 
274 | You can get those variables with the corresponding commands:
275 | - `your-cluster-arn`: `kubectl config get-clusters`
276 | - `your-cluster-context`: `kubectl config get-contexts`
277 | - `user-name`: `kubectl config view`, the name you want will look
278 | something like
279 | `arn:aws:eks:us-west-2:############:cluster/<your-cluster-name>`.
280 | 
281 | If you had a previous `kubectl` context set, you may also want to switch
282 | back to it with
283 | 
284 | ```
285 | kubectl config use-context <context-name>
286 | ```
287 | 
--------------------------------------------------------------------------------
/aws-examples/minimal-deployment-tutorial/README.md:
--------------------------------------------------------------------------------
1 | # Minimal Deployment Tutorial
2 | 
3 | This example is meant as a small introduction to Terraform. It is a very
4 | high-level overview of some of the concepts and should not replace
5 | [Terraform's tutorials](https://learn.hashicorp.com/collections/terraform/aws-get-started).
6 | A lot of the content in this README is paraphrased from
7 | Terraform's documentation.
8 | 
9 | ## Why Terraform?
10 | 
11 | Terraform is a tool we use to deploy cloud infrastructure. This infrastructure
12 | can be as simple as a single computer in the cloud or as complex as hundreds
13 | of machines spinning up on demand for high-performance computing.
14 | 
15 | ![Terraform deployment diagram](https://github.com/pangeo-data/terraform-deploy/blob/master/aws-examples/minimal-deployment-tutorial/terraform-aws-vpc-eks-deployment.png?raw=true)
16 | 
17 | Shown above is an example deployment - the second example in this tutorial.
18 | The network is set up, a cluster is deployed into it, and a JupyterHub can
19 | be put into the cluster. A more complex version of this was deployed for
20 | 40-60 user hackweeks. You can find that in
21 | `aws-examples/hackweek-infrastructure/`.
22 | 
23 | Each cloud provider has its own web interface and its own
24 | command-line tool for more programmatic access. However, we prefer Terraform
25 | for two main reasons:
26 | 
27 | 1. Terraform is cloud-agnostic. If you know how to use Terraform to deploy AWS
28 | cloud infrastructure, you know how to use Terraform to deploy GCP cloud
29 | infrastructure. You can use it to deploy on many of the main cloud providers
30 | without changing tools, which lowers the barrier to trying out different
31 | clouds.
32 | 
33 | 2. Terraform gracefully tracks what it has created. This means that running
34 | the same commands multiple times does not cause errors or change the end
35 | result unless an incremental change is provided. Terraform looks at the
36 | difference between what you have deployed and what you want to deploy, then
37 | deploys only what it needs to. Tearing down your infrastructure is equally
38 | simple, and you won't miss removing anything as long as everything you
39 | deployed was deployed with Terraform.
40 | 
41 | ## Prerequisites
42 | 
43 | Before running any commands for this example, you should install a few things:
44 | 
45 | - [Terraform](https://www.terraform.io/downloads.html)
46 | - [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
47 | - [Helm](https://helm.sh/docs/intro/install/)
48 | - [AWS CLI](https://aws.amazon.com/cli/): As part of installing this, you
49 | will need to make an AWS account if you don't have one and
50 | [configure](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html)
51 | `awscli` to use that account.
52 | 
53 | You can then clone this repo:
54 | 
55 | ```
56 | git clone https://github.com/pangeo-data/terraform-deploy.git
57 | cd terraform-deploy/aws-examples/minimal-deployment-tutorial/
58 | ```
59 | 
60 | ## EC2 Example / Intro to Terraform
61 | 
62 | As a way to introduce some of the Terraform concepts, we will first deploy
63 | an Elastic Compute Cloud (EC2) instance, a single machine in AWS's cloud. The
64 | configuration for this section is in `ec2-intro-tutorial/`.
65 | 
66 | ### Infrastructure
67 | 
68 | The infrastructure is just an EC2 instance.
69 | 
70 | ### Terraform Components
71 | 
72 | #### Providers
73 | 
74 | Providers define how Terraform interacts with the resources and APIs of a
75 | given infrastructure platform. Here, we have the AWS provider to enable
76 | us to create AWS infrastructure.
77 | 
78 | #### Input Variables
79 | 
80 | Input variables act as parameters, giving a way for users to customize a
81 | deployment without altering the configuration code. This also enables users
82 | to share configurations with others more easily. Variables are defined in
83 | `ec2-intro-tutorial/ec2-variables.tf` and can be overridden in
84 | `ec2-intro-tutorial/your-ec2-values.tfvars`.
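As a sketch of how that fits together (the `deployment_name` variable is used in this tutorial, but the description and default below are illustrative rather than copied from `ec2-variables.tf`):

```
# In ec2-variables.tf: declare the variable, optionally with a default
variable "deployment_name" {
  description = "Name used to tag the resources in this deployment"
  default     = "terraform-intro"
}

# In your-ec2-values.tfvars: override the default with your own value
deployment_name = "my-first-deployment"
```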
85 | 
86 | #### Data Sources
87 | 
88 | Data sources fetch and compute data for use in Terraform's configuration. We
89 | have a data source here to find an Amazon Machine Image ID that we use for
90 | our EC2 instance.
91 | 
92 | #### Resources
93 | 
94 | Resources describe one or more infrastructure objects. Our only resource here
95 | is the EC2 instance, seen in `ec2-intro-tutorial/ec2-main.tf`.
96 | 
97 | #### Output Values
98 | 
99 | Output values are the equivalent of return values. We use them here to print
100 | relevant information about the deployment once it is deployed, such as the
101 | AMI ID we used and the Amazon Resource Name (ARN) of our EC2 instance.
102 | Output values are defined for this folder in `ec2-intro-tutorial/ec2-outputs.tf`.
103 | Every data source and resource block also has outputs (see below).
104 | 
105 | #### Variable Referencing
106 | 
107 | Terraform adds functionality by allowing programmatic references
108 | to resources' and data sources' outputs. We use this here to set the
109 | AMI for the EC2 instance to be the value that our data source found.
110 | Also, the name of the EC2 instance is the input variable `deployment_name`.
111 | 
112 | ### Deployment
113 | 
114 | Deployment expects that you have AWS credentials with all the necessary
115 | permissions. Permission definitions are provided for the other examples and
116 | the main repo, but are not included here. The easiest option is to have admin
117 | privileges on your AWS account.
118 | 
119 | #### Set Up Input Variables
120 | 
121 | Input variables are defined in `ec2-intro-tutorial/ec2-variables.tf`. Their
122 | values can be overwritten by specifying entries in
123 | `ec2-intro-tutorial/your-ec2-values.tfvars`. When you run a plan, apply, or
124 | destroy command, you can supply these values to Terraform with
125 | `--var-file=your-ec2-values.tfvars`.
126 | 
127 | #### Command: Plan
128 | 
129 | The `terraform plan` command creates and shows Terraform's execution plan.
130 | If you have not deployed anything yet, this will be all the resources
131 | Terraform plans to deploy. If you have deployed some resources, this plan
132 | will show what Terraform needs to deploy or change to get to the new
133 | configuration. If you have deployed everything, the plan will indicate
134 | that everything is up-to-date.
135 | 
136 | To look at the execution plan, run:
137 | 
138 | ```
139 | cd ec2-intro-tutorial/
140 | terraform init
141 | terraform plan --var-file=your-ec2-values.tfvars
142 | ```
143 | 
144 | #### Command: Apply
145 | 
146 | The `terraform apply` command applies the changes needed to reach the desired
147 | configuration state. Its behavior is similar to the plan command:
148 | every time you run `terraform apply`, the plan will be displayed before
149 | prompting you to accept the actions.
150 | 
151 | To deploy the configuration, run:
152 | 
153 | ```
154 | cd ec2-intro-tutorial/
155 | terraform init
156 | terraform apply --var-file=your-ec2-values.tfvars
157 | ```
158 | 
159 | The first two lines are unnecessary if you already ran them in the Command: Plan
160 | section.
161 | 
162 | ### Next Steps
163 | 
164 | If you want to connect to this EC2 instance, there are two options:
165 | 
166 | 1. Connect to it through the EC2 console. AWS Documentation says:
167 | > You can connect using EC2 Instance Connect with just a valid username.
You
168 | can connect using Session Manager if you have been granted the necessary
169 | permissions. I have had some difficulties doing this, though.
170 | 
171 | 2. Create a key pair in the AWS console. You will then have to associate it
172 | with your EC2 instance; see more on that in the
173 | [Terraform documentation](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/instance#key_name).
174 | If you create the key pair, you will need to use the AWS console to delete it.
175 | Terraform does not manage the key pair for you.
176 | 
177 | ### Tear-Down
178 | 
179 | When you are ready to tear down the EC2 example, run:
180 | 
181 | ```
182 | terraform destroy --var-file=your-ec2-values.tfvars
183 | ```
184 | 
185 | and type `yes` when prompted.
186 | 
187 | ## Simple Cluster Deployment
188 | 
189 | This configuration deploys infrastructure that can support a
190 | JupyterHub, though it is not recommended for much more than testing.
191 | We use Terraform to deploy it on AWS.
192 | 
193 | ![Terraform deployment diagram](https://github.com/pangeo-data/terraform-deploy/blob/master/aws-examples/minimal-deployment-tutorial/terraform-aws-vpc-eks-deployment.png?raw=true)
194 | 
195 | Here is a visualization of the infrastructure that we will deploy, also
196 | seen above.
197 | 
198 | ### Infrastructure
199 | 
200 | This example is a minimally-configured deployment of:
201 | - A Virtual Private Cloud (VPC), essentially the network in the cloud that
202 | the cluster lives in. Security is a high priority for this.
203 | - An Elastic Kubernetes Service (EKS) cluster, with one machine for the core
204 | JupyterHub services and one for the users.
205 | 
206 | This configuration utilizes a Terraform module for each of these,
207 | enabling minimal configuration on our side without sacrificing security
208 | or completeness.
209 | 
210 | ### Terraform Components
211 | 
212 | #### Modules
213 | 
214 | In addition to the components used in the previous examples, we have modules.
215 | Modules are collections of resources that are used together. They enable us to
216 | focus on the high-level decisions and leave the harder parts of infrastructure
217 | to the people that have made the module.
218 | 
219 | Both modules require the AWS provider, and the EKS module requires several
220 | others, listed in `main.tf`.
221 | 
222 | Both modules also have outputs. Some are referenced in `outputs.tf` and others
223 | are variable references between modules! The line
224 | 
225 | ```
226 | vpc_id = module.vpc.vpc_id
227 | ```
228 | 
229 | in `main.tf` takes in an output of the VPC module and uses it as an input
230 | for the EKS module. This also helps Terraform realize there is a resource
231 | dependency, so it will create the VPC resources before the EKS ones.
232 | 
233 | The VPC and EKS cluster are both deployed using modules that were developed
234 | by the Terraform open-source community!
235 | 
236 | ### Deployment
237 | 
238 | Deployment expects that you have AWS credentials with all the necessary
239 | permissions. Permission definitions are provided for the other examples and
240 | the main repo, but are not included here. The easiest option is to have admin
241 | privileges on your AWS account.
242 | 
243 | Input variables are defined in `variables.tf`. Their values can be
244 | overwritten by specifying entries in `your-cluster.tfvars`. When you run a
245 | plan, apply, or destroy command, you can supply these values to Terraform with
246 | `--var-file=your-cluster.tfvars`.
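For illustration, a filled-in `your-cluster.tfvars` might look like the following; the values are placeholders, and `region` and `profile` can be left at their defaults from `variables.tf` if those suit your account:

```
region          = "us-west-2"
profile         = "default"
deployment_name = "my-jupyterhub-test"
```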
247 | 
248 | To deploy, run:
249 | 
250 | ```
251 | cd minimal-deployment-tutorial
252 | terraform init
253 | terraform apply --var-file=your-cluster.tfvars
254 | ```
255 | 
256 | And type `yes` when prompted.
257 | 
258 | There should be 41 resources that need to be created. The
259 | infrastructure creation process can take 15+ minutes, mostly because of
260 | the EKS cluster. The infrastructure will cost a few dollars a day to keep
261 | running, so keep that in mind.
262 | 
263 | ### Next Steps
264 | 
265 | If you want to interact with the cluster after it is created, you will
266 | need to configure `kubectl`. Do this with:
267 | 
268 | ```
269 | aws eks update-kubeconfig --name=<your-cluster-name> --profile=<your-profile>
270 | ```
271 | 
272 | You should now be able to use `kubectl` and `helm` commands to inspect
273 | the cluster or deploy software onto it.
274 | 
275 | ### Tear-Down
276 | 
277 | If you put anything onto the infrastructure that Terraform created,
278 | such as a Helm release of JupyterHub, remove that before continuing.
279 | 
280 | To tear down the infrastructure, run:
281 | 
282 | ```
283 | cd minimal-deployment-tutorial
284 | terraform destroy --var-file=your-cluster.tfvars
285 | ```
286 | 
287 | and type `yes` when prompted.
288 | 
289 | If you set your local kubeconfig to point to this cluster, you can
290 | remove that with the following:
291 | 
292 | ```
293 | kubectl config delete-cluster <your-cluster-arn>
294 | kubectl config delete-context <your-cluster-context>
295 | kubectl config unset users.<user-name>
296 | ```
297 | 
298 | You can get those variables with the corresponding commands:
299 | 
300 | - `your-cluster-arn`: `kubectl config get-clusters`
301 | - `your-cluster-context`: `kubectl config get-contexts`
302 | - `user-name`: `kubectl config view`, the name you want will look
303 | something like
304 | `arn:aws:eks:us-west-2:############:cluster/<your-cluster-name>`.
305 | 
306 | If you had a previous `kubectl` context set, you may also want to switch
307 | back to it with
308 | 
309 | ```
310 | kubectl config use-context <context-name>
311 | ```
312 | 
313 | ## Wrap-Up
314 | 
315 | In this tutorial, we learned what Terraform is and why it is useful. After
316 | learning the basics, we deployed and tore down a single machine in AWS's
317 | cloud, then a scalable cluster of machines! I hope you learned something
318 | useful.
319 | 
320 | Feel free to get involved in this repo or ask me, Sebastian Alvis (@salvis2),
321 | for more information.
322 | 
323 | 
--------------------------------------------------------------------------------