├── .gitignore
├── Makefile
├── README.md
├── images
│   ├── high_level_design.jpg
│   └── infra_schema.jpg
└── stack
    ├── ansible
    │   ├── ansible.cfg
    │   ├── inventories
    │   │   └── dev
    │   │       └── aws_ec2.yml
    │   ├── playbooks
    │   │   ├── airbyte_playbook.yml
    │   │   └── metabase_playbook.yml
    │   └── roles
    │       └── docker
    │           └── tasks
    │               └── main.yml
    ├── data_jobs
    │   ├── Dockerfile
    │   ├── jobs
    │   │   └── hello_jobs.py
    │   └── setup.py
    ├── pulumi
    │   ├── Pulumi.dev.yaml
    │   ├── Pulumi.yaml
    │   ├── __main__.py
    │   ├── config.py
    │   ├── instance
    │   │   ├── config.py
    │   │   ├── instance.py
    │   │   ├── instance_profile.py
    │   │   └── security_groups.py
    │   ├── jobs
    │   │   ├── cloudwatch.py
    │   │   ├── config.py
    │   │   ├── ecr.py
    │   │   ├── ecs_cluster.py
    │   │   └── scheduled_job.py
    │   ├── network
    │   │   ├── __init__.py
    │   │   ├── az.py
    │   │   ├── config.py
    │   │   ├── nat_gateway.py
    │   │   ├── private_subnets.py
    │   │   ├── public_subnets.py
    │   │   ├── security_groups.py
    │   │   ├── vpc.py
    │   │   └── vpc_endpoints.py
    │   └── warehouse
    │       ├── config.py
    │       ├── data_warehouse.py
    │       ├── security_groups.py
    │       └── subnet_groups.py
    └── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv_mini_data_stack
data_stack_venv
.python-version
stack/__pycache__/**
stack/__pycache__
stack/pulumi/__pycache__
*-keypair
*-keypair.pub
stack/pulumi/network/__pycache__
stack/pulumi/instance/__pycache__
stack/pulumi/ecs/__pycache__
stack/pulumi/jobs/__pycache__
stack/pulumi/warehouse/__pycache__

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: setup

setup:
	@pythonVersion=$$(python3 --version | cut -d ' ' -f 2); \
	majorVersion=$$(echo $$pythonVersion | cut -d '.' -f 1); \
	minorVersion=$$(echo $$pythonVersion | cut -d '.' -f 2); \
	if [ $$majorVersion -lt 3 ] || { [ $$majorVersion -eq 3 ] && [ $$minorVersion -lt 11 ]; }; then \
		echo "\033[0;31mThis repo is not tested on Python versions below 3.11. Your version: $$pythonVersion\033[0m"; \
	else \
		echo "\033[0;32mPython version is $$pythonVersion. Continuing with setup...\033[0m"; \
	fi
	@if [ ! -d "data_stack_venv" ]; then \
		echo "📦Creating virtual environment..."; \
		python3 -m venv data_stack_venv; \
	else \
		echo "Virtual environment already exists."; \
	fi
	@echo "📦Activating virtual environment..."
	. data_stack_venv/bin/activate && \
	pip install -r stack/requirements.txt
	@if [ ! -f "dev-keypair" ] && [ ! -f "dev-keypair.pub" ]; then \
		echo "🔐Generating SSH keypair..."; \
		ssh-keygen -f dev-keypair -N ""; \
	else \
		echo "SSH keypair already exists."; \
	fi

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Mini Modern Data Stack
Deploy a complete data stack that can run in production in just a couple of minutes.

![alt text](images/high_level_design.jpg "Stack HLD")

## Infrastructure
The stack is made to be deployed on AWS and tries to be as simple as possible without compromising security:
- instances are in a private subnet with no direct access to the internet
- a NAT Gateway is used to enable instances to make requests to external services
- SSH ports on the instances are not open
- connecting to an instance or using port forwarding is done through AWS Session Manager
(which acts somewhat like a bastion)
- an SSH keypair is still needed in order to run the Ansible playbooks on the instances
- secrets for ECS tasks are managed by AWS Secrets Manager and injected securely at task launch
- there are 2 private subnets, because RDS always needs 2 subnets, even for a single-AZ deployment

![alt text](images/infra_schema.jpg "Infra schema")

## Deploying the stack

**Here are the prerequisites to run this deployment**
- an AWS account with the AWS CLI configured
(follow the [instructions here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html),
then run `aws configure`); the AWS account needs a role that allows creating the necessary resources
- the aws ssm plugin to securely manage instances
(follow the [instructions here to install it](https://docs.aws.amazon.com/systems-manager/latest/userguide/install-plugin-macos-overview.html)
and [here to set up your local ssh config accordingly](https://docs.aws.amazon.com/systems-manager/latest/userguide/session-manager-getting-started-enable-ssh-connections.html#ssh-connections-enable))
- a Pulumi Cloud account in order to manage the state of your infrastructure.
It's completely free and you can do pretty much everything with the free tier
([create an account here](https://app.pulumi.com/signup))
- the Pulumi CLI installed on your machine ([install the CLI](https://www.pulumi.com/docs/install/))
- the Docker daemon running on your computer


⚠️ Be careful to be on the right AWS profile (if you have several) when executing commands.
If you need to switch profiles, use the command below:
```bash
export AWS_PROFILE=<profile_name>
```

### 1. Set up the environment

Clone the repo and cd into it
```bash
git clone git@github.com:jeremySrgt/mini-modern-data-stack.git
cd mini-modern-data-stack
```

Execute the make script to create a Python virtual env and generate SSH keys
```bash
make setup
```

Activate the virtual env
```bash
source data_stack_venv/bin/activate
```

*Even though we won't expose the SSH port of our instances,
SSH keys are still needed to configure them with Ansible.*
*Don't worry, there are .gitignore rules to prevent them from getting pushed to GitHub.*


### 2. Deploy the infra

Now cd into *stack > pulumi* and log in to Pulumi Cloud with your credentials

```bash
pulumi login
```

Create a stack named dev (actually, you can name it whatever you want, but you will then need to set all the config values below,
as the default values will not exist for a different stack name)
```bash
pulumi stack init dev
```

#### 2.1 Set environment variables

You now need to set a couple of environment variables and secrets.
Here is the list:

| Config name              | Required | Default value          | Description                                  |
|--------------------------|:--------:|:----------------------:|----------------------------------------------|
| region                   | true     | eu-west-3              | the region where the stack will be deployed  |
| env                      | false    | dev                    | name of the env you are deploying to         |
| public_key_path          | false    | ../../dev-keypair.pub  | path to your public key (ending with .pub)   |
| airbyte_instance_type    | false    | t3.medium              | type of Airbyte instance                     |
| metabase_instance_type   | false    | t3.small               | type of Metabase instance                    |
| warehouse_instance_class | false    | db.t3.micro            | type of RDS instance class for the warehouse |
| warehouse_db_name        | false    | company_data_warehouse | name of the default database                 |
| dwh_master_user*         | true     |                        | master user of the database                  |
| dwh_master_password*     | true     |                        | master user's password of the database       |

To set a config variable:
```bash
pulumi config set <config_name> <value>
```

Config names marked with a * are secrets and need to be set as such:
```bash
pulumi config set --secret <config_name> <value>
```
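
For example, keeping the default region and using made-up warehouse credentials (replace every value with your own), the configuration could look like this:
```bash
pulumi config set region eu-west-3
pulumi config set metabase_instance_type t3.small
pulumi config set --secret dwh_master_user warehouse_admin
pulumi config set --secret dwh_master_password 'a-strong-password'
```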

#### 2.2 Deploy

Once that's all done, you can first preview what will be deployed
```bash
pulumi preview
```

Then deploy everything
```bash
pulumi up --yes
```

> It takes about 10-15 minutes to deploy all the resources; provisioning the RDS database is what takes
most of the time.

### 3. Configure Airbyte and Metabase

When it's done, the last thing remaining is to configure our instances and deploy Metabase and Airbyte on them.

cd into the ansible directory, then run the following commands:

```bash
ansible-playbook --private-key ../../dev-keypair -i inventories/dev/aws_ec2.yml playbooks/airbyte_playbook.yml
```
and
```bash
ansible-playbook --private-key ../../dev-keypair -i inventories/dev/aws_ec2.yml playbooks/metabase_playbook.yml
```
> It takes about 5-8 minutes to configure Airbyte. Metabase should be a bit faster.

> Note that, by default, Ansible will look for instances in the eu-west-3 region.
If you are deploying to another region, set the AWS_REGION env var in your terminal session,
for example: `export AWS_REGION=eu-central-1`


🎉 If everything worked well, you should now have a complete mini data stack running on AWS,
that can sync data from sources to your warehouse, run dbt and Python transformations,
and visualize data thanks to Metabase!

## Accessing Airbyte and Metabase
Airbyte and Metabase are quite sensitive instances because they have access to your company's data.
That is why they are not exposed to the internet, for security reasons.
To access them you need something similar to an SSH tunnel to forward ports.

Since we are using AWS Session Manager to manage our instances,
we can securely connect to them with the aws ssm plugin.
We just need the instance ID (Pulumi returned it as an output after the deployment).
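
If you did not write the instance IDs down, you can read the stack outputs back at any time from the *stack > pulumi* directory (these are the output names exported in `__main__.py`):
```bash
pulumi stack output "Metabase instance ID"
pulumi stack output "Airbyte instance ID"
```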

To start a port forwarding session:
```bash
aws ssm start-session --target <instance_id> --document-name AWS-StartPortForwardingSession --parameters '{"portNumber":["8000"],"localPortNumber":["8000"]}'
```
**Remember to correctly install and set up the aws ssm plugin in order to connect to your instances and use port
forwarding capabilities. It basically means adding the following lines to your ~/.ssh/config**
```
host i-* mi-*
    ProxyCommand sh -c "aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'"
```

Now you should be able to access Airbyte on http://localhost:8000/

Do the same for Metabase, but replace the port number with `3000`


Learn more about [configuring Airbyte sync](https://airbyte.com/how-to-sync) and [setting up Metabase](https://www.metabase.com/docs/latest/configuring-metabase/setting-up-metabase)

> Don't close the aws ssm session until you are done working on your instances

## Monthly cost
*This estimate is based on the eu-west-3 region, with 10 ECS tasks running for 5 minutes each day, 1 TB of data processed,
and all the default instance types, without taking any free tier into account*

**152.2$/month**

*EC2 cost = 51$/month*

*RDS cost = 15$/month*

*Fargate task cost = 0.20$/month*

*NAT Gateway = 86$/month*


## Warning
You should know that with this deployment, Airbyte and Metabase don't have a database set up to store their configuration
and user data. The only storage they have is the volume attached to their instance. It means that the Questions and
Dashboards you created in Metabase, and the connections you set up in Airbyte, could be lost if the volume is destroyed.

To prevent any unintended deletion of Metabase and Airbyte data, the volumes associated with their instances are kept even
if you destroy the Pulumi stack. Delete them from the AWS console once you have destroyed the stack through Pulumi.

It is recommended to set up a proper database for Airbyte and Metabase to store their configuration and data.
You can learn more about how to do this for Metabase
[here](https://www.metabase.com/docs/latest/installation-and-operation/configuring-application-database)
and for Airbyte [here](https://docs.airbyte.com/operator-guides/configuring-airbyte-db).
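
For Metabase, a minimal sketch of what this involves is adding the standard `MB_DB_*` environment variables to the `env:` block of the container started in `stack/ansible/playbooks/metabase_playbook.yml`. The host, database name and credentials below are placeholders for a Postgres database you would have to provision yourself (this repo does not create one for you):
```yaml
# Extra entries for the env: block of the Metabase docker_container task
MB_DB_TYPE: "postgres"
MB_DB_HOST: "your-postgres-host"   # placeholder: a Postgres host reachable from the instance
MB_DB_PORT: "5432"
MB_DB_DBNAME: "metabase_app_db"    # placeholder database name
MB_DB_USER: "metabase"             # placeholder credentials
MB_DB_PASS: "change-me"
```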

## Destroying the stack 😢
If, for any reason, you want to destroy the stack and all the related resources:
```bash
pulumi destroy
```
*Remember, you still need to delete the two volumes associated with the Airbyte and Metabase instances*

## Enhancements
Here is a list of enhancements to be made, either to follow engineering best practices or to make the stack more
resilient and scalable:

* [ ] Configure a CI/CD deployment of the stack
* [ ] Tag Docker images based on the commit SHA, for better version tracking
* [ ] Migrate the Metabase and Airbyte deployments to ECS for better resiliency and less instance
configuration (Airbyte doesn't support ECS out of the box for now, so it's a bit of a challenge)

--------------------------------------------------------------------------------
/images/high_level_design.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremySrgt/mini-modern-data-stack/7004b55122e18714d9cb85c2ae91c41d0f54407a/images/high_level_design.jpg

--------------------------------------------------------------------------------
/images/infra_schema.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremySrgt/mini-modern-data-stack/7004b55122e18714d9cb85c2ae91c41d0f54407a/images/infra_schema.jpg

--------------------------------------------------------------------------------
/stack/ansible/ansible.cfg:
--------------------------------------------------------------------------------
[defaults]
host_key_checking = False
enable_plugins = aws_ec2
remote_user = ec2-user

[ssh_connection]
ssh_args = -o ProxyCommand="sh -c \"aws ssm start-session --target %h --document-name AWS-StartSSHSession --parameters 'portNumber=%p'\""

--------------------------------------------------------------------------------
/stack/ansible/inventories/dev/aws_ec2.yml:
--------------------------------------------------------------------------------
plugin: aws_ec2
aws_profile: "{{ lookup('env', 'AWS_PROFILE') | default('default', true) }}"
# You can set a specific region to look in, it speeds up the inventory creation
#regions:
#  - eu-west-3
filters:
  tag:Name:
    - metabase-instance
    - airbyte-instance
  instance-state-name: running
hostnames:
  - instance-id
keyed_groups:
  - key: tags['Name']
    prefix: group
compose:
  ansible_host: instance_id
use_extra_vars: yes

--------------------------------------------------------------------------------
/stack/ansible/playbooks/airbyte_playbook.yml:
--------------------------------------------------------------------------------
- hosts: group_airbyte_instance
  gather_facts: false
  become: true
  roles:
    - roles/docker
  tasks:
    - name: Install git
      yum:
        name: 'git'
        state: present
    - name: Install Docker Compose Module for Python
      become_user: ec2-user
      ansible.builtin.pip:
        name: docker-compose
    - name: Create airbyte folder
      ansible.builtin.file:
        path: airbyte
        state: directory
        mode: 'u+rwx'
    - name: Get airbyte
      git:
        repo: 'https://github.com/airbytehq/airbyte.git'
        depth: 1
        dest: airbyte
        version: v0.50.47
      register: airbyte_version
    - name: Download necessary Airbyte file
      ansible.builtin.shell: ./run-ab-platform.sh --refresh
      args:
        chdir: airbyte/
    - name: Start Airbyte
      become_user: ec2-user
      environment:
        BASIC_AUTH_USERNAME: ""
        BASIC_AUTH_PASSWORD: ""
        SYNC_JOB_RETRIES_COMPLETE_FAILURES_MAX_SUCCESSIVE: 1
        SYNC_JOB_RETRIES_PARTIAL_FAILURES_MAX_SUCCESSIVE: 1
        SYNC_JOB_MAX_TIMEOUT_DAYS: 1
        MAX_SPEC_WORKERS: 2
        MAX_CHECK_WORKERS: 2
        MAX_SYNC_WORKERS: 2
        MAX_DISCOVER_WORKERS: 2
      community.docker.docker_compose:
        project_src: airbyte
        recreate: always
        state: present
    - name: Prune old images version
      community.docker.docker_prune:
        images: true

--------------------------------------------------------------------------------
/stack/ansible/playbooks/metabase_playbook.yml:
--------------------------------------------------------------------------------
- hosts: group_metabase_instance
  gather_facts: false
  become: true
  roles:
    - roles/docker
  tasks:
    - name: Start metabase on docker
      become: false
      community.docker.docker_container:
        name: 'metabase'
        image: 'metabase/metabase:v0.47.9'
        env:
          MB_PASSWORD_COMPLEXITY: "strong"
          MB_APPLICATION_DB_MAX_CONNECTION_POOL_SIZE: "80"
          MAX_SESSION_AGE: "10080"
        ports:
          - "3000:3000"
        state: started

--------------------------------------------------------------------------------
/stack/ansible/roles/docker/tasks/main.yml:
--------------------------------------------------------------------------------
- name: Install docker
  yum:
    name: "docker-20.10.17"
    state: present
- name: Service docker
  service:
    name: docker
    state: started
    enabled: yes
- name: Add docker ec2 user
  user:
    name: "ec2-user"
    append: yes
    groups: docker
- name: Install pip3
  yum:
    name: "python3-pip"
    state: present
- name: Install Docker Module for Python
  become_user: ec2-user
  ansible.builtin.pip:
    name: "docker==6.1.3"

--------------------------------------------------------------------------------
/stack/data_jobs/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.11-slim

COPY . .

RUN pip install .
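# Note: no CMD/ENTRYPOINT is defined here on purpose; the ECS task definition built in
# stack/pulumi/jobs/scheduled_job.py supplies the container command (e.g. ["python", "jobs/hello_jobs.py"]).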
6 | -------------------------------------------------------------------------------- /stack/data_jobs/jobs/hello_jobs.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | load_dotenv() 5 | 6 | 7 | def say_hello(): 8 | # you can access env var and secrets that you defined in task definition like so: 9 | # os.getenv("DATAWAREHOUSE_DB") 10 | print("Hello Jobs !") 11 | return 0 12 | 13 | 14 | if __name__ == "__main__": 15 | say_hello() 16 | -------------------------------------------------------------------------------- /stack/data_jobs/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='data-jobs', 5 | version='0.1.0', 6 | packages=find_packages(), 7 | url='', 8 | license='', 9 | author='Jérémy Surget', 10 | author_email='', 11 | description='', 12 | install_requires=[ 13 | 'python-dotenv==1.0.1' 14 | ], 15 | 16 | extras_require={'dev': []}, 17 | python_requires='>=3.11', 18 | ) 19 | -------------------------------------------------------------------------------- /stack/pulumi/Pulumi.dev.yaml: -------------------------------------------------------------------------------- 1 | config: 2 | aws:region: eu-west-3 3 | -------------------------------------------------------------------------------- /stack/pulumi/Pulumi.yaml: -------------------------------------------------------------------------------- 1 | name: mini-data-stack 2 | runtime: 3 | name: python 4 | description: A complete mini data stack to run in production 5 | -------------------------------------------------------------------------------- /stack/pulumi/__main__.py: -------------------------------------------------------------------------------- 1 | import pulumi 2 | import warehouse.data_warehouse as data_warehouse 3 | import jobs.ecr as ecr 4 | import jobs.ecs_cluster as ecs_cluster 5 | import network.vpc_endpoints 6 | from jobs.scheduled_job import create_scheduled_job 7 | from network.private_subnets import private_subnet 8 | from network.az import primary_az 9 | from instance.instance import ec2_instance 10 | from config import METABASE_INSTANCE_TYPE, AIRBYTE_INSTANCE_TYPE 11 | 12 | metabase_instance = ec2_instance( 13 | resource_name="metabase-instance", 14 | instance_type=METABASE_INSTANCE_TYPE, 15 | az=primary_az, 16 | subnet_id=private_subnet.id, 17 | ) 18 | 19 | airbyte_instance = ec2_instance( 20 | resource_name="airbyte-instance", 21 | instance_type=AIRBYTE_INSTANCE_TYPE, 22 | az=primary_az, 23 | subnet_id=private_subnet.id, 24 | ) 25 | 26 | create_scheduled_job( 27 | name="hello_jobs", 28 | file_name="hello_jobs.py", 29 | schedule="cron(30 6 * * ? 
*)", # Every day at 6.30 AM UTC 30 | ) 31 | 32 | pulumi.export("Metabase instance ID", metabase_instance.id) 33 | pulumi.export("Airbyte instance ID", airbyte_instance.id) 34 | -------------------------------------------------------------------------------- /stack/pulumi/config.py: -------------------------------------------------------------------------------- 1 | from pulumi import Config 2 | 3 | cfg = Config() 4 | 5 | METABASE_INSTANCE_TYPE = cfg.get("metabase_instance_type", default="t3.micro") 6 | AIRBYTE_INSTANCE_TYPE = cfg.get("airbyte_instance_type", default="t3.medium") 7 | -------------------------------------------------------------------------------- /stack/pulumi/instance/config.py: -------------------------------------------------------------------------------- 1 | from pulumi import Config 2 | 3 | cfg = Config() 4 | 5 | ENV = cfg.get("env", default="dev") 6 | PUBLIC_KEY_PATH = cfg.get("public_key_path", "../../dev-keypair.pub") 7 | -------------------------------------------------------------------------------- /stack/pulumi/instance/instance.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from instance.instance_profile import ec2_instance_profile 3 | from instance.security_groups import ( 4 | allow_443_outbound_to_private_subnet_cidr, 5 | allow_outbound_to_anywhere, 6 | ) 7 | from network.vpc import internet_gateway 8 | from instance.config import ENV, PUBLIC_KEY_PATH 9 | 10 | public_key = open(PUBLIC_KEY_PATH).read() 11 | 12 | data_instance_keypair = aws.ec2.KeyPair( 13 | "data_instance_keypair", 14 | public_key=public_key, 15 | tags={"Name": f"{ENV}-data-instance-keypair", "env": ENV}, 16 | ) 17 | 18 | 19 | def ec2_instance( 20 | resource_name: str, 21 | subnet_id: str, 22 | instance_type: str, 23 | az: str, 24 | ) -> aws.ec2.Instance: 25 | instance = aws.ec2.Instance( 26 | resource_name, 27 | instance_type=instance_type, 28 | ami="ami-0cb7af6ec2ad3c332", 29 | iam_instance_profile=ec2_instance_profile.name, 30 | availability_zone=az, 31 | subnet_id=subnet_id, 32 | vpc_security_group_ids=[ 33 | allow_443_outbound_to_private_subnet_cidr.id, 34 | allow_outbound_to_anywhere.id, 35 | ], 36 | key_name=data_instance_keypair.id, 37 | root_block_device=aws.ec2.InstanceRootBlockDeviceArgs( 38 | delete_on_termination=False, 39 | # To keep Metabase and Airbyte data in case you didn't setup a proper persistent storage for them, like a db 40 | encrypted=True, 41 | volume_size=20, 42 | volume_type="gp3", 43 | tags={"Name": f"{ENV}-{resource_name}-volume", "env": ENV}, 44 | ), 45 | tags={"Name": f"{resource_name}", "env": ENV}, 46 | ) 47 | 48 | return instance 49 | -------------------------------------------------------------------------------- /stack/pulumi/instance/instance_profile.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pulumi_aws as aws 3 | from instance.config import ENV 4 | 5 | ec2_instance_role = aws.iam.Role( 6 | "ec2-instance-role", 7 | name=f"{ENV}-ec2-instance-role", 8 | assume_role_policy=json.dumps( 9 | { 10 | "Version": "2012-10-17", 11 | "Statement": [ 12 | { 13 | "Action": "sts:AssumeRole", 14 | "Principal": {"Service": "ec2.amazonaws.com"}, 15 | "Effect": "Allow", 16 | "Sid": "", 17 | } 18 | ], 19 | } 20 | ), 21 | tags={ 22 | "Name": f"{ENV}-ec2-instance-role", "env": ENV 23 | } 24 | ) 25 | 26 | aws.iam.RolePolicyAttachment( 27 | "ec2-instance-role-policy-attachment-ssm-role", 28 | role=ec2_instance_role.name, 29 | 
policy_arn="arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore", 30 | ) 31 | 32 | ec2_instance_profile = aws.iam.InstanceProfile( 33 | "ec2-instance-profile", 34 | name=f"{ENV}-ec2-instance-profile", 35 | role=ec2_instance_role.name, 36 | tags={ 37 | "Name": f"{ENV}-ec2-instance-profile", "env": ENV 38 | } 39 | ) 40 | -------------------------------------------------------------------------------- /stack/pulumi/instance/security_groups.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from network.vpc import data_vpc 3 | from network.private_subnets import private_subnet 4 | from instance.config import ENV 5 | 6 | allow_443_outbound_to_private_subnet_cidr = aws.ec2.SecurityGroup( 7 | "allow_443_outbound_to_private_subnet_cidr", 8 | description="Allow 443 tcp outbound traffic to private subnet cidr", 9 | vpc_id=data_vpc.id, 10 | tags={"Name": f"{ENV}-allow-443-outbound-to-private-subnet-cidr", "env": ENV}, 11 | ) 12 | 13 | aws.vpc.SecurityGroupEgressRule( 14 | "allow_443_outbound_to_private_subnet_cidr_egress_rule", 15 | security_group_id=allow_443_outbound_to_private_subnet_cidr.id, 16 | ip_protocol="tcp", 17 | cidr_ipv4=private_subnet.cidr_block, 18 | from_port=443, 19 | to_port=443, 20 | ) 21 | 22 | allow_outbound_to_anywhere = aws.ec2.SecurityGroup( 23 | "alllow_outbound_to_anywhere", 24 | description="Allow outbound traffic to anywhere, with the route table, this will go through NAT gw", 25 | vpc_id=data_vpc.id, 26 | tags={"Name": f"{ENV}-alllow-outbound-to-anywhere", "env": ENV}, 27 | ) 28 | 29 | aws.vpc.SecurityGroupEgressRule( 30 | "alllow_outbound_to_anywhere_egress_rule", 31 | security_group_id=allow_outbound_to_anywhere.id, 32 | ip_protocol="-1", 33 | cidr_ipv4="0.0.0.0/0", 34 | ) 35 | -------------------------------------------------------------------------------- /stack/pulumi/jobs/cloudwatch.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pulumi_aws as aws 3 | from jobs.config import ENV 4 | 5 | cloudwatch_ecs_log_group = aws.cloudwatch.LogGroup( 6 | "cloudwatch_ecs_log_group", 7 | name=f"{ENV}-ecs-log-group", 8 | retention_in_days=30, 9 | tags={"Name": f"{ENV}-ecs-log-group", "env": ENV}, 10 | ) 11 | 12 | cloudwatch_ecs_role = aws.iam.Role( 13 | "cloudwatch_ecs_role", 14 | assume_role_policy=json.dumps( 15 | { 16 | "Version": "2012-10-17", 17 | "Statement": [ 18 | { 19 | "Action": "sts:AssumeRole", 20 | "Effect": "Allow", 21 | "Sid": "", 22 | "Principal": {"Service": "ecs-tasks.amazonaws.com"}, 23 | }, 24 | { 25 | "Action": "sts:AssumeRole", 26 | "Effect": "Allow", 27 | "Sid": "", 28 | "Principal": {"Service": "events.amazonaws.com"}, 29 | }, 30 | ], 31 | } 32 | ), 33 | tags={"Name": f"{ENV}-ecs-cw-role", "env": ENV}, 34 | ) 35 | 36 | cloudwatch_managed_policy_attach = aws.iam.RolePolicyAttachment( 37 | "cloudwatch_managed_policy_attach", 38 | role=cloudwatch_ecs_role.name, 39 | policy_arn="arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceEventsRole", 40 | ) 41 | 42 | 43 | def cloud_watch_pass_role_policy(task_role_arn): 44 | return ( 45 | json.dumps( 46 | { 47 | "Version": "2012-10-17", 48 | "Statement": [ 49 | { 50 | "Action": ["iam:PassRole"], 51 | "Effect": "Allow", 52 | "Resource": [task_role_arn], 53 | } 54 | ], 55 | } 56 | ), 57 | ) 58 | 59 | 60 | cloud_watch_policy = aws.iam.Policy( 61 | "cloudwatch_policy_pass_role", 62 | description="Cloudwatch passRole for ecs task policy", 63 | policy=json.dumps( 64 | { 65 | "Version": 
"2012-10-17", 66 | "Statement": [ 67 | { 68 | "Action": ["iam:PassRole"], 69 | "Effect": "Allow", 70 | "Resource": "*", 71 | } 72 | ], 73 | } 74 | ), 75 | ) 76 | 77 | # Attach CloudWatch Policy 78 | cloudwatch_policy_attach = aws.iam.RolePolicyAttachment( 79 | "cloudwatch_pass_role_policy_attach", 80 | role=cloudwatch_ecs_role.name, 81 | policy_arn=cloud_watch_policy.arn, 82 | ) 83 | -------------------------------------------------------------------------------- /stack/pulumi/jobs/config.py: -------------------------------------------------------------------------------- 1 | from pulumi import Config 2 | 3 | cfg = Config() 4 | 5 | ENV = cfg.get("env", default="dev") 6 | -------------------------------------------------------------------------------- /stack/pulumi/jobs/ecr.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pulumi_aws as aws 3 | import pulumi_awsx as awsx 4 | from jobs.config import ENV 5 | 6 | data_ecr = aws.ecr.Repository( 7 | "data_job_image_repository", 8 | name="data_jobs", 9 | image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( 10 | scan_on_push=True, 11 | ), 12 | image_tag_mutability="MUTABLE", # Should be immutable but for simplicity we will stick to mutable 13 | force_delete=True, 14 | encryption_configurations=[ 15 | aws.ecr.RepositoryEncryptionConfigurationArgs(encryption_type="KMS") 16 | ], 17 | tags={"Name": f"{ENV}-data-jobs-ecr", "env": ENV}, 18 | ) 19 | 20 | aws.ecr.LifecyclePolicy( 21 | "data_ecr_lifecycle_policy", 22 | repository=data_ecr.name, 23 | policy=json.dumps( 24 | { 25 | "rules": [ 26 | { 27 | "rulePriority": 1, 28 | "description": "Keep last 30 images", 29 | "selection": { 30 | "tagStatus": "any", 31 | "countType": "imageCountMoreThan", 32 | "countNumber": 30, 33 | }, 34 | "action": {"type": "expire"}, 35 | } 36 | ] 37 | } 38 | ), 39 | ) 40 | 41 | data_jobs_image = awsx.ecr.Image( 42 | "data_jobs_image", 43 | repository_url=data_ecr.repository_url, 44 | context="../data_jobs", 45 | platform="linux/x86_64", 46 | ) 47 | -------------------------------------------------------------------------------- /stack/pulumi/jobs/ecs_cluster.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | import json 3 | 4 | from jobs.config import ENV 5 | 6 | data_ecs_cluster = aws.ecs.Cluster( 7 | "data_ecs_cluster", 8 | name=f"{ENV}-data-ecs-cluster", 9 | settings=[ 10 | aws.ecs.ClusterSettingArgs( 11 | name="containerInsights", 12 | value="enabled", 13 | ) 14 | ], 15 | tags={"Name": f"{ENV}-data-ecs-cluster", "env": ENV}, 16 | ) 17 | 18 | ecs_task_role = aws.iam.Role( 19 | "ecs_task_role", 20 | assume_role_policy=json.dumps( 21 | { 22 | "Version": "2012-10-17", 23 | "Statement": [ 24 | { 25 | "Action": "sts:AssumeRole", 26 | "Effect": "Allow", 27 | "Sid": "", 28 | "Principal": {"Service": "ecs-tasks.amazonaws.com"}, 29 | } 30 | ], 31 | } 32 | ), 33 | tags={"Name": f"{ENV}-ecs-task-role", "env": ENV}, 34 | ) 35 | 36 | ecs_task_policy_attach = aws.iam.RolePolicyAttachment( 37 | "ecs_task_policy_attach", 38 | role=ecs_task_role.name, 39 | policy_arn="arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", 40 | ) 41 | 42 | # If you pass Secret Manager secret to your fargate task, you need this policy to retrieve them 43 | 44 | # ecs_task_policy_secret_manager = aws.iam.Policy( 45 | # "ecs_task_policy_secret_manager", 46 | # description="Authorize ECS task to retrieve secret from secret manager", 47 | # 
policy=json.dumps( 48 | # { 49 | # "Version": "2012-10-17", 50 | # "Statement": [ 51 | # { 52 | # "Effect": "Allow", 53 | # "Action": ["secretsmanager:GetSecretValue"], 54 | # "Resource": [""], 55 | # } 56 | # ], 57 | # } 58 | # ), 59 | # ) 60 | 61 | # ecs_task_secret_manager_policy_attach = aws.iam.RolePolicyAttachment( 62 | # "ecs_task_secret_manager_allow_policy_attach", 63 | # role=ecs_task_role.name, 64 | # policy_arn=ecs_task_policy_secret_manager.arn, 65 | # ) 66 | -------------------------------------------------------------------------------- /stack/pulumi/jobs/scheduled_job.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pulumi 3 | import pulumi_aws as aws 4 | 5 | from typing import Tuple 6 | from jobs.ecs_cluster import ecs_task_role, data_ecs_cluster 7 | from jobs.cloudwatch import cloudwatch_ecs_role, cloudwatch_ecs_log_group 8 | from jobs.ecr import data_jobs_image 9 | from jobs.config import ENV 10 | from network.private_subnets import private_subnet 11 | from instance.security_groups import allow_outbound_to_anywhere 12 | 13 | 14 | def create_ecs_task_definition( 15 | name: str, file_name: str, memory: str, cpu: str 16 | ) -> aws.ecs.TaskDefinition: 17 | task_definition = aws.ecs.TaskDefinition( 18 | f"{name}", 19 | container_definitions=pulumi.Output.all( 20 | data_jobs_image.image_uri, cloudwatch_ecs_log_group.name 21 | ).apply( 22 | lambda args: json.dumps( 23 | [ 24 | { 25 | "name": f"{name}", 26 | "image": f"{args[0]}", 27 | "essential": True, 28 | "command": ["python", f"jobs/{file_name}"], 29 | "environment": [ 30 | {"name": "DATAWAREHOUSE_DB", "value": "datawarehouse"}, 31 | ], 32 | # "secrets": [ 33 | # { 34 | # "name": "DATAWAREHOUSE_PASSWORD", 35 | # "valueFrom": ":::", 36 | # }, 37 | # ], 38 | "logConfiguration": { 39 | "logDriver": "awslogs", 40 | "options": { 41 | "awslogs-group": f"{args[1]}", 42 | "awslogs-region": "eu-west-3", 43 | "awslogs-stream-prefix": "ecs", 44 | }, 45 | }, 46 | } 47 | ] 48 | ) 49 | ), 50 | runtime_platform=aws.ecs.TaskDefinitionRuntimePlatformArgs( 51 | cpu_architecture="X86_64" 52 | ), 53 | family=f"{name}", 54 | cpu=cpu, 55 | memory=memory, 56 | network_mode="awsvpc", 57 | requires_compatibilities=["FARGATE"], 58 | execution_role_arn=ecs_task_role.arn, 59 | tags={"Name": f"{ENV}-{name}-task-def", "env": ENV}, 60 | ) 61 | 62 | return task_definition 63 | 64 | 65 | def create_cloudwatch_event_rule( 66 | name: str, schedule: str, task: aws.ecs.TaskDefinition, state: str 67 | ) -> Tuple[aws.cloudwatch.EventRule, aws.cloudwatch.EventTarget]: 68 | cw_event_rule = aws.cloudwatch.EventRule( 69 | f"{name}_cw_rule", 70 | description=f"CloudWatch event rule for job {name}", 71 | name=f"{name}_cw_rule", 72 | role_arn=cloudwatch_ecs_role.arn, 73 | schedule_expression=schedule, 74 | state=state, 75 | tags={"Name": f"{ENV}-{name}-event-rule", "env": ENV}, 76 | ) 77 | 78 | cw_event_target = aws.cloudwatch.EventTarget( 79 | f"{name}_cw_target", 80 | rule=cw_event_rule.id, 81 | arn=data_ecs_cluster.arn, 82 | role_arn=cloudwatch_ecs_role.arn, 83 | target_id=data_ecs_cluster.name, 84 | ecs_target=aws.cloudwatch.EventTargetEcsTargetArgs( 85 | task_definition_arn=task.arn, 86 | launch_type="FARGATE", 87 | network_configuration=aws.cloudwatch.EventTargetEcsTargetNetworkConfigurationArgs( 88 | subnets=[private_subnet.id], 89 | security_groups=[allow_outbound_to_anywhere.id], 90 | assign_public_ip=False, 91 | ), 92 | ), 93 | ) 94 | 95 | return cw_event_rule, cw_event_target 96 | 97 | 98 | def 
create_scheduled_job( 99 | name: str, 100 | file_name: str, 101 | schedule: str, 102 | memory: str = "1024", 103 | cpu: str = "512", 104 | is_enabled: bool = True, 105 | ) -> None: 106 | state = "ENABLED" if is_enabled else "DISABLED" 107 | task = create_ecs_task_definition( 108 | name=name, file_name=file_name, memory=memory, cpu=cpu 109 | ) 110 | create_cloudwatch_event_rule(name=name, schedule=schedule, task=task, state=state) 111 | -------------------------------------------------------------------------------- /stack/pulumi/network/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeremySrgt/mini-modern-data-stack/7004b55122e18714d9cb85c2ae91c41d0f54407a/stack/pulumi/network/__init__.py -------------------------------------------------------------------------------- /stack/pulumi/network/az.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | 3 | available_az = aws.get_availability_zones(state="available") 4 | primary_az = available_az.names[0] 5 | secondary_az = available_az.names[1] -------------------------------------------------------------------------------- /stack/pulumi/network/config.py: -------------------------------------------------------------------------------- 1 | from pulumi import Config 2 | 3 | cfg = Config() 4 | 5 | ENV = cfg.get("env", default="dev") 6 | -------------------------------------------------------------------------------- /stack/pulumi/network/nat_gateway.py: -------------------------------------------------------------------------------- 1 | from pulumi import ResourceOptions 2 | import pulumi_aws as aws 3 | from network.public_subnets import public_subnet 4 | from network.vpc import internet_gateway 5 | from network.config import ENV 6 | 7 | nat_gateway_eip = aws.ec2.Eip( 8 | "nat_gateway_eip", 9 | tags={"Name": f"{ENV}-nat-gateway-eip", "env": ENV}, 10 | ) 11 | 12 | nat_gateway = aws.ec2.NatGateway( 13 | "nat_gateway", 14 | allocation_id=nat_gateway_eip.id, 15 | subnet_id=public_subnet.id, 16 | tags={"Name": f"{ENV}-data-nat-gateway", "env": ENV}, 17 | opts=ResourceOptions(depends_on=[public_subnet, internet_gateway]), 18 | ) 19 | -------------------------------------------------------------------------------- /stack/pulumi/network/private_subnets.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from network.vpc import data_vpc 3 | from network.nat_gateway import nat_gateway 4 | from network.az import primary_az, secondary_az 5 | from network.config import ENV 6 | 7 | private_subnet = aws.ec2.Subnet( 8 | "private_subnet", 9 | vpc_id=data_vpc.id, 10 | availability_zone=primary_az, 11 | cidr_block="10.0.2.0/24", 12 | map_public_ip_on_launch=False, 13 | tags={"Name": f"{ENV}-data-private-subnet", "env": ENV}, 14 | ) 15 | 16 | private_subnet_route_table = aws.ec2.RouteTable( 17 | "private_subnet_route_table", 18 | vpc_id=data_vpc.id, 19 | routes=[ 20 | aws.ec2.RouteTableRouteArgs( 21 | cidr_block="0.0.0.0/0", nat_gateway_id=nat_gateway.id 22 | ) 23 | ], 24 | tags={"Name": f"{ENV}-private-subnet-rt", "env": ENV}, 25 | ) 26 | 27 | private_subnet_rt_association = aws.ec2.RouteTableAssociation( 28 | "private_subnet_rt_association", 29 | subnet_id=private_subnet.id, 30 | route_table_id=private_subnet_route_table.id, 31 | ) 32 | 33 | private_subnet_2 = aws.ec2.Subnet( 34 | "private_subnet_2", 35 | vpc_id=data_vpc.id, 36 | 
availability_zone=secondary_az, 37 | cidr_block="10.0.3.0/24", 38 | map_public_ip_on_launch=False, 39 | tags={"Name": f"{ENV}-data-private-subnet-2", "env": ENV}, 40 | ) 41 | -------------------------------------------------------------------------------- /stack/pulumi/network/public_subnets.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from network.vpc import data_vpc, internet_gateway 3 | from network.config import ENV 4 | from network.az import primary_az 5 | 6 | public_subnet = aws.ec2.Subnet( 7 | "public_subnet", 8 | vpc_id=data_vpc.id, 9 | availability_zone=primary_az, 10 | cidr_block="10.0.1.0/24", 11 | map_public_ip_on_launch=True, 12 | tags={"Name": f"{ENV}-data-public-subnet", "env": ENV}, 13 | ) 14 | 15 | public_subnet_route_table = aws.ec2.RouteTable( 16 | "public_subnet_route_table", 17 | vpc_id=data_vpc.id, 18 | routes=[ 19 | aws.ec2.RouteTableRouteArgs( 20 | cidr_block="0.0.0.0/0", gateway_id=internet_gateway.id 21 | ) 22 | ], 23 | tags={"Name": f"{ENV}-public-subnet-rt", "env": ENV}, 24 | ) 25 | 26 | public_subnet_rt_association = aws.ec2.RouteTableAssociation( 27 | "public_subnet_rt_association", 28 | subnet_id=public_subnet.id, 29 | route_table_id=public_subnet_route_table.id, 30 | ) 31 | -------------------------------------------------------------------------------- /stack/pulumi/network/security_groups.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from instance.security_groups import allow_443_outbound_to_private_subnet_cidr 3 | from network.vpc import data_vpc 4 | from network.config import ENV 5 | 6 | allow_443_inbound_from_private_instance_sg = aws.ec2.SecurityGroup( 7 | "allow_443_inbound_from_private_instance_sg", 8 | description="Allow inbound traffic from instance SG in private subnet", 9 | vpc_id=data_vpc.id, 10 | tags={"Name": f"{ENV}-allow-443-inbound-from-private-instance-sg", "env": ENV}, 11 | ) 12 | 13 | aws.vpc.SecurityGroupIngressRule( 14 | "allow_443_inbound_from_private_instance_sg_ingress_rule", 15 | security_group_id=allow_443_inbound_from_private_instance_sg.id, 16 | ip_protocol="tcp", 17 | referenced_security_group_id=allow_443_outbound_to_private_subnet_cidr.id, 18 | from_port=443, 19 | to_port=443, 20 | ) 21 | -------------------------------------------------------------------------------- /stack/pulumi/network/vpc.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from network.config import ENV 3 | 4 | data_vpc = aws.ec2.Vpc( 5 | "data_vpc", 6 | cidr_block="10.0.0.0/16", 7 | enable_dns_hostnames=True, 8 | enable_dns_support=True, 9 | tags={"Name": f"{ENV}-mini-data-stack", "env": ENV}, 10 | ) 11 | 12 | internet_gateway = aws.ec2.InternetGateway( 13 | "internet_gateway", 14 | vpc_id=data_vpc.id, 15 | tags={"Name": f"{ENV}-internet-gateway", "env": ENV}, 16 | ) -------------------------------------------------------------------------------- /stack/pulumi/network/vpc_endpoints.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | 3 | from typing import List 4 | from network.vpc import data_vpc 5 | from network.private_subnets import private_subnet 6 | from network.security_groups import allow_443_inbound_from_private_instance_sg 7 | from network.config import ENV 8 | 9 | 10 | def ssm_vpc_endpoints( 11 | vpc_id: str, sg_ids: List[str], subnet_ids: List[str] 12 | ) -> 
List[aws.ec2.VpcEndpoint]: 13 | service_names = [ 14 | "com.amazonaws.eu-west-3.ssm", 15 | "com.amazonaws.eu-west-3.ssmmessages", 16 | "com.amazonaws.eu-west-3.ec2messages", 17 | ] 18 | endpoints = [] 19 | 20 | for service in service_names: 21 | resource_name = f"vpc_endpoint_{service.split('.')[-1]}" 22 | endpoints.append( 23 | aws.ec2.VpcEndpoint( 24 | resource_name, 25 | service_name=service, 26 | ip_address_type="ipv4", 27 | private_dns_enabled=True, 28 | vpc_id=vpc_id, 29 | auto_accept=True, 30 | security_group_ids=sg_ids, 31 | subnet_ids=subnet_ids, 32 | vpc_endpoint_type="Interface", 33 | tags={"Name": f"{ENV}-{resource_name}", "env": ENV}, 34 | ) 35 | ) 36 | 37 | return endpoints 38 | 39 | 40 | vpc_endpoints = ssm_vpc_endpoints( 41 | vpc_id=data_vpc.id, 42 | sg_ids=[allow_443_inbound_from_private_instance_sg.id], 43 | subnet_ids=[private_subnet.id], 44 | ) 45 | -------------------------------------------------------------------------------- /stack/pulumi/warehouse/config.py: -------------------------------------------------------------------------------- 1 | from pulumi import Config 2 | 3 | cfg = Config() 4 | 5 | ENV = cfg.get("env", default="dev") 6 | WAREHOUSE_INSTANCE_CLASS = cfg.get("warehouse_instance_class", default="db.t3.micro") 7 | WAREHOUSE_DB_NAME = cfg.get("warehouse_db_name", default="company_data_warehouse") 8 | DATA_WAREHOUSE_MASTER_USER = cfg.require_secret("dwh_master_user") 9 | DATA_WAREHOUSE_MASTER_PASSWORD = cfg.require_secret("dwh_master_password") 10 | -------------------------------------------------------------------------------- /stack/pulumi/warehouse/data_warehouse.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from warehouse.subnet_groups import warehouse_subnet_group 3 | from warehouse.security_groups import allow_5432_inbound_from_private_subnet 4 | from network.az import primary_az 5 | from warehouse.config import ( 6 | ENV, 7 | DATA_WAREHOUSE_MASTER_USER, 8 | DATA_WAREHOUSE_MASTER_PASSWORD, 9 | WAREHOUSE_INSTANCE_CLASS, 10 | WAREHOUSE_DB_NAME, 11 | ) 12 | 13 | data_warehouse = aws.rds.Instance( 14 | "data_warehouse", 15 | instance_class=WAREHOUSE_INSTANCE_CLASS, 16 | allocated_storage=20, 17 | allow_major_version_upgrade=False, 18 | auto_minor_version_upgrade=True, 19 | availability_zone=primary_az, 20 | backup_retention_period=1, 21 | backup_window="23:00-23:30", 22 | db_name=WAREHOUSE_DB_NAME, 23 | db_subnet_group_name=warehouse_subnet_group.name, 24 | engine="postgres", 25 | engine_version="14", 26 | identifier=f"{ENV}-data-warehouse-rds-instance", 27 | maintenance_window="thu:02:00-thu:05:00", 28 | max_allocated_storage=0, 29 | multi_az=False, 30 | publicly_accessible=False, 31 | skip_final_snapshot=True, 32 | storage_encrypted=True, 33 | storage_type="gp2", 34 | username=DATA_WAREHOUSE_MASTER_USER, 35 | password=DATA_WAREHOUSE_MASTER_PASSWORD, 36 | vpc_security_group_ids=[allow_5432_inbound_from_private_subnet.id], 37 | tags={"Name": f"{ENV}-data-warehouse-rds", "env": ENV}, 38 | ) 39 | -------------------------------------------------------------------------------- /stack/pulumi/warehouse/security_groups.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from warehouse.config import ENV 3 | from network.vpc import data_vpc 4 | from network.private_subnets import private_subnet 5 | 6 | allow_5432_inbound_from_private_subnet = aws.ec2.SecurityGroup( 7 | "data_warehouse_sg", 8 | description="Allow inbound 
traffic only if it is coming from instance inside the private subnet", 9 | vpc_id=data_vpc.id, 10 | tags={"Name": f"{ENV}-data-warehouse-sg", "env": ENV}, 11 | ) 12 | 13 | allow_5432_inbound_from_private_subnet_ingress_rule = aws.vpc.SecurityGroupIngressRule( 14 | "data_warehouse_sg_ingress_rule", 15 | security_group_id=allow_5432_inbound_from_private_subnet.id, 16 | from_port=5432, 17 | to_port=5432, 18 | ip_protocol="tcp", 19 | cidr_ipv4=private_subnet.cidr_block, 20 | ) 21 | -------------------------------------------------------------------------------- /stack/pulumi/warehouse/subnet_groups.py: -------------------------------------------------------------------------------- 1 | import pulumi_aws as aws 2 | from network.private_subnets import private_subnet, private_subnet_2 3 | from warehouse.config import ENV 4 | 5 | warehouse_subnet_group = aws.rds.SubnetGroup( 6 | "data_warehouse_subnet_group", 7 | subnet_ids=[private_subnet.id, private_subnet_2.id], 8 | tags={"Name": f"{ENV}-data-warehouse-subnet-group", "env": ENV}, 9 | ) 10 | -------------------------------------------------------------------------------- /stack/requirements.txt: -------------------------------------------------------------------------------- 1 | pulumi>=3.0.0,<4.0.0 2 | pulumi-aws>=6.0.2,<7.0.0 3 | pulumi_awsx==2.5.0 4 | boto3==1.34.36 5 | ansible==9.2.0 6 | python-dotenv==1.0.1 --------------------------------------------------------------------------------