├── .gitignore ├── README.md ├── airflow ├── dags │ └── elt_reddit_pipeline.py ├── docker-compose.yaml └── extraction │ ├── download_redshift_to_csv.py │ ├── extract_reddit_etl.py │ ├── upload_aws_redshift_etl.py │ ├── upload_aws_s3_etl.py │ └── validation.py ├── images ├── GDS-Dashboard.png ├── airflow.png ├── redshift.png └── workflow.png ├── instructions ├── aws.md ├── config.md ├── dbt.md ├── docker_airflow.md ├── improvements.md ├── overview.md ├── reddit.md ├── setup_infrastructure.md ├── terminate.md ├── tools.md └── visualisation.md ├── requirements.txt └── terraform ├── main.tf ├── output.tf └── variable.tf /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | extraction/secrets/ 3 | notes/ 4 | airflow/extraction/*.csv 5 | airflow/dags/__pycache__ 6 | airflow/plugins/* 7 | airflow/logs/* 8 | airflow/extracts/* 9 | airflow/.env 10 | airflow/.DS_Store 11 | airflow/extraction/pipeline_secret.conf 12 | terraform/* 13 | !*.tf 14 | *.conf 15 | .DS_Store 16 | __pycache__ 17 | airflow/redshift_download 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reddit ETL Pipeline 2 | 3 | A data pipeline to extract Reddit data from [r/dataengineering](https://www.reddit.com/r/dataengineering/). 4 | 5 | Output is a Google Data Studio report, providing insight into the Data Engineering official subreddit. 6 | 7 | ## Motivation 8 | 9 | Project was based on an interest in Data Engineering and the types of Q&A found on the official subreddit. 10 | 11 | It also provided a good opportunity to develop skills and experience in a range of tools. As such, project is more complex than required, utilising dbt, airflow, docker and cloud based storage. 12 | 13 | ## Architecture 14 | 15 | 16 | 17 | 1. Extract data using [Reddit API](https://www.reddit.com/dev/api/) 18 | 1. Load into [AWS S3](https://aws.amazon.com/s3/) 19 | 1. Copy into [AWS Redshift](https://aws.amazon.com/redshift/) 20 | 1. Transform using [dbt](https://www.getdbt.com) 21 | 1. Create [PowerBI](https://powerbi.microsoft.com/en-gb/) or [Google Data Studio](https://datastudio.google.com) Dashboard 22 | 1. Orchestrate with [Airflow](https://airflow.apache.org) in [Docker](https://www.docker.com) 23 | 1. Create AWS resources with [Terraform](https://www.terraform.io) 24 | 25 | ## Output 26 | 27 | [](https://datastudio.google.com/reporting/e927fef6-b605-421c-ae29-89a66e11ea18) 28 | 29 | * Final output from Google Data Studio. Link [here](https://datastudio.google.com/reporting/e927fef6-b605-421c-ae29-89a66e11ea18). Note that Dashboard is reading from a static CSV output from Redshift. Redshift database was deleted so as not to incur cost. 30 | 31 | ## Setup 32 | 33 | Follow below steps to setup pipeline. I've tried to explain steps where I can. Feel free to make improvements/changes. 34 | 35 | > **NOTE**: This was developed using an M1 Macbook Pro. If you're on Windows or Linux, you may need to amend certain components if issues are encountered. 36 | 37 | As AWS offer a free tier, this shouldn't cost you anything unless you amend the pipeline to extract large amounts of data, or keep infrastructure running for 2+ months. However, please check [AWS free tier](https://aws.amazon.com/free/?all-free-tier.sort-by=item.additionalFields.SortRank&all-free-tier.sort-order=asc&awsf.Free%20Tier%20Types=*all&awsf.Free%20Tier%20Categories=*all) limits, as this may change. 
38 | 39 | First clone the repository into your home directory and follow the steps. 40 | 41 | ```bash 42 | git clone https://github.com/ABZ-Aaron/Reddit-API-Pipeline.git 43 | cd Reddit-API-Pipeline 44 | ``` 45 | 46 | 1. [Overview](instructions/overview.md) 47 | 1. [Reddit API Configuration](instructions/reddit.md) 48 | 1. [AWS Account](instructions/aws.md) 49 | 1. [Infrastructure with Terraform](instructions/setup_infrastructure.md) 50 | 1. [Configuration Details](instructions/config.md) 51 | 1. [Docker & Airflow](instructions/docker_airflow.md) 52 | 1. [dbt](instructions/dbt.md) 53 | 1. [Dashboard](instructions/visualisation.md) 54 | 1. [Final Notes & Termination](instructions/terminate.md) 55 | 1. [Improvements](instructions/improvements.md) 56 | -------------------------------------------------------------------------------- /airflow/dags/elt_reddit_pipeline.py: -------------------------------------------------------------------------------- 1 | from os import remove 2 | from airflow import DAG 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.utils.dates import days_ago 5 | from datetime import timedelta, datetime 6 | 7 | """ 8 | DAG to extract Reddit data, load into AWS S3, and copy to AWS Redshift 9 | """ 10 | 11 | # Output name of extracted file. This be passed to each 12 | # DAG task so they know which file to process 13 | output_name = datetime.now().strftime("%Y%m%d") 14 | 15 | # Run our DAG daily and ensures DAG run will kick off 16 | # once Airflow is started, as it will try to "catch up" 17 | schedule_interval = "@daily" 18 | start_date = days_ago(1) 19 | 20 | default_args = {"owner": "airflow", "depends_on_past": False, "retries": 1} 21 | 22 | with DAG( 23 | dag_id="elt_reddit_pipeline", 24 | description="Reddit ELT", 25 | schedule_interval=schedule_interval, 26 | default_args=default_args, 27 | start_date=start_date, 28 | catchup=True, 29 | max_active_runs=1, 30 | tags=["RedditETL"], 31 | ) as dag: 32 | 33 | extract_reddit_data = BashOperator( 34 | task_id="extract_reddit_data", 35 | bash_command=f"python /opt/airflow/extraction/extract_reddit_etl.py {output_name}", 36 | dag=dag, 37 | ) 38 | extract_reddit_data.doc_md = "Extract Reddit data and store as CSV" 39 | 40 | upload_to_s3 = BashOperator( 41 | task_id="upload_to_s3", 42 | bash_command=f"python /opt/airflow/extraction/upload_aws_s3_etl.py {output_name}", 43 | dag=dag, 44 | ) 45 | upload_to_s3.doc_md = "Upload Reddit CSV data to S3 bucket" 46 | 47 | copy_to_redshift = BashOperator( 48 | task_id="copy_to_redshift", 49 | bash_command=f"python /opt/airflow/extraction/upload_aws_redshift_etl.py {output_name}", 50 | dag=dag, 51 | ) 52 | copy_to_redshift.doc_md = "Copy S3 CSV file to Redshift table" 53 | 54 | extract_reddit_data >> upload_to_s3 >> copy_to_redshift 55 | -------------------------------------------------------------------------------- /airflow/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.3.2 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 31 | # 32 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 33 | # Default: airflow 34 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 35 | # Default: airflow 36 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 37 | # Default: '' 38 | # 39 | # Feel free to modify this file to suit your needs. 40 | --- 41 | version: '3' 42 | x-airflow-common: 43 | &airflow-common 44 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 45 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 46 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 47 | image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.3.2} 48 | # build: . 
49 | environment: 50 | &airflow-common-env 51 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 52 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 53 | # For backward compatibility, with Airflow <2.3 54 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 55 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 56 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 57 | AIRFLOW__CORE__FERNET_KEY: '' 58 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'false' 59 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 60 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth' 61 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- praw boto3 configparser psycopg2-binary} 62 | volumes: 63 | - ./dags:/opt/airflow/dags 64 | - ./logs:/opt/airflow/logs 65 | - ./plugins:/opt/airflow/plugins 66 | - ./extraction:/opt/airflow/extraction 67 | - $HOME/.aws/credentials:/home/airflow/.aws/credentials:ro 68 | user: "${AIRFLOW_UID:-50000}:0" 69 | depends_on: 70 | &airflow-common-depends-on 71 | redis: 72 | condition: service_healthy 73 | postgres: 74 | condition: service_healthy 75 | 76 | services: 77 | postgres: 78 | image: postgres:13 79 | environment: 80 | POSTGRES_USER: airflow 81 | POSTGRES_PASSWORD: airflow 82 | POSTGRES_DB: airflow 83 | volumes: 84 | - postgres-db-volume:/var/lib/postgresql/data 85 | healthcheck: 86 | test: ["CMD", "pg_isready", "-U", "airflow"] 87 | interval: 5s 88 | retries: 5 89 | restart: always 90 | 91 | redis: 92 | image: redis:latest 93 | expose: 94 | - 6379 95 | healthcheck: 96 | test: ["CMD", "redis-cli", "ping"] 97 | interval: 5s 98 | timeout: 30s 99 | retries: 50 100 | restart: always 101 | 102 | airflow-webserver: 103 | <<: *airflow-common 104 | command: webserver 105 | ports: 106 | - 8080:8080 107 | healthcheck: 108 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 109 | interval: 10s 110 | timeout: 10s 111 | retries: 5 112 | restart: always 113 | depends_on: 114 | <<: *airflow-common-depends-on 115 | airflow-init: 116 | condition: service_completed_successfully 117 | 118 | airflow-scheduler: 119 | <<: *airflow-common 120 | command: scheduler 121 | healthcheck: 122 | test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] 123 | interval: 10s 124 | timeout: 10s 125 | retries: 5 126 | restart: always 127 | depends_on: 128 | <<: *airflow-common-depends-on 129 | airflow-init: 130 | condition: service_completed_successfully 131 | 132 | airflow-worker: 133 | <<: *airflow-common 134 | command: celery worker 135 | healthcheck: 136 | test: 137 | - "CMD-SHELL" 138 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 139 | interval: 10s 140 | timeout: 10s 141 | retries: 5 142 | environment: 143 | <<: *airflow-common-env 144 | # Required to handle warm shutdown of the celery workers properly 145 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 146 | DUMB_INIT_SETSID: "0" 147 | restart: always 148 | depends_on: 149 | <<: *airflow-common-depends-on 150 | airflow-init: 151 | condition: service_completed_successfully 152 | 153 | airflow-triggerer: 154 | <<: *airflow-common 155 | command: triggerer 156 | healthcheck: 157 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 158 | interval: 10s 159 | timeout: 10s 160 | retries: 5 161 | restart: always 162 | depends_on: 163 | <<: *airflow-common-depends-on 
164 | airflow-init: 165 | condition: service_completed_successfully 166 | 167 | airflow-init: 168 | <<: *airflow-common 169 | entrypoint: /bin/bash 170 | # yamllint disable rule:line-length 171 | command: 172 | - -c 173 | - | 174 | function ver() { 175 | printf "%04d%04d%04d%04d" $${1//./ } 176 | } 177 | airflow_version=$$(gosu airflow airflow version) 178 | airflow_version_comparable=$$(ver $${airflow_version}) 179 | min_airflow_version=2.2.0 180 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 181 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 182 | echo 183 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 184 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 185 | echo 186 | exit 1 187 | fi 188 | if [[ -z "${AIRFLOW_UID}" ]]; then 189 | echo 190 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 191 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 192 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 193 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 194 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#setting-the-right-airflow-user" 195 | echo 196 | fi 197 | one_meg=1048576 198 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 199 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 200 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 201 | warning_resources="false" 202 | if (( mem_available < 4000 )) ; then 203 | echo 204 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 205 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 206 | echo 207 | warning_resources="true" 208 | fi 209 | if (( cpus_available < 2 )); then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 212 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 213 | echo 214 | warning_resources="true" 215 | fi 216 | if (( disk_available < one_meg * 10 )); then 217 | echo 218 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 219 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 220 | echo 221 | warning_resources="true" 222 | fi 223 | if [[ $${warning_resources} == "true" ]]; then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 226 | echo "Please follow the instructions to increase amount of resources available:" 227 | echo " https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html#before-you-begin" 228 | echo 229 | fi 230 | mkdir -p /sources/logs /sources/dags /sources/plugins 231 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 232 | exec /entrypoint airflow version 233 | # yamllint enable rule:line-length 234 | environment: 235 | <<: *airflow-common-env 236 | _AIRFLOW_DB_UPGRADE: 'true' 237 | _AIRFLOW_WWW_USER_CREATE: 'true' 238 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 239 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 240 | _PIP_ADDITIONAL_REQUIREMENTS: '' 241 | user: "0:0" 242 | volumes: 243 | - .:/sources 244 | 245 | airflow-cli: 246 | <<: *airflow-common 247 | profiles: 248 | - debug 249 | environment: 250 | <<: *airflow-common-env 251 | CONNECTION_CHECK_MAX_COUNT: "0" 252 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 253 | command: 254 | - bash 255 | - -c 256 | - airflow 257 | 258 | # You can enable flower by adding "--profile flower" option e.g. docker-compose --profile flower up 259 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 260 | # See: https://docs.docker.com/compose/profiles/ 261 | flower: 262 | <<: *airflow-common 263 | command: celery flower 264 | profiles: 265 | - flower 266 | ports: 267 | - 5555:5555 268 | healthcheck: 269 | test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 270 | interval: 10s 271 | timeout: 10s 272 | retries: 5 273 | restart: always 274 | depends_on: 275 | <<: *airflow-common-depends-on 276 | airflow-init: 277 | condition: service_completed_successfully 278 | 279 | volumes: 280 | postgres-db-volume: 281 | -------------------------------------------------------------------------------- /airflow/extraction/download_redshift_to_csv.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import pathlib 3 | import psycopg2 4 | from psycopg2 import sql 5 | import csv 6 | import sys 7 | 8 | """ 9 | Download Redshift table to CSV file. Will be stored under /tmp folder. 10 | """ 11 | 12 | # Parse configuration file 13 | script_path = pathlib.Path(__file__).parent.resolve() 14 | parser = configparser.ConfigParser() 15 | parser.read(f"{script_path}/configuration.conf") 16 | 17 | # Store configuration variables 18 | USERNAME = parser.get("aws_config", "redshift_username") 19 | PASSWORD = parser.get("aws_config", "redshift_password") 20 | HOST = parser.get("aws_config", "redshift_hostname") 21 | PORT = parser.get("aws_config", "redshift_port") 22 | DATABASE = parser.get("aws_config", "redshift_database") 23 | TABLE_NAME = "reddit" 24 | 25 | # TODO Improve error handling 26 | def connect_to_redshift(): 27 | """Connect to Redshift instance""" 28 | try: 29 | rs_conn = psycopg2.connect( 30 | dbname=DATABASE, user=USERNAME, password=PASSWORD, host=HOST, port=PORT 31 | ) 32 | return rs_conn 33 | except Exception as e: 34 | print(f"Unable to connect to Redshift. 
Error {e}") 35 | sys.exit(1) 36 | 37 | # TODO Error handling 38 | def download_redshift_data(rs_conn): 39 | """Download data from Redshift table to CSV""" 40 | with rs_conn: 41 | cur = rs_conn.cursor() 42 | cur.execute( 43 | sql.SQL("SELECT * FROM {table};").format(table=sql.Identifier(TABLE_NAME)) 44 | ) 45 | result = cur.fetchall() 46 | headers = [col[0] for col in cur.description] 47 | result.insert(0, tuple(headers)) 48 | fp = open("/tmp/redshift_output.csv", "w") 49 | myFile = csv.writer(fp) 50 | myFile.writerows(result) 51 | fp.close() 52 | 53 | 54 | if __name__ == "__main__": 55 | rs_conn = connect_to_redshift() 56 | download_redshift_data(rs_conn) 57 | -------------------------------------------------------------------------------- /airflow/extraction/extract_reddit_etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import datetime 3 | import pandas as pd 4 | import pathlib 5 | import praw 6 | import sys 7 | import numpy as np 8 | from validation import validate_input 9 | 10 | """ 11 | Part of Airflow DAG. Takes in one command line argument of format YYYYMMDD. 12 | Script will connect to Reddit API and extract top posts from past day 13 | with no limit. For a small subreddit like Data Engineering, this should extract all posts 14 | from the past 24 hours. 15 | """ 16 | 17 | # Read Configuration File 18 | parser = configparser.ConfigParser() 19 | script_path = pathlib.Path(__file__).parent.resolve() 20 | config_file = "configuration.conf" 21 | parser.read(f"{script_path}/{config_file}") 22 | 23 | # Configuration Variables 24 | SECRET = parser.get("reddit_config", "secret") 25 | CLIENT_ID = parser.get("reddit_config", "client_id") 26 | 27 | # Options for extracting data from PRAW 28 | SUBREDDIT = "dataengineering" 29 | TIME_FILTER = "day" 30 | LIMIT = None 31 | 32 | # Fields that will be extracted from Reddit. 33 | # Check PRAW documentation for additional fields. 34 | # NOTE: if you change these, you'll need to update the create table 35 | # sql query in the upload_aws_redshift.py file 36 | POST_FIELDS = ( 37 | "id", 38 | "title", 39 | "score", 40 | "num_comments", 41 | "author", 42 | "created_utc", 43 | "url", 44 | "upvote_ratio", 45 | "over_18", 46 | "edited", 47 | "spoiler", 48 | "stickied", 49 | ) 50 | 51 | # TODO Improve error handling 52 | # Use command line argument as output file 53 | # name and also store as column value 54 | try: 55 | output_name = sys.argv[1] 56 | except Exception as e: 57 | print(f"Error with file input. Error {e}") 58 | sys.exit(1) 59 | date_dag_run = datetime.datetime.strptime(output_name, "%Y%m%d") 60 | 61 | 62 | def main(): 63 | """Extract Reddit data and load to CSV""" 64 | validate_input(output_name) 65 | reddit_instance = api_connect() 66 | subreddit_posts_object = subreddit_posts(reddit_instance) 67 | extracted_data = extract_data(subreddit_posts_object) 68 | transformed_data = transform_basic(extracted_data) 69 | load_to_csv(transformed_data) 70 | 71 | 72 | # TODO: Improve error handling 73 | def api_connect(): 74 | """Connect to Reddit API""" 75 | try: 76 | instance = praw.Reddit( 77 | client_id=CLIENT_ID, client_secret=SECRET, user_agent="My User Agent" 78 | ) 79 | return instance 80 | except Exception as e: 81 | print(f"Unable to connect to API. 
Error: {e}") 82 | sys.exit(1) 83 | 84 | 85 | # TODO: Improve error handling 86 | def subreddit_posts(reddit_instance): 87 | """Create posts object for Reddit instance""" 88 | try: 89 | subreddit = reddit_instance.subreddit(SUBREDDIT) 90 | posts = subreddit.top(time_filter=TIME_FILTER, limit=LIMIT) 91 | return posts 92 | except Exception as e: 93 | print(f"There's been an issue. Error: {e}") 94 | sys.exit(1) 95 | 96 | 97 | # TODO: Improve error handling 98 | def extract_data(posts): 99 | """Extract Data to Pandas DataFrame object""" 100 | list_of_items = [] 101 | try: 102 | for submission in posts: 103 | to_dict = vars(submission) 104 | sub_dict = {field: to_dict[field] for field in POST_FIELDS} 105 | list_of_items.append(sub_dict) 106 | extracted_data_df = pd.DataFrame(list_of_items) 107 | except Exception as e: 108 | print(f"There has been an issue. Error {e}") 109 | sys.exit(1) 110 | 111 | return extracted_data_df 112 | 113 | # TODO: Remove all but the edited line, as not necessary. For edited line, rather 114 | # than force as boolean, keep date-time of last edit and set all else to None. 115 | def transform_basic(df): 116 | """Some basic transformation of data. To be refactored at a later point.""" 117 | 118 | # Convert epoch to UTC 119 | df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s") 120 | # Fields don't appear to return as booleans (e.g. False or Epoch time). Needs further investigation but forcing as False or True for now. 121 | df["over_18"] = np.where( 122 | (df["over_18"] == "False") | (df["over_18"] == False), False, True 123 | ).astype(bool) 124 | df["edited"] = np.where( 125 | (df["edited"] == "False") | (df["edited"] == False), False, True 126 | ).astype(bool) 127 | df["spoiler"] = np.where( 128 | (df["spoiler"] == "False") | (df["spoiler"] == False), False, True 129 | ).astype(bool) 130 | df["stickied"] = np.where( 131 | (df["stickied"] == "False") | (df["stickied"] == False), False, True 132 | ).astype(bool) 133 | return df 134 | 135 | 136 | def load_to_csv(extracted_data_df): 137 | """Save extracted data to CSV file in /tmp folder""" 138 | extracted_data_df.to_csv(f"/tmp/{output_name}.csv", index=False) 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /airflow/extraction/upload_aws_redshift_etl.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import pathlib 3 | import psycopg2 4 | import sys 5 | from validation import validate_input 6 | from psycopg2 import sql 7 | 8 | """ 9 | Part of DAG. Upload S3 CSV data to Redshift. Takes one argument of format YYYYMMDD. This is the name of 10 | the file to copy from S3. Script will load data into temporary table in Redshift, delete 11 | records with the same post ID from main table, then insert these from temp table (along with new data) 12 | to main table. This means that if we somehow pick up duplicate records in a new DAG run, 13 | the record in Redshift will be updated to reflect any changes in that record, if any (e.g. higher score or more comments). 
14 | """ 15 | 16 | # Parse our configuration file 17 | script_path = pathlib.Path(__file__).parent.resolve() 18 | parser = configparser.ConfigParser() 19 | parser.read(f"{script_path}/configuration.conf") 20 | 21 | # Store our configuration variables 22 | USERNAME = parser.get("aws_config", "redshift_username") 23 | PASSWORD = parser.get("aws_config", "redshift_password") 24 | HOST = parser.get("aws_config", "redshift_hostname") 25 | PORT = parser.get("aws_config", "redshift_port") 26 | REDSHIFT_ROLE = parser.get("aws_config", "redshift_role") 27 | DATABASE = parser.get("aws_config", "redshift_database") 28 | BUCKET_NAME = parser.get("aws_config", "bucket_name") 29 | ACCOUNT_ID = parser.get("aws_config", "account_id") 30 | TABLE_NAME = "reddit" 31 | 32 | # TODO Improve error handling 33 | # Check command line argument passed 34 | try: 35 | output_name = sys.argv[1] 36 | except Exception as e: 37 | print(f"Command line argument not passed. Error {e}") 38 | sys.exit(1) 39 | 40 | # Our S3 file & role_string 41 | file_path = f"s3://{BUCKET_NAME}/{output_name}.csv" 42 | role_string = f"arn:aws:iam::{ACCOUNT_ID}:role/{REDSHIFT_ROLE}" 43 | 44 | # Create Redshift table if it doesn't exist 45 | sql_create_table = sql.SQL( 46 | """CREATE TABLE IF NOT EXISTS {table} ( 47 | id varchar PRIMARY KEY, 48 | title varchar(max), 49 | num_comments int, 50 | score int, 51 | author varchar(max), 52 | created_utc timestamp, 53 | url varchar(max), 54 | upvote_ratio float, 55 | over_18 bool, 56 | edited bool, 57 | spoiler bool, 58 | stickied bool 59 | );""" 60 | ).format(table=sql.Identifier(TABLE_NAME)) 61 | 62 | # TODO Improve process. Creating a temp table may be unnecessary 63 | # If ID already exists in table, we remove it and add new ID record during load. 64 | create_temp_table = sql.SQL( 65 | "CREATE TEMP TABLE our_staging_table (LIKE {table});" 66 | ).format(table=sql.Identifier(TABLE_NAME)) 67 | sql_copy_to_temp = f"COPY our_staging_table FROM '{file_path}' iam_role '{role_string}' IGNOREHEADER 1 DELIMITER ',' CSV;" 68 | delete_from_table = sql.SQL( 69 | "DELETE FROM {table} USING our_staging_table WHERE {table}.id = our_staging_table.id;" 70 | ).format(table=sql.Identifier(TABLE_NAME)) 71 | insert_into_table = sql.SQL( 72 | "INSERT INTO {table} SELECT * FROM our_staging_table;" 73 | ).format(table=sql.Identifier(TABLE_NAME)) 74 | drop_temp_table = "DROP TABLE our_staging_table;" 75 | 76 | 77 | def main(): 78 | """Upload file form S3 to Redshift Table""" 79 | validate_input(output_name) 80 | rs_conn = connect_to_redshift() 81 | load_data_into_redshift(rs_conn) 82 | 83 | # TODO Improve error handling 84 | def connect_to_redshift(): 85 | """Connect to Redshift instance""" 86 | try: 87 | rs_conn = psycopg2.connect( 88 | dbname=DATABASE, user=USERNAME, password=PASSWORD, host=HOST, port=PORT 89 | ) 90 | return rs_conn 91 | except Exception as e: 92 | print(f"Unable to connect to Redshift. 
Error {e}") 93 | sys.exit(1) 94 | 95 | 96 | def load_data_into_redshift(rs_conn): 97 | """Load data from S3 into Redshift""" 98 | with rs_conn: 99 | 100 | cur = rs_conn.cursor() 101 | cur.execute(sql_create_table) 102 | cur.execute(create_temp_table) 103 | cur.execute(sql_copy_to_temp) 104 | cur.execute(delete_from_table) 105 | cur.execute(insert_into_table) 106 | cur.execute(drop_temp_table) 107 | 108 | # Commit only at the end, so we won't end up 109 | # with a temp table and deleted main table if something fails 110 | rs_conn.commit() 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /airflow/extraction/upload_aws_s3_etl.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import botocore 3 | import configparser 4 | import pathlib 5 | import sys 6 | from validation import validate_input 7 | 8 | """ 9 | Part of DAG. Take Reddit data and upload to S3 bucket. Takes one command line argument of format YYYYMMDD. 10 | This represents the file downloaded from Reddit, which will be in the /tmp folder. 11 | """ 12 | 13 | # Load AWS credentials 14 | parser = configparser.ConfigParser() 15 | script_path = pathlib.Path(__file__).parent.resolve() 16 | parser.read(f"{script_path}/configuration.conf") 17 | BUCKET_NAME = parser.get("aws_config", "bucket_name") 18 | AWS_REGION = parser.get("aws_config", "aws_region") 19 | 20 | # TODO Improve error handling 21 | try: 22 | output_name = sys.argv[1] 23 | except Exception as e: 24 | print(f"Command line argument not passed. Error {e}") 25 | sys.exit(1) 26 | 27 | # Name for our S3 file 28 | FILENAME = f"{output_name}.csv" 29 | KEY = FILENAME 30 | 31 | 32 | def main(): 33 | """Upload input file to S3 bucket""" 34 | validate_input(output_name) 35 | conn = connect_to_s3() 36 | create_bucket_if_not_exists(conn) 37 | upload_file_to_s3(conn) 38 | 39 | # TODO Improve error handling 40 | def connect_to_s3(): 41 | """Connect to S3 Instance""" 42 | try: 43 | conn = boto3.resource("s3") 44 | return conn 45 | except Exception as e: 46 | print(f"Can't connect to S3. 
Error: {e}") 47 | sys.exit(1) 48 | 49 | # TODO Improve error handling 50 | def create_bucket_if_not_exists(conn): 51 | """Check if bucket exists and create if not""" 52 | exists = True 53 | try: 54 | conn.meta.client.head_bucket(Bucket=BUCKET_NAME) 55 | except botocore.exceptions.ClientError as e: 56 | error_code = e.response["Error"]["Code"] 57 | if error_code == "404": 58 | exists = False 59 | if not exists: 60 | conn.create_bucket( 61 | Bucket=BUCKET_NAME, 62 | CreateBucketConfiguration={"LocationConstraint": AWS_REGION}, 63 | ) 64 | 65 | 66 | def upload_file_to_s3(conn): 67 | """Upload file to S3 Bucket""" 68 | conn.meta.client.upload_file( 69 | Filename="/tmp/" + FILENAME, Bucket=BUCKET_NAME, Key=KEY 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /airflow/extraction/validation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import datetime 3 | 4 | def validate_input(date_input): 5 | """Validate that input is of correct format""" 6 | try: 7 | datetime.datetime.strptime(date_input, '%Y%m%d') 8 | except ValueError: 9 | raise ValueError("Input parameter should be YYYYMMDD") 10 | sys.exit(1) -------------------------------------------------------------------------------- /images/GDS-Dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ABZ-Aaron/reddit-api-pipeline/16631edca3ff905111ed24c34ca888cf3ec62e04/images/GDS-Dashboard.png -------------------------------------------------------------------------------- /images/airflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ABZ-Aaron/reddit-api-pipeline/16631edca3ff905111ed24c34ca888cf3ec62e04/images/airflow.png -------------------------------------------------------------------------------- /images/redshift.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ABZ-Aaron/reddit-api-pipeline/16631edca3ff905111ed24c34ca888cf3ec62e04/images/redshift.png -------------------------------------------------------------------------------- /images/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ABZ-Aaron/reddit-api-pipeline/16631edca3ff905111ed24c34ca888cf3ec62e04/images/workflow.png -------------------------------------------------------------------------------- /instructions/aws.md: -------------------------------------------------------------------------------- 1 | # AWS 2 | 3 | We'll be using the cloud to store Reddit data; specifically, Amazon Web Service (AWS). This offers a free tier. 4 | 5 | We're going to be using 2 services: 6 | 7 | * [Simple Storage Service (S3)](https://aws.amazon.com/s3/) ~ This is Object Storage. When we extract data from Reddit, we'll store it in a CSV and push to an S3 Bucket as an object (think of a Bucket as a bit like a folder and an object as a file). This allows us to store all our raw data in the cloud. 8 | 9 | * [Redshift](https://aws.amazon.com/redshift/) ~ This is a Data Warehousing service. Utilising its Massively Parallel Processing (MPP) technology, Redshift is able to execute operations on large datasets at fast speeds. It's based on PostgreSQL, so we can use SQL to run operations here. 10 | 11 | In our case, we'd be fine to use a local database like PostgreSQL. 
However, it's good practice to work with cloud tools like this. 12 | 13 | To get started with AWS, follow the below steps: 14 | 15 | ## Setup 16 | 17 | 1. Setup a personal [AWS account](https://portal.aws.amazon.com/billing/signup?nc2=h_ct&src=header_signup&redirect_url=https%3A%2F%2Faws.amazon.com%2Fregistration-confirmation#/start). Follow instructions [here](https://aws.amazon.com/getting-started/guides/setup-environment/module-one/) and setup with free tier. 18 | 19 | 2. Secure your account following these [steps](https://aws.amazon.com/getting-started/guides/setup-environment/module-two/). 20 | 21 | Here we are setting up MFA for the root user. The root is a special account that has access to everything. Therefore it's important we secure this. Also be sure to setup an IAM user which will have its own set of permissions, in this case, admin permissions. Generally in production, you should only use the root account for tasks that can only be done with the root account. 22 | 23 | 3. Setup CLI following this [guide](https://aws.amazon.com/getting-started/guides/setup-environment/module-three/). 24 | 25 | This allows us to control AWS services from the command line interface. The goal by the end of this is you should have a folder in your home directory called `.aws` which contains a `credentials` file. It will look something like this: 26 | 27 | ```config 28 | [default] 29 | aws_access_key_id = XXXX 30 | aws_secret_access_key = XXXX 31 | ``` 32 | 33 | This will allow our scripts to interact with AWS without having to include our access key and secret access key within the scripts. 34 | 35 | --- 36 | 37 | [Previous Step](reddit.md) | [Next Step](setup_infrastructure.md) 38 | 39 | or 40 | 41 | [Back to main README](../README.md) 42 | -------------------------------------------------------------------------------- /instructions/config.md: -------------------------------------------------------------------------------- 1 | 2 | # Configuration 3 | 4 | Next, you'll need to create a configuration file with your details. The extract and load scripts in our pipeline will utilise the details here. 5 | 6 | ## Setup 7 | 8 | 1. Create a configuration file under `~/Reddit-API-Pipeline/airflow/extraction/` called `configuration.conf`: 9 | 10 | ```bash 11 | touch ~/Reddit-API-Pipeline/airflow/extraction/configuration.conf 12 | ``` 13 | 14 | 1. Copy in the following: 15 | 16 | ```conf 17 | [aws_config] 18 | bucket_name = XXXXX 19 | redshift_username = awsuser 20 | redshift_password = XXXXX 21 | redshift_hostname = XXXXX 22 | redshift_role = RedShiftLoadRole 23 | redshift_port = 5439 24 | redshift_database = dev 25 | account_id = XXXXX 26 | aws_region = XXXXX 27 | 28 | [reddit_config] 29 | secret = XXXXX 30 | developer = XXXXX 31 | name = XXXXX 32 | client_id = XXXXX 33 | ``` 34 | 35 | 36 | 1. Change `XXXXX` values 37 | 38 | * If you need a reminder of your `aws_config` details, change folder back into the terraform folder and run the command. It will output the values you need to store under `aws_config`. Just be sure to remove any `"` from the strings. 39 | 40 | ```bash 41 | terraform output 42 | ``` 43 | 44 | * For `reddit_config` these are the details you took note of after setting up your Reddit App. Note the `developer` is your Reddit name. 
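For reference, the extraction and load scripts under `airflow/extraction/` read this file with Python's `configparser`, resolving the path relative to the script itself. A minimal sketch of that pattern (variable names here are illustrative):

```python
import configparser
import pathlib

# configuration.conf sits alongside the extraction scripts
script_path = pathlib.Path(__file__).parent.resolve()
parser = configparser.ConfigParser()
parser.read(f"{script_path}/configuration.conf")

# Values are read per section, e.g. aws_config and reddit_config
BUCKET_NAME = parser.get("aws_config", "bucket_name")
CLIENT_ID = parser.get("reddit_config", "client_id")
SECRET = parser.get("reddit_config", "secret")
```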
45 | 46 | --- 47 | 48 | [Previous Step](setup_infrastructure.md) | [Next Step](docker_airflow.md) 49 | 50 | or 51 | 52 | [Back to main README](../README.md) 53 | -------------------------------------------------------------------------------- /instructions/dbt.md: -------------------------------------------------------------------------------- 1 | # DBT 2 | 3 | dbt (data build tool) is a transformation tools that sits on top of our data warehouse. 4 | 5 | We don't actually require any real transformation on our data; however, like before, consider this good practice. There is a bit of setup required here, so feel free to skip this if you just want to see your Redshift data in Google Data Studio. 6 | 7 | In production, this could be used to create multiple different tables with different columns. Data scientists might be given access to one table, and data analysts the other, as an example. dbt would be able to run tests when creating these new tables, produce documentation for analysts to examine, help manage dependencies between models, and so forth. 8 | 9 | If you continue, I'd recommend taking a quick look at some dbt [tutorials](https://docs.getdbt.com/docs/dbt-cloud/cloud-quickstart). I'll only go through some basic steps to setup a transformation here. 10 | 11 | For reference, here's a [link](https://github.com/ABZ-Aaron/Reddit-API-Pipeline-DBT) to the separate repo I set up for dbt. 12 | 13 | ## Setup (development) 14 | 15 | 1. Create a dbt account [here](https://www.getdbt.com/signup/). 16 | 17 | 1. Create a project or just stick to the default project created. 18 | 19 | 1. Setup Up a `Database Connection` - Select Redshift 20 | 21 | 1. On the next page, enter the relevant Redshift details. This includes the `hostname`. You'll find this in the AWS Redshift console, or from the `terraform output` command mentioned earlier. It will start with the name of your cluster and end with `amazonaws.com`. It will also require the port (likely `5439`), the database name (likely `dev`). You'll also need the database username (likely `awsuser`) and password for the database you specified in the `Terraform` step. Once everything is input, click the `Test` button at the top. If successful, click `Continue`. 22 | 23 | 1. Once connection is established, choose `managed directory` and give it a name. You can also choose Github if you have a Github repo setup for the dbt part of this project, like I have, although this will require a bit of configuration. 24 | 25 | 1. Once you've worked through these initial steps, click on `Start Developing`. 26 | 27 | You are now in an IDE which is connected to your Redshift cluster. Here we'll run some basic transformations on our data. 28 | 29 | 1. Click on `initialize project`. This will populate the directory on the left hand side with folder and files we may need. 30 | 31 | 1. Under the `models` folder, create new files called `reddit_transformed.sql` (this name will be the name of the new model you create) and `schema.yml`. You can delete the `example` folder. 32 | 33 | 1. In the `schema.yml` file, copy the following and save. Here we are defining some basic tests and documentation for our table. I haven't added much, as it's mostly for demonstration purposes. You can see that I've added a `not_null` test for `id`. For the rest, I've only added in the name of the column and a description. 
34 | 35 | ```yaml 36 | version: 2 37 | 38 | models: 39 | - name: reddit_transformed 40 | description: Transformed Reddit Data 41 | columns: 42 | - name: id 43 | description: Reddit ID of Post 44 | tests: 45 | - not_null 46 | - name: title 47 | description: Title of Reddit Post 48 | - name: text 49 | description: Body Text of Reddit Post 50 | - name: score 51 | description: Score of Reddit Post 52 | - name: comments 53 | description: Number of Comments for Post 54 | - name: url 55 | description: Full URL of Reddit Post 56 | - name: comment 57 | description: Top comment for Reddit Post 58 | - name: dateposted 59 | description: Date Reddit Data was Downloaded 60 | ``` 61 | 1. In the `text_posts.sql` file, copy the following and save. This wil be our only transformation. This is basically removing some columns that don't really need, and also splitting the UTC datetime column into `utc_date` and `utc_time`. Feel free to transform the data in whichever way you want. This is just a VERY basic example. 62 | 63 | ```SQL 64 | SELECT id, 65 | title, 66 | num_comments, 67 | score, 68 | author, 69 | created_utc, 70 | url, 71 | upvote_ratio, 72 | created_utc::date as utc_date, 73 | created_utc::time as utc_time 74 | FROM dev.public.reddit 75 | ``` 76 | 77 | 1. If you check the bottom right of the screen, you'll see a preview button. Click this to see what your outputted table will look like based on the above SQL query. Basically, when we run `dbt run` in the UI (further down this page) what'll happen is this table will be created in a new schema within our Redshift database. 78 | 79 | 1. Under the `dbt_project.yml`, update it to the following. All we've really changed here is the project name to `reddit_project` and told dbt to create all models as tables (rather than views). You can leave it as views if you wish. 80 | 81 | ```yaml 82 | # Name your project! Project names should contain only lowercase characters 83 | # and underscores. A good package name should reflect your organization's 84 | # name or the intended use of these models 85 | name: 'reddit_project' 86 | version: '1.0.0' 87 | config-version: 2 88 | 89 | # This setting configures which "profile" dbt uses for this project. 90 | profile: 'default' 91 | 92 | # These configurations specify where dbt should look for different types of files. 93 | # The `source-paths` config, for example, states that models in this project can be 94 | # found in the "models/" directory. You probably won't need to change these! 95 | model-paths: ["models"] 96 | analysis-paths: ["analyses"] 97 | test-paths: ["tests"] 98 | seed-paths: ["seeds"] 99 | macro-paths: ["macros"] 100 | snapshot-paths: ["snapshots"] 101 | 102 | target-path: "target" # directory which will store compiled SQL files 103 | clean-targets: # directories to be removed by `dbt clean` 104 | - "target" 105 | - "dbt_packages" 106 | 107 | 108 | # Configuring models 109 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 110 | 111 | # In this example config, we tell dbt to build all models in the example/ directory 112 | # as tables. These settings can be overridden in the individual model files 113 | # using the `{{ config(...) }}` macro. 114 | models: 115 | reddit_project: 116 | materialized: table 117 | ``` 118 | 119 | 1. To test what we've done, we can run the following commands at the bottom of the DBT IDE and make sure an error isn't returned: 120 | 121 | ```bash 122 | dbt run 123 | ``` 124 | 125 | ```bash 126 | dbt test 127 | ``` 128 | 129 | 1. 
`dbt run` will generate our table within a different schema in Redshift. The easiest way to see this is to navigate to the AWS Console, log in, search for Redshift, and use the Query Editor V2. `dbt test` will check that the new model passes the tests (or test, in our case) specified in the `schema.yml` file. 130 | 131 | 1. The next step is to click `commit` on the left hand menu to commit our changes. 132 | 133 | 134 | If you ran `dbt run` above and no error was returned, a new table will have been created in our Redshift database, under a new schema name. You would have specified this schema name during the initial setup of dbt during the Redshift connection phase. If you left it as the default, it'll likely be something prefixed with `dbt_`. 135 | 136 | To check this, navigate to your Redshift cluster in AWS, and click on Query Data on the top right (orange button). Here, you want to navigate to the `dev` database, select the relevant schema, and check that the new table is there. You can query it with: 137 | 138 | ```sql 139 | SELECT * FROM <your_schema>.reddit_transformed; 140 | ``` 141 | 142 | 143 | 144 | ## Setup (production) 145 | 146 | When you're working in your dbt development environment, this is where the models are created. 147 | 148 | However, consider this schema to be our development environment. We now want to set up a production run, as we wouldn't want analysts accessing our models from within our development area. 149 | 150 | 1. To do this, navigate to the left hand side menu, select `Environments`, then click `New Environments`. 151 | 152 | 1. The `Type` option should be set to `Deployment`. Change the `Name` to something like `Production Run`. 153 | 1. Under `Deployment Credentials`, enter your database username and password again. Also set a schema name, something like `Analytics`, and Save. 154 | 1. Click on `New Job`. 155 | 1. Give your job a name. Set the environment as the `Production Run` you just created. 156 | 1. Select the `Generate Docs` radio button. 157 | 1. Under `Commands`, ensure that `dbt run` and `dbt test` are both there. 158 | 1. Under `Triggers`, normally you'd have this on a schedule, but for our purposes, just de-select so that it does not run on a schedule. We'll just run it manually for now. 159 | 1. Scroll to the top and save. Once saved, click `Run Now`. After a minute or two, you can then check the Redshift cluster, where you should find a new schema folder with our production table/model! 160 | 161 | --- 162 | 163 | [Previous Step](docker_airflow.md) | [Next Step](visualisation.md) 164 | 165 | or 166 | 167 | [Back to main README](../README.md) 168 | -------------------------------------------------------------------------------- /instructions/docker_airflow.md: -------------------------------------------------------------------------------- 1 | # Docker & Airflow 2 | 3 | We're going to run our pipeline daily, for demonstration purposes, although this could be changed at a later point. Each day, we'll extract the top Reddit posts from `r/DataEngineering`. Because `LIMIT` is set to `None` in the Reddit extraction script, it should in theory return all posts from the past 24 hours. Feel free to play around with this. 4 | 5 | ## Airflow 6 | 7 | To orchestrate this, we'll use Apache Airflow, which allows us to define [DAGs](https://en.wikipedia.org/wiki/Directed_acyclic_graph). Although Airflow is overkill in our case, consider it good practice. It will allow us to automate the extraction and loading steps of our pipeline; a trimmed sketch of the project's DAG is shown below for reference.
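Below is a trimmed sketch of this project's DAG in `airflow/dags/elt_reddit_pipeline.py` (the full file, including `default_args` and retries, lives in the repo); it simply chains three `BashOperator` tasks that call the extraction scripts:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago

# Date string (YYYYMMDD) passed to each task so they all work on the same file
output_name = datetime.now().strftime("%Y%m%d")

with DAG(
    dag_id="elt_reddit_pipeline",
    schedule_interval="@daily",
    start_date=days_ago(1),
    catchup=True,
    max_active_runs=1,
) as dag:
    extract = BashOperator(
        task_id="extract_reddit_data",
        bash_command=f"python /opt/airflow/extraction/extract_reddit_etl.py {output_name}",
    )
    upload = BashOperator(
        task_id="upload_to_s3",
        bash_command=f"python /opt/airflow/extraction/upload_aws_s3_etl.py {output_name}",
    )
    copy = BashOperator(
        task_id="copy_to_redshift",
        bash_command=f"python /opt/airflow/extraction/upload_aws_redshift_etl.py {output_name}",
    )

    # Extract from Reddit, then upload to S3, then copy into Redshift
    extract >> upload >> copy
```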
8 | 9 | Tutorial [here](https://airflow.apache.org/docs/apache-airflow/stable/tutorial.html) 10 | 11 | ## Docker 12 | 13 | Another tool we'll use is Docker. This allows us to create and maintain 'containers'. Think of a container as a bit like a special kind of virtual machine which, in our case, includes everything we need to run Airflow, bypassing the need to install a load of dependencies. 14 | 15 | Tutorial [here](https://www.youtube.com/watch?v=3c-iBn73dDE) 16 | 17 | ## Airflow in Docker Info 18 | 19 | For this project, the `docker-compose.yaml` file comes from the Airflow in Docker quick-start guide [here](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html). This defines all the services we need for Airflow, e.g., scheduler, web server, and so forth. 20 | 21 | > **NOTE:** This quickstart shouldn't be used in production environments. 22 | 23 | When we run this docker-compose file further down, it will start our containers/services. I've only changed a few things in this file: 24 | 25 | * These two extra lines added under `volumes` will mount these folders on our local file system to the docker containers. You can see other volumes are defined, one being to mount the `./dags` folder (this is where we store the DAGs Airflow should run). The first line below mounts our `extraction` folder to `/opt/airflow`, which contains the scripts our Airflow DAG will run. The second line mounts our AWS credentials into the docker containers as read only. 26 | 27 | ```yaml 28 | - ./extraction:/opt/airflow/extraction 29 | - $HOME/.aws/credentials:/home/airflow/.aws/credentials:ro 30 | ``` 31 | 32 | * This line pip installs the specified packages within the containers. Note that there are other ways we could have done this. 33 | 34 | ```yaml 35 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:- praw boto3 configparser psycopg2-binary} 36 | ``` 37 | 38 | ### Installing Docker 39 | 40 | 1. First install Docker. Instructions [here](https://docs.docker.com/get-docker/). 41 | 42 | 1. Next install Docker Compose. Instructions [here](https://docs.docker.com/compose/install/). 43 | 44 | ### Running Airflow 45 | 46 | To start our pipeline, we need to kick off Airflow, which requires a couple more prerequisite steps. 47 | 48 | 1. If using Windows, you may need to make a small update to the below line in the `docker-compose.yaml` file. Here we are mounting our AWS credentials file onto a docker container. 49 | 50 | ```yaml 51 | - $HOME/.aws/credentials:/home/airflow/.aws/credentials:ro 52 | ``` 53 | 54 | 1. Increase CPU and Memory in Docker Desktop resource settings to whatever you think your PC can handle. 55 | 56 | 1. Run the following. You may be able to skip this step if you're not on Linux. See [here](https://airflow.apache.org/docs/apache-airflow/stable/start/docker.html) for more details. 57 | 58 | ```bash 59 | cd ~/Reddit-API-Pipeline/airflow 60 | 61 | # Create folders required by airflow. 62 | # dags folder has already been created, and 63 | # contains the dag script utilised by Airflow 64 | mkdir -p ./logs ./plugins 65 | 66 | # This Airflow quick-start needs to know your 67 | # host user id 68 | echo -e "AIRFLOW_UID=$(id -u)" > .env 69 | ``` 70 | 71 | 1. Making sure you are still in the airflow directory, initialise the airflow database. This will take a few minutes. Make sure the Docker daemon (background process) is running before doing this. 72 | 73 | ```bash 74 | docker-compose up airflow-init 75 | ``` 76 | 77 | 1. Create our Airflow containers. This could take a while.
You'll know when it's done when you get an Airflow login screen at http://localhost:8080. 78 | 79 | ```bash 80 | docker-compose up 81 | ``` 82 | 83 | 1. If interested, once containers are created, you can view them in Docker Desktop, or list them from the command line with: 84 | 85 | ```bash 86 | docker ps 87 | ``` 88 | 1. You can even connect into a docker container and navigate around the filesystem: 89 | 90 | ```bash 91 | docker exec -it bash 92 | ``` 93 | 94 | 1. As mentioned above, navigate to `http://localhost:8080` to access the Airflow Web Interface. This is running within one of the Docker containers, which is mapping onto our local machine with port 8080. If nothing shows up, give it a few minutes more. Password and username are both `airflow`. For understanding the UI, I'd recommend looking at some guides like this [one](https://airflow.apache.org/docs/apache-airflow/stable/ui.html). 95 | 96 | 97 | 1. The dag `etl_reddit_pipeline` should be set to start running automatically once the containers are created. It may have already finished by the time you login. This option is set within the docker-compose file (`AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'false'`). The next DAG run will be at midnight. If you click on the DAG and look under the Tree view, all boxes should be dark green if the DAG run was successful. If there's any issues, this [resource](https://www.astronomer.io/guides/airflow-ui/) or the ones linked previously might help. Essentially, you'll want to click on any box that's red, click `logs` and scan through it until you find the issue. 98 | 99 | 1. If you want to shut down the airflow containers, run the following command from the airflow directory: 100 | 101 | ```bash 102 | docker-compose down 103 | ``` 104 | 105 | 1. Or if you want stop and delete containers, delete volumes with database data and download images, run the following. This can be useful if you want to remove everything and start from scratch. It's a good idea to do some reading into docker commands before running something like this though, so you understand what it's doing. 106 | 107 | ```bash 108 | docker-compose down --volumes --rmi all 109 | ``` 110 | 111 | ## Explanation 112 | 113 | If you check in the `airflow/dags` folder, you'll find a file titled `elt_reddit_pipeline.py`. This is our DAG which you saw in Airflow's UI. 114 | 115 | It's a very simple DAG. All it's doing is running 3 tasks, one after the other. This DAG will run everyday at midnight. It will also run once as soon as you create the Docker containers. These tasks are using `BashOperator`, meaning that they are running a bash command. The tasks here are running a bash command to call external Python scripts (these Python scripts also exist within our docker container through the use of volumes). You'll find them under the `extraction` folder. 116 | 117 | Read below for more details: 118 | 119 | 1. `extract_reddit_data_task` 120 | 121 | This is extracting Reddit data. Specifically, it's taking the top posts of the day from `r/DataEngineering` and collecting a few different attributes, like the number of comments. It's then saving this to a CSV within the /tmp folder. 122 | 123 | 1. `upload_to_s3` 124 | 125 | This is uploading the newly created CSV to AWS S3 for storage within the bucket Terraform created. 126 | 127 | 1. `copy_to_redshift` 128 | 129 | This is creating a table in Redshift if it doesn't already exist. It's then using the COPY command to copy data from the newly uploaded CSV file in S3 to Redshift. 
This is designed to avoid duplicate data based on post id. If the same post id is in a later DAG run load, then warehouse will be updated with that record. Read [here](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) for information on the COPY command. 130 | 131 | --- 132 | 133 | [Previous Step](config.md) | [Next Step](dbt.md) 134 | 135 | or 136 | 137 | [Back to main README](../README.md) 138 | -------------------------------------------------------------------------------- /instructions/improvements.md: -------------------------------------------------------------------------------- 1 | # Improvements 2 | 3 | These are some improvements that could be made. 4 | 5 | ## Setup Alerts & Notifications 6 | 7 | With Airflow, we can have it send emails when there has been a failure. 8 | 9 | ## Improve Airflow & Docker process 10 | 11 | Docker/Aiflow files used were pulled from online with very few changes made. These could be simplified and/or refactored with a real production environment in mind. 12 | 13 | ## Testing & Validation 14 | 15 | Better validation checks could be implemented to check data is correct, check all components of the pipeline work together and on their own, remove duplicates, and so forth. 16 | 17 | ## Simplify Process 18 | 19 | The use of Airflow and dbt is overkill. Alternative ways to run this pipeline could be with Cron for orchestration and PostgreSQL or SQLite for storage. 20 | 21 | ## Stream over Batch Processing 22 | 23 | If we want our Dashboard to always be up-to-date, we could benefit from something like Kafka. 24 | 25 | ## Optimisation 26 | 27 | Look for performance improvements, reduce code redundancy, and implement software engineering best practices. For example, consider using Parquet file format over CSV, or consider whether warehouse data could be modeled as a star schema. 28 | 29 | [Previous Step](terminate.md) 30 | 31 | or 32 | 33 | [Back to main README](../README.md) 34 | 35 | 36 | -------------------------------------------------------------------------------- /instructions/overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This pipeline was designed not only to create a dashboard, but to gain exposure to a range of tools, develop new skills, and hopefully provide help to others. 4 | 5 | ## How this pipeline works 6 | 7 | The pipeline is a single DAG which extracts Reddit data using the Reddit API. Python's [PRAW](https://praw.readthedocs.io/en/stable/) API wrapper is used here. 8 | 9 | It is setup to extract data from the past 24 hours and store in a CSV with fields such as post ID, author name, among others. 10 | 11 | This CSV is then loaded directly into an AWS S3 bucket (cloud storage) before being copied to AWS Redshift (cloud data warehouse). 12 | 13 | This entire process is running with Apache Airflow (orchestration tool) running with Docker (a container). This saves us having to manually setup Airflow. 14 | 15 | Another two components make up this project that are not controlled with Airflow: 16 | 17 | * We use dbt to connect to our data warehouse and transform the data. We're only using dbt to gain some familiarity with it and build our skills. 18 | 19 | * We connect a BI tool to our warehouse and create some visualisations. I recommend Google Data Studio, but feel free to use something else. 20 | 21 | Proceed to the next step to get started. 
22 | 23 | --- 24 | 25 | [Next Step](reddit.md) 26 | 27 | or 28 | 29 | [Back to main README](../README.md) 30 | -------------------------------------------------------------------------------- /instructions/reddit.md: -------------------------------------------------------------------------------- 1 | ## Reddit API 2 | 3 | For this project we'll be taking data from Reddit, specifically the `r/DataEngineering` subreddit. 4 | 5 | > Feel free to change the subreddit in the `extract_reddit_etl.py` script. 6 | 7 | To extract Reddit data, we need to use its Application Programming Interface ([API](https://www.mulesoft.com/resources/api/what-is-an-api)). There are a couple of steps you'll need to follow to set this up. 8 | 9 | 1. Create a [Reddit account](https://www.reddit.com/register/). 10 | 2. Navigate [here](https://www.reddit.com/prefs/apps) and create an app. Make sure you select "script" from the radio buttons during the setup process. 11 | 3. Take a note of a few things once this is set up: 12 | 13 | - the App name 14 | - the App ID 15 | - the API Secret Key 16 | 17 | --- 18 | 19 | [Previous Step](overview.md) | [Next Step](aws.md) 20 | 21 | or 22 | 23 | [Back to main README](../README.md) 24 | -------------------------------------------------------------------------------- /instructions/setup_infrastructure.md: -------------------------------------------------------------------------------- 1 | # AWS Infrastructure 2 | 3 | We'll use an infrastructure-as-code tool called `Terraform`. This will allow us to quickly set up (and destroy) our AWS resources using code. 4 | 5 | >Note that Terraform works with multiple cloud providers, not just AWS. 6 | 7 | If you want a quick introduction, check out [this](https://learn.hashicorp.com/terraform?utm_source=terraform_io) tutorial. 8 | 9 | We'll use Terraform to create: 10 | 11 | * **Redshift Cluster** 12 | 13 | *Redshift is a columnar data warehousing solution offered by AWS. This will be the end destination for our data.* 14 | 15 | * **IAM Role for Redshift** 16 | 17 | *A role we assign to Redshift which will give it permission to read data from S3.* 18 | 19 | * **S3 Bucket** 20 | 21 | *Object storage for our extracted Reddit data.* 22 | 23 | * **Security Group** 24 | 25 | *This particular security group will be applied to Redshift, and will allow all incoming traffic so our dashboard can connect to it. NOTE: In a real production environment, it's not a good idea to allow all traffic into your resource.* 26 | 27 | ## Setup 28 | 29 | 1. Install Terraform 30 | 31 | You can find installation instructions [here](https://learn.hashicorp.com/tutorials/terraform/install-cli) for your OS. 32 | 33 | 1. Change into the `terraform` directory 34 | 35 | ```bash 36 | cd ~/Reddit-API-Pipeline/terraform 37 | ``` 38 | 39 | 1. Open the `variable.tf` file 40 | 41 | 1. Fill in the `default` parameters. 42 | 43 | * Specify a master DB user password for Redshift. Note that this may show up in logs and the Terraform state file. The password should contain upper and lowercase letters, as well as numbers. 44 | 45 | * Specify a bucket name. This must be globally unique and must not violate the S3 bucket naming constraints (underscores are not allowed, for example, so use something like `yourname-reddit-bucket`). 46 | 47 | * Specify a region (e.g. `eu-west-2`). You'll find a list [here](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Concepts.RegionsAndAvailabilityZones.html). Ideally choose somewhere close by. 48 | 49 | 1. It may be a good idea to amend `.gitignore` so that all Terraform files are ignored and you don't accidentally commit your password and other details.
You'll need to remove the `!*.tf` line. 50 | 51 | 1. Making sure you are still in the `terraform` directory, run this command to initialise Terraform and download the AWS provider plugin: 52 | 53 | ```bash 54 | terraform init 55 | ``` 56 | 57 | 1. Run this command to create a plan based on `main.tf` and execute the planned changes to create resources in AWS: 58 | 59 | ```bash 60 | terraform apply 61 | ``` 62 | 63 | 1. (optional) Run this command to terminate the resources: 64 | 65 | ```bash 66 | terraform destroy 67 | ``` 68 | 69 | 70 | In the [AWS Console](https://aws.amazon.com/console/), you can now view your Redshift cluster, IAM role, and S3 bucket. You can also manually delete or customize them here, and query any Redshift databases using the query editor. Just be sure to specify the correct region in the top right-hand side of the AWS console when looking for your Redshift cluster. 71 | 72 | --- 73 | 74 | [Previous Step](aws.md) | [Next Step](config.md) 75 | 76 | or 77 | 78 | [Back to main README](../README.md) 79 | -------------------------------------------------------------------------------- /instructions/terminate.md: -------------------------------------------------------------------------------- 1 | # Finishing Up 2 | 3 | If you followed this through, congrats! You now have a functioning pipeline. If you encountered any issues along the way, please let me know so I can make improvements. 4 | 5 | I'd recommend leaving Docker running for a few days so your Redshift table contains a few days' worth of data. Or even update the Reddit ETL script to pull more data per run (so your dashboard looks more interesting). You can do this by changing the time filter to `week` or `all` instead of `day`. 6 | 7 | >If you don't want to keep Docker running on your local machine, there are options to run it in the cloud. I provided some details on setting Docker up on an AWS EC2 instance in another project [here](https://github.com/ABZ-Aaron/CoinCap-API-Pipeline). You could also consider setting up a virtual machine in [Google Cloud](https://www.learningjournal.guru/article/google-cloud/free-learning-virtual-machine/). I believe the former will cost you, as AWS EC2 instances that fall under the free tier tend to be limited in memory and storage, and will likely need to be upgraded. 8 | 9 | You'll then probably want to terminate your resources, so you don't incur a charge for leaving them running. See the below section for details. 10 | 11 | >If you want to keep a dashboard up in Google Data Studio, you can change the data source to a CSV that's been downloaded from Redshift. It won't update with new data each day, but it's still something you can put on your resume. See the previous section for details. You could also consider re-creating this pipeline without using cloud computing or Docker. You can opt for a simple local PostgreSQL or SQLite database (for storage) and CRON or Windows Task Scheduler (for orchestration). This will be free and won't drain your computer of memory & battery, while still updating your dashboard each day. 12 | 13 | ## Termination 14 | 15 | To terminate your resources, follow the below steps: 16 | 17 | 18 | 1. Terminate your AWS resources by running the following Terraform command under the `terraform` directory: 19 | 20 | ```bash 21 | terraform destroy 22 | ``` 23 | 24 | You can then check in the AWS console that Terraform has done its job of deleting all the resources we created earlier. 25 | 26 | 27 | 1. Stop and delete the containers, delete the volumes holding the database data, and remove the downloaded images.
To do so, navigate to the `airflow` directory where you first ran `docker-compose up`, and run the following: 28 | 29 | ```bash 30 | docker-compose down --volumes --rmi all 31 | ``` 32 | 33 | 1. The following command removes all stopped containers, all networks not used by at least one container, all unused images, all volumes, and all dangling build cache: 34 | 35 | ```bash 36 | docker system prune -a --volumes 37 | ``` 38 | 39 | 1. Delete your dbt account if you wish, along with any EC2 instances you may have set up. 40 | 41 | --- 42 | 43 | [Previous Step](visualisation.md) | [Next Step](improvements.md) 44 | 45 | or 46 | 47 | [Back to main README](../README.md) 48 | -------------------------------------------------------------------------------- /instructions/tools.md: -------------------------------------------------------------------------------- 1 | # Tools 2 | 3 | IN PROGRESS 4 | 5 | Feel free to skip this section. Here I've laid out a quick summary of why each tool was chosen, or, in cases where a tool is overkill, why it would still be useful. 6 | 7 | ## Airflow 8 | 9 | ## Docker 10 | 11 | ## AWS S3 12 | 13 | ## AWS Redshift 14 | 15 | ## dbt 16 | 17 | ## Google Data Studio 18 | -------------------------------------------------------------------------------- /instructions/visualisation.md: -------------------------------------------------------------------------------- 1 | # Data Visualisation 2 | 3 | We now want to visualise our data. It's up to you how to do this. 4 | 5 | Below I've provided some basic instructions on connecting Redshift to [PowerBI](https://powerbi.microsoft.com/en-gb/) and [Google Data Studio](https://datastudio.google.com). 6 | 7 | Feel free to use the default table in Redshift (i.e. `reddit`) or the newly transformed one we created with dbt (i.e. `reddit_transformed`). 8 | 9 | > Google Data Studio is the better option for a personal project, as reports created there can be shared freely and easily. 10 | 11 | ## Google Data Studio 12 | 13 | 1. Navigate [here](https://datastudio.google.com) and follow the setup instructions. 14 | 1. Click `Create` on the top right, then `Report` 15 | 1. Under `Connect to data` search for `Amazon Redshift` 16 | 1. Enter the relevant details and click `Authenticate` 17 | 1. Select your table 18 | 19 | You can now feel free to create some visualisations. Some tutorials/guides [here](https://support.google.com/datastudio/answer/6283323?hl=en). Here's an example of mine: 20 | 21 | [](https://datastudio.google.com/reporting/e927fef6-b605-421c-ae29-89a66e11ea18) 22 | 23 | You can then publicly share your report by navigating to Share > Manage access. 24 | 25 | ### What to do once resources are terminated 26 | 27 | One thing to note: you don't want to keep your Redshift cluster up past 2 months, as it'll incur a cost once the free trial period comes to an end. You also probably don't want the Airflow Docker containers running on your local machine all the time, as this will drain resources and memory. 28 | 29 | As such, your Redshift-Google Data Studio connection will eventually be broken. If you want to display a dashboard on your resume even after this happens, one option is to download your Redshift table as a CSV and use this as the data source in Google Data Studio: 30 | 31 | 1. Run the `download_redshift_to_csv.py` file under the `extraction` folder to download your Redshift table as a CSV to your `/tmp` folder. Store this CSV somewhere safe. If you want to download the transformed version of your table, you may need to amend this script slightly to include the new table name, as well as the schema. A rough sketch of what this kind of download script does is shown below.
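This sketch is illustrative only; the connection details, table name and output path are assumptions, not the exact contents of `download_redshift_to_csv.py`.

```python
# Illustrative sketch of downloading a Redshift table to a local CSV -- the
# connection details, table name and output path are assumptions for the example.
import csv

import psycopg2

# In the real script these values come from a configuration file
connection = psycopg2.connect(
    host="<redshift-host>",
    port=5439,
    dbname="dev",
    user="awsuser",
    password="<password>",
)

with connection, connection.cursor() as cur:
    # Swap in the dbt-transformed table (and its schema) if that's what you want to export
    cur.execute("SELECT * FROM reddit;")
    column_names = [desc[0] for desc in cur.description]
    with open("/tmp/redshift_extract.csv", "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        writer.writerows(cur.fetchall())

connection.close()
```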
32 | 1. If you've already created your report in Google Data Studio, try navigating to File > Make a copy, and select the CSV as the new data source. This should maintain all your existing visualisations. 33 | 1. You could also refactor the pipeline to use [CRON](https://en.wikipedia.org/wiki/Cron) and [PostgreSQL](https://www.postgresql.org). You could leave this pipeline running as long as you want without incurring a charge, and your Google Data Studio report will be continuously updated with new data. 34 | 35 | ## PowerBI 36 | 37 | For PowerBI, you'll need to be on Windows and install PowerBI Desktop. If you're on Mac or Linux, you can consider virtualisation software like [VirtualBox](https://www.virtualbox.org) to run Windows. 38 | 39 | To connect Redshift to PowerBI: 40 | 41 | 1. Create an account with PowerBI. If you don't have a work or school email address, consider setting up an account with a [temporary email address](https://tempmail.net), as it won't accept Gmail and other services used for personal accounts. 42 | 1. Open PowerBI and click `Get Data`. 43 | 1. Search for `Redshift` in the search box and click `Connect`. 44 | 1. Enter your Redshift server/host name and the name of the database (e.g. `dev`), then click `OK`. 45 | 1. Enter the username (e.g. `awsuser`) and password for the database, and then select the relevant table you'd like to load in. 46 | 47 | You can now feel free to create some visualisations. Some tutorials/guides [here](https://docs.microsoft.com/en-us/learn/powerplatform/power-bi). 48 | 49 | --- 50 | 51 | [Previous Step](dbt.md) | [Next Step](terminate.md) 52 | 53 | or 54 | 55 | [Back to main README](../README.md) 56 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3==1.24.7 2 | botocore==1.27.7 3 | certifi==2022.5.18.1 4 | charset-normalizer==2.0.12 5 | configparser==5.2.0 6 | idna==3.3 7 | jmespath==1.0.0 8 | numpy==1.22.4 9 | pandas==1.4.2 10 | praw==7.6.0 11 | prawcore==2.3.0 12 | psycopg2-binary==2.9.3 13 | pyarrow==8.0.0 14 | python-dateutil==2.8.2 15 | pytz==2022.1 16 | requests==2.28.0 17 | s3transfer==0.6.0 18 | six==1.16.0 19 | update-checker==0.18.0 20 | urllib3==1.26.9 21 | websocket-client==1.3.2 22 | -------------------------------------------------------------------------------- /terraform/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.2.0" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = "~> 4.16" 8 | } 9 | } 10 | } 11 | 12 | # Configure AWS provider 13 | provider "aws" { 14 | region = var.aws_region 15 | } 16 | 17 | # Configure Redshift cluster. This will fall under the free tier as of June 2022.
18 | resource "aws_redshift_cluster" "redshift" { 19 | cluster_identifier = "redshift-cluster-pipeline" 20 | skip_final_snapshot = true # must be set so we can destroy redshift with terraform destroy 21 | master_username = "awsuser" 22 | master_password = var.db_password 23 | node_type = "dc2.large" 24 | cluster_type = "single-node" 25 | publicly_accessible = "true" 26 | iam_roles = [aws_iam_role.redshift_role.arn] 27 | vpc_security_group_ids = [aws_security_group.sg_redshift.id] 28 | 29 | } 30 | 31 | # Confuge security group for Redshift allowing all inbound/outbound traffic 32 | resource "aws_security_group" "sg_redshift" { 33 | name = "sg_redshift" 34 | ingress { 35 | from_port = 0 36 | to_port = 0 37 | protocol = "-1" 38 | cidr_blocks = ["0.0.0.0/0"] 39 | } 40 | egress { 41 | from_port = 0 42 | to_port = 0 43 | protocol = "-1" 44 | cidr_blocks = ["0.0.0.0/0"] 45 | } 46 | } 47 | 48 | # Create S3 Read only access role. This is assigned to Redshift cluster so that it can read data from S3 49 | resource "aws_iam_role" "redshift_role" { 50 | name = "RedShiftLoadRole" 51 | managed_policy_arns = ["arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"] 52 | assume_role_policy = jsonencode({ 53 | Version = "2012-10-17" 54 | Statement = [ 55 | { 56 | Action = "sts:AssumeRole" 57 | Effect = "Allow" 58 | Sid = "" 59 | Principal = { 60 | Service = "redshift.amazonaws.com" 61 | } 62 | }, 63 | ] 64 | }) 65 | } 66 | 67 | # Create S3 bucket 68 | resource "aws_s3_bucket" "reddit_bucket" { 69 | bucket = var.s3_bucket 70 | force_destroy = true # will delete contents of bucket when we run terraform destroy 71 | } 72 | 73 | # Set access control of bucket to private 74 | resource "aws_s3_bucket_acl" "s3_reddit_bucket_acl" { 75 | bucket = aws_s3_bucket.reddit_bucket.id 76 | acl = "private" 77 | 78 | # NOTE: TO BE TESTED 79 | # Add by Yuzhen, start 80 | # Depends_on resource to avoid error "AccessControlListNotSupported: The bucket does not allow ACLs" 81 | depends_on = [aws_s3_bucket_ownership_controls.s3_bucket_acl_ownership] 82 | # Add by Yuzhen, end 83 | 84 | } 85 | 86 | # NOTE: TO BE TESTED 87 | # Add by Yuzhen, start 88 | # Resource to avoid error "AccessControlListNotSupported: The bucket does not allow ACLs" 89 | resource "aws_s3_bucket_ownership_controls" "s3_bucket_acl_ownership" { 90 | bucket = aws_s3_bucket.reddit_bucket.id 91 | rule { 92 | object_ownership = "ObjectWriter" 93 | } 94 | } 95 | # Add by Yuzhen, end 96 | -------------------------------------------------------------------------------- /terraform/output.tf: -------------------------------------------------------------------------------- 1 | # Output hostname of Redshift 2 | output "redshift_cluster_hostname" { 3 | description = "ID of the Redshift instance" 4 | value = replace( 5 | aws_redshift_cluster.redshift.endpoint, 6 | format(":%s", aws_redshift_cluster.redshift.port),"", 7 | ) 8 | } 9 | 10 | # Output port of Redshift 11 | output "redshift_port" { 12 | description = "Port of Redshift cluster" 13 | value = aws_redshift_cluster.redshift.port 14 | } 15 | 16 | # Output Redshift password 17 | output "redshift_password" { 18 | description = "Password of Redshift cluster" 19 | value = var.db_password 20 | } 21 | 22 | # Output Redshift username 23 | output "redshift_username" { 24 | description = "Username of Redshift cluster" 25 | value = aws_redshift_cluster.redshift.master_username 26 | } 27 | 28 | # Output Role assigned to Redshift 29 | output "redshift_role" { 30 | description = "Role assigned to Redshift" 31 | value = 
aws_iam_role.redshift_role.name 32 | } 33 | 34 | # Output Account ID of AWS 35 | data "aws_caller_identity" "current" {} 36 | output "account_id" { 37 | value = data.aws_caller_identity.current.account_id 38 | } 39 | 40 | # Output Region set for AWS 41 | output "aws_region" { 42 | description = "Region set for AWS" 43 | value = var.aws_region 44 | } 45 | 46 | output "s3_bucket_name" { 47 | description = "Name of the S3 bucket" 48 | value = var.s3_bucket 49 | } 50 | -------------------------------------------------------------------------------- /terraform/variable.tf: -------------------------------------------------------------------------------- 1 | variable "db_password" { 2 | description = "Password for Redshift master DB user" 3 | type = string 4 | default = "" 5 | } 6 | 7 | variable "s3_bucket" { 8 | description = "Bucket name for S3" 9 | type = string 10 | default = "" 11 | } 12 | 13 | variable "aws_region" { 14 | description = "Region for AWS" 15 | type = string 16 | default = "" 17 | } --------------------------------------------------------------------------------