├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── airflow ├── README.md ├── config │ └── airflow.cfg ├── dags │ └── redshift │ │ ├── dags │ │ ├── increment_aggregates.py │ │ └── refresh_aggregates.py │ │ └── sql │ │ ├── increment │ │ ├── aggregate_metrics_by_day.sql │ │ └── aggregate_transaction_metrics_by_block.sql │ │ └── refresh │ │ ├── aggregate_metrics_by_day.sql │ │ └── aggregate_transaction_metrics_by_block.sql └── entrypoint.sh ├── docs ├── challenges.md ├── getting_started.md ├── img │ ├── dags.png │ ├── eks_worker_autoscaling.png │ ├── eks_worker_no_autoscaling.png │ ├── grafana_dashboard.png │ ├── kubernetes_dashboard.png │ ├── range_restricted_scan.png │ ├── runtime_of_daily_incremental_update.png │ └── tech_stack.png └── tech_stack.md ├── k8s ├── README.md ├── config_maps │ └── aws-auth-cm.yaml ├── service_accounts │ └── eks-admin-service-account.yaml ├── services │ ├── airflow-webserver.yaml │ ├── go-ethereum.yaml │ ├── grafana.yaml │ └── superset.yaml └── setup.sh ├── redshift ├── check_load_errors.sql ├── schema │ ├── coinmarketcap.sql │ ├── coinmetrics.sql │ └── multipl.sql └── users.sql └── terraform ├── README.md ├── environments ├── base │ └── ops │ │ └── ecr │ │ ├── ecr.tf │ │ └── terraform.tfvars └── prod │ ├── compute │ └── eks │ │ ├── eks.tf │ │ └── terraform.tfvars │ ├── data │ ├── rds-airflow │ │ ├── rds-airflow.tf │ │ └── terraform.tfvars │ ├── rds-grafana │ │ ├── rds-grafana.tf │ │ └── terraform.tfvars │ ├── rds-superset │ │ ├── rds-superset.tf │ │ └── terraform.tfvars │ └── redshift │ │ ├── redshift.tf │ │ └── terraform.tfvars │ ├── network │ ├── bastion │ │ ├── bastion.tf │ │ └── terraform.tfvars │ └── vpc │ │ ├── terraform.tfvars │ │ └── vpc.tf │ └── services │ ├── airflow │ ├── airflow.tf │ └── terraform.tfvars │ └── grafana │ ├── grafana.tf │ └── terraform.tfvars └── modules └── services └── airflow ├── main.tf ├── outputs.tf └── variables.tf /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Terraform state 2 | .terraform/* 3 | **/.terraform/* 4 | **/terraform.tfstate.backup 5 | **/errored.tfstate 6 | tfplan 7 | 8 | # Secrets 9 | k8s/secrets/ 10 | 11 | # VIM swap files 12 | *.swp 13 | 14 | # Temp files 15 | # tmp/* 16 | 17 | *.zip 18 | 19 | # mypy 20 | .mypy_cache 21 | 22 | # pid file created during tests 23 | *.pid 24 | 25 | # Jetbrains IDE files 26 | *.iml 27 | .idea/ 28 | 29 | # Temp sql 30 | redshift/scratch.sql 31 | 32 | # VIM swap files 33 | *.swp 34 | 35 | # Byte-compiled / optimized / DLL files 36 | __pycache__/ 37 | *.py[cod] 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | env/ 45 | build/ 46 | develop-eggs/ 47 | dist/ 48 | downloads/ 49 | eggs/ 50 | .eggs/ 51 | lib/ 52 | lib64/ 53 | parts/ 54 | sdist/ 55 | var/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | 60 | # PyInstaller 61 | # Usually these files are written by a python script from a template 62 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
63 | *.manifest 64 | *.spec 65 | 66 | # Installer logs 67 | pip-log.txt 68 | pip-delete-this-directory.txt 69 | 70 | # Unit test / coverage reports 71 | htmlcov/ 72 | .tox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | nosetests.xml 77 | coverage.xml 78 | *,cover 79 | .pytest_cache 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "airflow/dags/ethereum-etl-airflow"] 2 | path = airflow/dags/ethereum-etl-airflow 3 | url = https://github.com/iter-io/ethereum-etl-airflow.git 4 | branch = feature-aws 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # This Dockerfile is used to build a Docker image for Airflow that contains all 3 | # dependencies and DAGs. This same image is used for the scheduler, webserver, 4 | # and workers. 5 | # 6 | # It's based on the ones in the Airflow repository: 7 | # 8 | # https://github.com/apache/airflow/blob/master/Dockerfile 9 | # https://github.com/apache/airflow/blob/master/scripts/ci/kubernetes/docker/Dockerfile 10 | # 11 | # It also borrows from Puckel's popular image: 12 | # 13 | # https://github.com/puckel/docker-airflow 14 | # 15 | # The major differences: 16 | # 17 | # 1. Airflow is installed directly from the master branch instead of the tagged 18 | # releases. This involves cloning the code from Github into the image and 19 | # building the frontend with npm. 20 | # 21 | # 2. Dependencies were added for ethereum-etl and bitcoin-etl. 22 | # 23 | # 24 | 25 | FROM python:3.6-slim 26 | 27 | # Never prompts the user for choices on installation/configuration of packages 28 | ENV DEBIAN_FRONTEND noninteractive 29 | ENV TERM linux 30 | 31 | # Dependences required for the build but not at runtime 32 | ARG buildDeps="\ 33 | freetds-dev \ 34 | libczmq-dev \ 35 | libkrb5-dev \ 36 | libsasl2-dev \ 37 | libssl-dev \ 38 | libffi-dev \ 39 | libpq-dev \ 40 | git \ 41 | nodejs" 42 | 43 | # Dependencies required by Airflow at runtime 44 | ARG APT_DEPS="\ 45 | $buildDeps \ 46 | bind9utils \ 47 | libsasl2-dev \ 48 | freetds-bin \ 49 | build-essential \ 50 | default-libmysqlclient-dev \ 51 | inetutils-telnet \ 52 | apt-utils \ 53 | curl \ 54 | rsync \ 55 | netcat \ 56 | locales \ 57 | wget \ 58 | zip \ 59 | unzip" 60 | 61 | # Dependencies installed via pip 62 | ARG PYTHON_DEPS="\ 63 | pytz \ 64 | cryptography \ 65 | requests \ 66 | pyOpenSSL \ 67 | ndg-httpsclient \ 68 | pyasn1 \ 69 | psycopg2-binary \ 70 | Flask-Bcrypt \ 71 | Flask-WTF==0.14 \ 72 | click \ 73 | kubernetes \ 74 | setuptools \ 75 | wheel" 76 | 77 | # http://airflow.apache.org/installation.html 78 | ARG AIRFLOW_DEPS="postgres,s3,devel" 79 | ARG AIRFLOW_HOME=/usr/local/airflow 80 | 81 | # Required by ethereum-etl 82 | ARG ETHEREUM_ETL_DEPS="\ 83 | google-api-python-client \ 84 | httplib2 \ 85 | bitcoin-etl \ 86 | ethereum-etl \ 87 | mythril \ 88 | pyetherchain \ 89 | pandas \ 90 | pandas-gbq" 91 | 92 | ENV AIRFLOW_GPL_UNIDECODE yes 93 | 94 | # Define en_US. 
95 | ENV LANGUAGE en_US.UTF-8 96 | ENV LANG en_US.UTF-8 97 | ENV LC_ALL en_US.UTF-8 98 | ENV LC_CTYPE en_US.UTF-8 99 | ENV LC_MESSAGES en_US.UTF-8 100 | 101 | WORKDIR /opt/ 102 | 103 | RUN set -ex \ 104 | # Update our currently installed packages 105 | && apt-get update -yqq \ 106 | && apt-get upgrade -yqq \ 107 | # Install Airflow dependencies 108 | && apt install -y $APT_DEPS \ 109 | && pip install --upgrade pip \ 110 | && pip install --no-cache-dir ${PYTHON_DEPS} \ 111 | # Get the master branch of Airflow from Github 112 | && git clone --depth=1 https://github.com/apache/airflow.git \ 113 | # Build the Airflow frontend 114 | && curl -sL https://deb.nodesource.com/setup_11.x | bash - \ 115 | && apt-get install -y nodejs \ 116 | && npm --prefix /opt/airflow/airflow/www install /opt/airflow/airflow/www \ 117 | && npm --prefix /opt/airflow/airflow/www run-script build \ 118 | # Install Airflow from source 119 | && pip install --no-cache-dir -e /opt/airflow[$AIRFLOW_DEPS] \ 120 | # Required by Airflow S3 Hook 121 | && useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \ 122 | && pip install boto3 \ 123 | # Change the local to UTF-8 124 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 125 | && locale-gen \ 126 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 127 | # Required by ethereum-etl-airflow 128 | && pip install --no-cache-dir ${ETHEREUM_ETL_DEPS} \ 129 | # Remove unncessary files from this layer 130 | && apt-get purge --auto-remove -yqq $buildDeps \ 131 | && apt-get autoremove -yqq --purge \ 132 | && apt-get clean \ 133 | && rm -rf \ 134 | /var/lib/apt/lists/* \ 135 | /tmp/* \ 136 | /var/tmp/* \ 137 | /usr/share/man \ 138 | /usr/share/doc \ 139 | /usr/share/doc-base 140 | 141 | WORKDIR ${AIRFLOW_HOME} 142 | 143 | COPY airflow/entrypoint.sh /entrypoint.sh 144 | COPY airflow/config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 145 | 146 | COPY ./airflow/dags ${AIRFLOW_HOME}/dags 147 | 148 | # Trying to get Kubernetes workers to load our dags 149 | COPY ./airflow/dags /tmp/dags 150 | 151 | RUN chown -R airflow: ${AIRFLOW_HOME} 152 | 153 | EXPOSE 8080 5555 8793 154 | 155 | USER airflow 156 | ENTRYPOINT ["/entrypoint.sh"] 157 | 158 | # sets default arg for entrypoint 159 | CMD ["webserver"] 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, ITERIO INC. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build_venv build_docker ensure_image lint_code lint_tests run run_tests test \ 2 | update_dependencies upload_docker venv_activate deploy 3 | 4 | DOCKER_IMAGE = 772681551441.dkr.ecr.us-east-1.amazonaws.com/security-token-analytics 5 | KUBE = kubectl --context="arn:aws:eks:us-east-1:772681551441:cluster/insight-prod-cluster" 6 | VERSION = $(shell git rev-parse --short HEAD) 7 | IMAGE_FOUND = $(shell docker images \ 8 | --format "{{ .Repository }}" --filter "reference=security_token_analytics") 9 | AIRFLOW_TF_CONFIG_DIR = terraform/environments/prod/services/airflow 10 | 11 | test: build_venv 12 | 13 | # 14 | # Uploads our Docker image to ECR with both latest and version tags. 15 | # 16 | upload_docker: build_docker 17 | 18 | # Tag our image 19 | docker tag $(DOCKER_IMAGE):latest $(DOCKER_IMAGE):$(VERSION) 20 | 21 | # Authenticate with ECR and push the image (note: $$ escapes the shell substitution from make) 22 | eval $$(aws ecr get-login --no-include-email) && docker push $(DOCKER_IMAGE):$(VERSION) 23 | eval $$(aws ecr get-login --no-include-email) && docker push $(DOCKER_IMAGE):latest 24 | 25 | 26 | deploy: upload_docker 27 | echo "Redeploying the Airflow webserver" 28 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform taint --module=airflow kubernetes_deployment.airflow_webserver 29 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform plan -target=module.airflow.kubernetes_deployment.airflow_webserver -out=tfplan 30 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform apply "tfplan" 31 | 32 | echo "Redeploying the Airflow scheduler" 33 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform taint --module=airflow kubernetes_deployment.airflow_scheduler 34 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform plan -target=module.airflow.kubernetes_deployment.airflow_scheduler -out=tfplan 35 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform apply "tfplan" 36 | 37 | 38 | venv_activate: 39 | pipenv shell 40 | 41 | 42 | update_dependencies: 43 | pipenv update --dev 44 | make test 45 | 46 | 47 | # ========================== 48 | # Internal targets 49 | # ========================== 50 | 51 | build_docker: 52 | docker build --build-arg VERSION=$(VERSION) -t $(DOCKER_IMAGE) . 53 | 54 | build_venv: 55 | pipenv sync --dev 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # security-token-analytics 3 | 4 | ### Overview 5 | 6 | This project uses blockchain data to provide a platform for financial analysis 7 | of crypto assets. In particular, we are researching 8 | emerging standards for [security tokens](https://github.com/ethereum/EIPs/issues/1411) 9 | and potential methods of fundamental analysis. More broadly, this project aims 10 | to serve as an example for how to build a data pipeline for analyzing public 11 | blockchains.
12 | 13 | ## Documentation 14 | 15 | ### Table of Contents 16 | 1. [Getting Started](/docs/getting_started.md) 17 | 2. [Terraform Configs](/terraform/README.md) 18 | 3. [Setting up Kubernetes / EKS](/k8s/README.md) 19 | 4. [Airflow DAGs](/airflow/README.md) 20 | 5. [Tech Stack](/docs/tech_stack.md) 21 | 6. [Engineering Challenges](/docs/challenges.md) 22 | 23 | 24 | ### High-level Architecture 25 | ![high level architecture](docs/img/tech_stack.png) 26 | 27 | ### Example Dashboard 28 | ![example dashboard](docs/img/grafana_dashboard.png) 29 | -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- 1 | ![dags](../docs/img/dags.png) 2 | 3 | #### DAGs Overview 4 | 5 | 1. [ethereum_etl_export_dag](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/export_dag.py) - 6 | Runs the [ethereum-etl library](https://github.com/blockchain-etl/ethereum-etl) 7 | to export blockchain data into CSV and JSON files. These files are then uploaded 8 | to S3. 9 | 10 | 2. [ethereum_etl_load_dag_redshift](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/load_dag_redshift.py) - 11 | Uses the Redshift [COPY command](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) 12 | to load files from S3 into Redshift. 13 | 14 | 3. [redshift_increment_aggregates](https://github.com/iter-io/security-token-analytics/blob/master/airflow/dags/redshift/dags/increment_aggregates.py) - 15 | Executes [SQL queries](https://github.com/iter-io/security-token-analytics/tree/master/airflow/dags/redshift/sql/increment) 16 | in Redshift for incrementally updating the aggregate data models. 17 | 18 | 4. [redshift_refresh_aggregates](https://github.com/iter-io/security-token-analytics/blob/master/airflow/dags/redshift/dags/refresh_aggregates.py) - 19 | Executes [SQL queries](https://github.com/iter-io/security-token-analytics/tree/master/airflow/dags/redshift/sql/refresh) 20 | in Redshift for doing a full refresh of the aggregate data models. We keep 21 | this DAG off but can manually trigger it to rebuild the tables if necessary. 22 | 23 | 24 | #### Why the DAGs are not combined 25 | 26 | The original export and load DAGs are separated because the export 27 | DAG produces single partitions while the load DAG imports all partitions into 28 | BigQuery. A secondary goal of this project is to contribute back to the 29 | blockchain-etl project as opposed to forking it. So it made sense to keep them 30 | separated for now. 31 | 32 | DAGs #1-2 will be included in the pull request while DAGs #3-4 will remain in this 33 | project. After getting feedback on the pull request, DAGs #1-3 will probably 34 | be combined into one (or run as subdags). 35 | 36 | 37 | #### Airflow build 38 | 39 | The [Dockerfile](https://github.com/iter-io/security-token-analytics/blob/master/Dockerfile) 40 | in the root of this repository is used to build a Docker image for Airflow that 41 | contains all dependencies and DAGs. This same image is used for the scheduler, 42 | webserver, and workers. In order to have the latest Kubernetes features, this 43 | build uses the [master branch](https://github.com/apache/airflow/tree/master) of 44 | Airflow instead of a tagged release.
45 | 46 | Info on the frontend npm build: 47 | 48 | https://github.com/apache/airflow/blob/master/CONTRIBUTING.md#setting-up-the-node--npm-javascript-environment 49 | -------------------------------------------------------------------------------- /airflow/config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The home folder for airflow, default is ~/airflow 3 | airflow_home = /usr/local/airflow 4 | 5 | # The folder where your airflow pipelines live, most likely a 6 | # subfolder in a code repository 7 | dags_folder = /usr/local/airflow/dags 8 | 9 | # The folder where airflow should store its log files. 10 | base_log_folder = /usr/local/airflow/logs 11 | 12 | # https://github.com/apache/airflow/blob/master/docs/howto/write-logs.rst 13 | remote_logging = True 14 | #remote_base_log_folder = ENVRIONMENT VARIABLE 15 | #remote_log_conn_id = ENVRIONMENT VARIABLE 16 | #encrypt_s3_logs = ENVRIONMENT VARIABLE 17 | 18 | # Logging level 19 | logging_level = DEBUG 20 | fab_logging_level = WARN 21 | 22 | # Logging class 23 | # Specify the class that will specify the logging configuration 24 | # This class has to be on the python classpath 25 | # logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 26 | logging_config_class = 27 | 28 | # Log format 29 | log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s 30 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 31 | 32 | # Log filename format 33 | #log_filename_template = {{{{ ti.dag_id }}}}/{{{{ ti.task_id }}}}/{{{{ ts }}}}/{{{{ try_number }}}}.log 34 | #log_processor_filename_template = {{{{ filename }}}}.log 35 | #dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log 36 | 37 | # Hostname by providing a path to a callable, which will resolve the hostname 38 | #hostname_callable = socket:getfqdn 39 | 40 | # Default timezone in case supplied date times are naive 41 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 42 | default_timezone = utc 43 | 44 | # The executor class that airflow should use. Choices include 45 | # SequentialExecutor, LocalExecutor, CeleryExecutor 46 | #executor = LocalExecutor 47 | executor = KubernetesExecutor 48 | 49 | # The SqlAlchemy connection string to the metadata database. 50 | # SqlAlchemy supports many different database engine, more information 51 | # their website 52 | #sql_alchemy_conn = ENVIRONMENT VARIABLE 53 | 54 | # The encoding for the databases 55 | sql_engine_encoding = utf-8 56 | 57 | # The SqlAlchemy pool size is the maximum number of database connections 58 | # in the pool. 59 | # TODO: Does this affect our max concurrency? 60 | sql_alchemy_pool_size = 32 61 | 62 | # The SqlAlchemy pool recycle is the number of seconds a connection 63 | # can be idle in the pool before it is invalidated. This config does 64 | # not apply to sqlite. 65 | sql_alchemy_pool_recycle = 3600 66 | 67 | # How many seconds to retry re-establishing a DB connection after 68 | # disconnects. Setting this to 0 disables retries. 69 | sql_alchemy_reconnect_timeout = 300 70 | 71 | # The schema to use for the metadata database 72 | # SqlAlchemy supports databases with the concept of multiple schemas. 73 | sql_alchemy_schema = 74 | 75 | # The amount of parallelism as a setting to the executor. 
This defines 76 | # the max number of task instances that should run simultaneously 77 | # on this airflow installation 78 | parallelism = 128 79 | 80 | # The number of task instances allowed to run concurrently by the scheduler 81 | dag_concurrency = 128 82 | 83 | # Are DAGs paused by default at creation 84 | dags_are_paused_at_creation = True 85 | 86 | # The maximum number of active DAG runs per DAG 87 | max_active_runs_per_dag = 128 88 | 89 | load_examples = False 90 | 91 | # Where your Airflow plugins are stored 92 | plugins_folder = /usr/local/airflow/plugins 93 | 94 | # Secret key to save connection passwords in the db 95 | #fernet_key = ENVRIONMENT VARIABLE 96 | 97 | # Whether to disable pickling dags 98 | donot_pickle = False 99 | 100 | # How long before timing out a python file import while filling the DagBag 101 | # Increased this from 60 due to dag_id could not be found errors 102 | dagbag_import_timeout = 180 103 | 104 | # The class to use for running task instances in a subprocess 105 | task_runner = StandardTaskRunner 106 | 107 | # If set, tasks without a `run_as_user` argument will be run with this user 108 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 109 | default_impersonation = 110 | 111 | # What security module to use (for example kerberos): 112 | security = 113 | 114 | # If set to False enables some unsecure features like Charts and Ad Hoc Queries. 115 | # In 2.0 will default to True. 116 | secure_mode = False 117 | 118 | # Turn unit test mode on (overwrites many configuration options with test 119 | # values at runtime) 120 | unit_test_mode = False 121 | 122 | # Name of handler to read task instance logs. 123 | # Default to use task handler. 124 | task_log_reader = task 125 | 126 | # Whether to enable pickling for xcom (note that this is insecure and allows for 127 | # RCE exploits). This will be deprecated in Airflow 2.0 (be forced to False). 128 | enable_xcom_pickling = True 129 | 130 | # When a task is killed forcefully, this is the amount of time in seconds that 131 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 132 | killed_task_cleanup_time = 60 133 | 134 | # Whether to override params with dag_run.conf. If you pass some key-value pairs through `airflow backfill -c` or 135 | # `airflow trigger_dag -c`, the key-value pairs will override the existing ones in params. 136 | dag_run_conf_overrides_params = False 137 | 138 | # Worker initialisation check to validate Metadata Database connection 139 | worker_precheck = False 140 | 141 | [webserver] 142 | # The base url of your website as airflow cannot guess what domain or 143 | # cname you are using. This is used in automated emails that 144 | # airflow sends to point links to the right web server 145 | base_url = http://localhost:8080 146 | 147 | # The ip specified when starting the web server 148 | web_server_host = 0.0.0.0 149 | 150 | # The port on which to run the web server 151 | web_server_port = 8080 152 | 153 | # Paths to the SSL certificate and key for the web server. When both are 154 | # provided SSL will be enabled. This does not change the web server port. 155 | web_server_ssl_cert = 156 | web_server_ssl_key = 157 | 158 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 159 | web_server_master_timeout = 120 160 | 161 | # Number of seconds the gunicorn webserver waits before timing out on a worker 162 | web_server_worker_timeout = 120 163 | 164 | # Number of workers to refresh at a time. 
When set to 0, worker refresh is 165 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 166 | # bringing up new ones and killing old ones. 167 | worker_refresh_batch_size = 1 168 | 169 | # Number of seconds to wait before refreshing a batch of workers. 170 | worker_refresh_interval = 30 171 | 172 | # Secret key used to run your flask app 173 | # It should be as random as possible 174 | secret_key = {SECRET_KEY} 175 | 176 | # Number of workers to run the Gunicorn web server 177 | workers = 4 178 | 179 | # The worker class gunicorn should use. Choices include 180 | # sync (default), eventlet, gevent 181 | worker_class = sync 182 | 183 | # Log files for the gunicorn webserver. '-' means log to stderr. 184 | access_logfile = - 185 | error_logfile = - 186 | 187 | # Expose the configuration file in the web server 188 | # This is only applicable for the flask-admin based web UI (non FAB-based). 189 | # In the FAB-based web UI with RBAC feature, 190 | # access to configuration is controlled by role permissions. 191 | expose_config = True 192 | 193 | # Set to true to turn on authentication: 194 | # https://airflow.apache.org/security.html#web-authentication 195 | authenticate = False 196 | rbac = False 197 | 198 | # Filter the list of dags by owner name (requires authentication to be enabled) 199 | filter_by_owner = False 200 | 201 | # Filtering mode. Choices include user (default) and ldapgroup. 202 | # Ldap group filtering requires using the ldap backend 203 | # 204 | # Note that the ldap server needs the "memberOf" overlay to be set up 205 | # in order to user the ldapgroup mode. 206 | owner_mode = user 207 | 208 | # Default DAG view. Valid values are: 209 | # tree, graph, duration, gantt, landing_times 210 | dag_default_view = tree 211 | 212 | # Default DAG orientation. Valid values are: 213 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 214 | dag_orientation = LR 215 | 216 | # Puts the webserver in demonstration mode; blurs the names of Operators for 217 | # privacy. 218 | demo_mode = False 219 | 220 | # The amount of time (in secs) webserver will wait for initial handshake 221 | # while fetching logs from other worker machine 222 | log_fetch_timeout_sec = 5 223 | 224 | # By default, the webserver shows paused DAGs. Flip this to hide paused 225 | # DAGs by default 226 | hide_paused_dags_by_default = False 227 | 228 | # Consistent page size across all listing views in the UI 229 | page_size = 100 230 | 231 | # Define the color of navigation bar 232 | navbar_color = #007A87 233 | 234 | # Default dagrun to show in UI 235 | default_dag_run_display_number = 25 236 | 237 | # Enable werkzeug `ProxyFix` middleware 238 | enable_proxy_fix = False 239 | 240 | [aws_default] 241 | aws_default_region = us-east-1 242 | 243 | [email] 244 | email_backend = airflow.utils.email.send_email_smtp 245 | 246 | [smtp] 247 | # If you want airflow to send emails on retries, failure, and you want to 248 | # the airflow.utils.send_email function, you have to configure an smtp 249 | # server here 250 | smtp_host = localhost 251 | smtp_starttls = True 252 | smtp_ssl = False 253 | smtp_user = airflow 254 | smtp_port = 25 255 | smtp_password = airflow 256 | smtp_mail_from = airflow@airflow.local 257 | 258 | 259 | [scheduler] 260 | # Task instances listen for external kill signal (when you clear tasks 261 | # from the CLI or the UI), this defines the frequency at which they should 262 | # listen (in seconds). 
263 | job_heartbeat_sec = 5 264 | 265 | # The scheduler constantly tries to trigger new tasks (look at the 266 | # scheduler section in the docs for more information). This defines 267 | # how often the scheduler should run (in seconds). 268 | # 269 | # We increased this value from 5 to 60 in an attempt to reduce airflow 270 | # scheduling latency. 271 | # 272 | scheduler_heartbeat_sec = 60 273 | 274 | # after how much time (seconds) a new DAGs should be picked up from the filesystem 275 | min_file_process_interval = 0 276 | 277 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 278 | dag_dir_list_interval = 300 279 | 280 | # How often should stats be printed to the logs 281 | print_stats_interval = 30 282 | 283 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold ago (in seconds), 284 | # scheduler is considered unhealthy. 285 | # This is used by the health check in the "/health" endpoint 286 | scheduler_health_check_threshold = 30 287 | 288 | child_process_log_directory = /usr/local/airflow/logs/scheduler 289 | 290 | # Local task jobs periodically heartbeat to the DB. If the job has 291 | # not heartbeat in this many seconds, the scheduler will mark the 292 | # associated task instance as failed and will re-schedule the task. 293 | scheduler_zombie_task_threshold = 300 294 | 295 | # Turn off scheduler catchup by setting this to False. 296 | # Default behavior is unchanged and 297 | # Command Line Backfills still work, but the scheduler 298 | # will not do scheduler catchup if this is False, 299 | # however it can be set on a per DAG basis in the 300 | # DAG definition (catchup) 301 | catchup_by_default = True 302 | 303 | # This changes the batch size of queries in the scheduling main loop. 304 | # If this is too high, SQL query performance may be impacted by one 305 | # or more of the following: 306 | # - reversion to full table scan 307 | # - complexity of query predicate 308 | # - excessive locking 309 | # 310 | # Additionally, you may hit the maximum allowable query length for your db. 311 | # 312 | # Set this to 0 for no limit (not advised) 313 | max_tis_per_query = 512 314 | 315 | # Statsd (https://github.com/etsy/statsd) integration settings 316 | #statsd_on = False 317 | #statsd_host = localhost 318 | #statsd_port = 8125 319 | #statsd_prefix = airflow 320 | 321 | # The scheduler can run multiple threads in parallel to schedule dags. 322 | # This defines how many threads will run. 323 | max_threads = 8 324 | 325 | authenticate = False 326 | 327 | # Turn off scheduler use of cron intervals by setting this to False. 328 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 329 | use_job_schedule = True 330 | 331 | 332 | [kubernetes] 333 | worker_container_repository = 772681551441.dkr.ecr.us-east-1.amazonaws.com/security-token-analytics 334 | worker_container_tag = latest 335 | worker_container_image_pull_policy = Always 336 | worker_dags_folder = /usr/local/airflow/dags 337 | 338 | # If True (default), worker pods will be deleted upon termination 339 | delete_worker_pods = True 340 | 341 | # Number of Kubernetes Worker Pod creation calls per scheduler loop 342 | worker_pods_creation_batch_size = 1 343 | 344 | # The Kubernetes namespace where airflow workers should be created. 
Defaults to `default` 345 | namespace = default 346 | 347 | # The name of the Kubernetes ConfigMap Containing the Airflow Configuration (this file) 348 | #airflow_configmap = /usr/local/airflow/airflow.cfg 349 | airflow_configmap = 350 | 351 | # For docker image already contains DAGs, this is set to `True`, and the worker will search for dags in dags_folder, 352 | # otherwise use git sync or dags volume claim to mount DAGs 353 | dags_in_image = True 354 | 355 | # The name of the Kubernetes service account to be associated with airflow workers, if any. 356 | # Service accounts are required for workers that require access to secrets or cluster resources. 357 | # See the Kubernetes RBAC documentation for more: 358 | # https://kubernetes.io/docs/admin/authorization/rbac/ 359 | worker_service_account_name = default 360 | 361 | # Any image pull secrets to be given to worker pods, If more than one secret is 362 | # required, provide a comma separated list: secret_a,secret_b 363 | image_pull_secrets = 364 | 365 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 366 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 367 | # It will raise an exception if called from a process not running in a kubernetes environment. 368 | in_cluster = True 369 | 370 | # Affinity configuration as a single line formatted JSON object. 371 | # See the affinity model for top-level key names (e.g. `nodeAffinity`, etc.): 372 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core 373 | affinity = 374 | 375 | # A list of toleration objects as a single line formatted JSON array 376 | # See: 377 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core 378 | tolerations = 379 | 380 | 381 | [kubernetes_node_selectors] 382 | # The Key-value pairs to be given to worker pods. 383 | # The worker pods will be scheduled to the nodes of the specified key-value pairs. 384 | # Should be supplied in the format: key = value 385 | 386 | [kubernetes_secrets] 387 | # The scheduler mounts the following secrets into your workers as they are launched by the 388 | # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the 389 | # defined secrets and mount them as secret environment variables in the launched workers. 
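# Each entry below uses the KubernetesExecutor secret format (inferred from the entries that follow):
#   ENV_VAR_NAME = <kubernetes-secret-name>=<key-within-that-secret>
# i.e. every variable here is read from a key of the same name in the `prod-airflow` secret.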
390 | 391 | # Airflow settings 392 | AIRFLOW__CORE__AIRFLOW_HOME = prod-airflow=AIRFLOW__CORE__AIRFLOW_HOME 393 | AIRFLOW__CORE__DAGS_FOLDER = prod-airflow=AIRFLOW__CORE__DAGS_FOLDER 394 | AIRFLOW__CORE__FERNET_KEY = prod-airflow=AIRFLOW__CORE__FERNET_KEY 395 | AIRFLOW__CORE__REMOTE_LOG_CONN_ID = prod-airflow=AIRFLOW__CORE__REMOTE_LOG_CONN_ID 396 | AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER = prod-airflow=AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER 397 | AIRFLOW__CORE__ENCRYPT_S3_LOGS = prod-airflow=AIRFLOW__CORE__ENCRYPT_S3_LOGS 398 | AIRFLOW__CORE__SQL_ALCHEMY_CONN = prod-airflow=AIRFLOW__CORE__SQL_ALCHEMY_CONN 399 | AIRFLOW__KUBERNETES__DAGS_IN_IMAGE = prod-airflow=AIRFLOW__KUBERNETES__DAGS_IN_IMAGE 400 | AIRFLOW_CONN_AWS_DEFAULT = prod-airflow=AIRFLOW_CONN_AWS_DEFAULT 401 | PYTHONPATH = prod-airflow=PYTHONPATH 402 | 403 | 404 | # ethereum-etl env vars 405 | CLOUD_PROVIDER = prod-airflow=CLOUD_PROVIDER 406 | OUTPUT_BUCKET = prod-airflow=OUTPUT_BUCKET 407 | AWS_ACCESS_KEY_ID = prod-airflow=AWS_ACCESS_KEY_ID 408 | AWS_SECRET_ACCESS_KEY = prod-airflow=AWS_SECRET_ACCESS_KEY 409 | DAGS_FOLDER = prod-airflow=DAGS_FOLDER 410 | REDSHIFT_SQL_FOLDER = prod-airflow=REDSHIFT_SQL_FOLDER 411 | EXPORT_BLOCKS_AND_TRANSACTIONS = prod-airflow=EXPORT_BLOCKS_AND_TRANSACTIONS 412 | EXPORT_RECEIPTS_AND_LOGS = prod-airflow=EXPORT_RECEIPTS_AND_LOGS 413 | EXTRACT_TOKEN_TRANSFERS = prod-airflow=EXTRACT_TOKEN_TRANSFERS 414 | EXPORT_CONTRACTS = prod-airflow=EXPORT_CONTRACTS 415 | EXPORT_TOKENS = prod-airflow=EXPORT_TOKENS 416 | EXPORT_TRACES = prod-airflow=EXPORT_TRACES 417 | NOTIFICATION_EMAILS = prod-airflow=NOTIFICATION_EMAILS 418 | EXPORT_MAX_WORKERS = prod-airflow=EXPORT_MAX_WORKERS 419 | EXPORT_BATCH_SIZE = prod-airflow=EXPORT_BATCH_SIZE 420 | WEB3_PROVIDER_URI_BACKUP = prod-airflow=WEB3_PROVIDER_URI_BACKUP 421 | WEB3_PROVIDER_URI_ARCHIVAL = prod-airflow=WEB3_PROVIDER_URI_ARCHIVAL 422 | DESTINATION_DATASET_PROJECT_ID = prod-airflow=DESTINATION_DATASET_PROJECT_ID -------------------------------------------------------------------------------- /airflow/dags/redshift/dags/increment_aggregates.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from airflow import models 4 | from airflow.hooks.postgres_hook import PostgresHook 5 | from airflow.operators.python_operator import PythonOperator 6 | from datetime import datetime, timedelta 7 | from time import mktime 8 | 9 | import logging 10 | import os 11 | 12 | logging.basicConfig() 13 | logging.getLogger().setLevel(logging.DEBUG) 14 | 15 | default_dag_args = { 16 | 'depends_on_past': False, 17 | 'start_date': datetime(2015, 7, 30), 18 | 'email_on_failure': True, 19 | 'email_on_retry': True, 20 | 'retries': 5, 21 | 'retry_delay': timedelta(minutes=5) 22 | } 23 | 24 | notification_emails = os.environ.get('NOTIFICATION_EMAILS') 25 | if notification_emails and len(notification_emails) > 0: 26 | default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')] 27 | 28 | dag = models.DAG( 29 | dag_id='redshift_increment_aggregates', 30 | # Daily at 2:00am 31 | schedule_interval='0 2 * * *', 32 | concurrency=1, 33 | max_active_runs=1, 34 | default_args=default_dag_args 35 | ) 36 | 37 | sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER', "/usr/local/airflow/dags/redshift/sql") 38 | if sql_folder is None: 39 | raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable") 40 | 41 | 42 | def run_sql(ds, **kwargs): 43 | conn_id = kwargs.get('conn_id') 44 | sql_file_path = 
kwargs.get('sql_file_path') 45 | pg_hook = PostgresHook(conn_id) 46 | 47 | # Get inclusive timestamp bounds of the execution day 48 | year, month, day = map(int, ds.split('-')) 49 | start_timestamp = int(mktime(datetime(year, month, day, 0, 0, 0).timetuple())) 50 | end_timestamp = int(mktime(datetime(year, month, day, 23, 59, 59).timetuple())) 51 | 52 | with open(sql_file_path, 'r') as sql_file: 53 | sql = sql_file.read().format( 54 | start_timestamp=start_timestamp, 55 | end_timestamp=end_timestamp 56 | ) 57 | pg_hook.run(sql) 58 | 59 | 60 | def add_refresh_task(task_id, sql_file_path, dependencies=None): 61 | 62 | operator = PythonOperator( 63 | task_id=task_id, 64 | dag = dag, 65 | python_callable=run_sql, 66 | provide_context=True, 67 | op_kwargs={ 68 | 'conn_id' : 'redshift', 69 | 'sql_file_path' : sql_file_path 70 | }, 71 | ) 72 | if dependencies is not None and len(dependencies) > 0: 73 | for dependency in dependencies: 74 | if dependency is not None: 75 | dependency >> operator 76 | return operator 77 | 78 | 79 | transaction_metrics_operator = add_refresh_task( 80 | 'aggregate_transaction_metrics_by_block', 81 | sql_folder + '/increment/aggregate_transaction_metrics_by_block.sql' 82 | ) 83 | 84 | transaction_metrics_operator = add_refresh_task( 85 | 'aggregate_metrics_by_day', 86 | sql_folder + '/increment/aggregate_metrics_by_day.sql', 87 | dependencies=[transaction_metrics_operator] 88 | ) 89 | -------------------------------------------------------------------------------- /airflow/dags/redshift/dags/refresh_aggregates.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from airflow import models 4 | from airflow.hooks.postgres_hook import PostgresHook 5 | from airflow.operators.python_operator import PythonOperator 6 | from datetime import datetime, timedelta 7 | 8 | import logging 9 | import os 10 | 11 | logging.basicConfig() 12 | logging.getLogger().setLevel(logging.DEBUG) 13 | 14 | default_dag_args = { 15 | 'depends_on_past': False, 16 | 'start_date': datetime(2019, 2, 1), 17 | 'email_on_failure': True, 18 | 'email_on_retry': True, 19 | 'retries': 5, 20 | 'retry_delay': timedelta(minutes=5) 21 | } 22 | 23 | notification_emails = os.environ.get('NOTIFICATION_EMAILS') 24 | if notification_emails and len(notification_emails) > 0: 25 | default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')] 26 | 27 | dag = models.DAG( 28 | dag_id='redshift_refresh_aggregates', 29 | schedule_interval=None, 30 | concurrency=1, 31 | max_active_runs=1, 32 | default_args=default_dag_args 33 | ) 34 | 35 | sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER', "/usr/local/airflow/dags/redshift/sql") 36 | if sql_folder is None: 37 | raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable") 38 | 39 | 40 | def run_sql(**kwargs): 41 | conn_id = kwargs.get('conn_id') 42 | sql_file_path = kwargs.get('sql_file_path') 43 | pg_hook = PostgresHook(conn_id) 44 | 45 | with open(sql_file_path, 'r') as sql_file: 46 | sql = sql_file.read() 47 | pg_hook.run(sql) 48 | 49 | 50 | def add_refresh_task(task_id, sql_file_path, dependencies=None): 51 | 52 | operator = PythonOperator( 53 | task_id=task_id, 54 | dag = dag, 55 | python_callable=run_sql, 56 | provide_context=True, 57 | op_kwargs={ 58 | 'conn_id' : 'redshift', 59 | 'sql_file_path' : sql_file_path 60 | }, 61 | ) 62 | if dependencies is not None and len(dependencies) > 0: 63 | for dependency in dependencies: 64 | if dependency is not None: 
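                # '>>' sets Airflow task ordering: the dependency runs upstream of this operator.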
65 | dependency >> operator 66 | return operator 67 | 68 | 69 | transaction_metrics_operator = add_refresh_task( 70 | 'aggregate_transaction_metrics_by_block', 71 | sql_folder + '/refresh/aggregate_transaction_metrics_by_block.sql' 72 | ) 73 | 74 | transaction_metrics_operator = add_refresh_task( 75 | 'aggregate_metrics_by_day', 76 | sql_folder + '/refresh/aggregate_metrics_by_day.sql', 77 | dependencies=[transaction_metrics_operator] 78 | ) 79 | -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/increment/aggregate_metrics_by_day.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Join at the block level and aggregate by day. 4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_metrics_by_day_incr_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_metrics_by_day_incr_tmp 9 | (LIKE ethereum.aggregate_metrics_by_day); 10 | 11 | INSERT INTO ethereum.aggregate_metrics_by_day_incr_tmp 12 | SELECT 13 | DATE_TRUNC('day', TIMESTAMP 'epoch' + blocks.timestamp * INTERVAL '1 second') 14 | AS day, 15 | 16 | COUNT(DISTINCT blocks.hash) AS blocks_cnt, 17 | COUNT(DISTINCT blocks.miner) AS unique_miners, 18 | AVG(blocks.difficulty) AS median_difficulty, 19 | MAX(blocks.total_difficulty) AS cumulative_difficulty, 20 | SUM(blocks.size) AS total_blocksize_bytes, 21 | SUM(blocks.gas_used) AS gas_used, 22 | SUM(transaction_count) AS transactions_cnt_from_blocks, 23 | SUM(transactions_cnt) AS transactions_cnt, 24 | SUM(new_addresses) AS new_addresses, 25 | SUM(unique_senders) AS unique_senders, 26 | SUM(unique_receivers) AS unique_receivers, 27 | SUM(value_transferred_wei) AS value_transferred_wei, 28 | SUM(value_transferred_eth) AS value_transferred_eth, 29 | SUM(total_gas_provided) AS total_gas_provided 30 | FROM 31 | ethereum.blocks AS blocks 32 | INNER JOIN 33 | ethereum.aggregate_transaction_metrics_by_block AS transactions 34 | ON 35 | blocks.number = transactions.block_number 36 | WHERE 37 | timestamp BETWEEN {start_timestamp} AND {end_timestamp} 38 | GROUP BY 39 | day 40 | ORDER BY 41 | day DESC; 42 | 43 | 44 | BEGIN TRANSACTION; 45 | 46 | DELETE FROM ethereum.aggregate_metrics_by_day 47 | USING ethereum.aggregate_metrics_by_day_incr_tmp 48 | WHERE 49 | ethereum.aggregate_metrics_by_day.day = ethereum.aggregate_metrics_by_day_incr_tmp.day; 50 | 51 | INSERT INTO ethereum.aggregate_metrics_by_day 52 | SELECT * FROM ethereum.aggregate_metrics_by_day_incr_tmp; 53 | 54 | END TRANSACTION; 55 | 56 | DROP TABLE ethereum.aggregate_metrics_by_day_incr_tmp; -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/increment/aggregate_transaction_metrics_by_block.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Rollup our transactions to the block level. 
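-- The {start_timestamp} and {end_timestamp} placeholders below are filled in by run_sql()
-- in increment_aggregates.py with the inclusive timestamp bounds of the DAG execution day.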
4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_transaction_metrics_by_block_incr_tmp 9 | (LIKE ethereum.aggregate_transaction_metrics_by_block); 10 | 11 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block_incr_tmp 12 | SELECT 13 | transactions.block_number AS block_number, 14 | COUNT(DISTINCT transactions.hash) AS transactions_cnt, 15 | 16 | COUNT(DISTINCT 17 | CASE 18 | WHEN transactions.nonce = 0 THEN from_address 19 | ELSE NULL 20 | END 21 | ) AS new_addresses, 22 | 23 | COUNT(DISTINCT transactions.from_address) AS unique_senders, 24 | COUNT(DISTINCT transactions.to_address) AS unique_receivers, 25 | SUM(transactions.value) AS value_transferred_wei, 26 | 27 | SUM(transactions.value)::NUMERIC(32, 6) / POWER(10, 18)::NUMERIC(32, 6) 28 | AS value_transferred_eth, 29 | 30 | SUM(transactions.gas) AS total_gas_provided, 31 | AVG(transactions.gas_price) AS avg_gas_price 32 | FROM 33 | ethereum.transactions AS transactions 34 | WHERE 35 | transactions.block_number BETWEEN 36 | (SELECT MIN(number) FROM ethereum.blocks WHERE timestamp >= {start_timestamp}) 37 | AND 38 | (SELECT MAX(number) FROM ethereum.blocks WHERE timestamp <= {end_timestamp}) 39 | GROUP BY 40 | transactions.block_number 41 | ORDER BY 42 | transactions.block_number ASC; 43 | 44 | 45 | BEGIN TRANSACTION; 46 | 47 | DELETE FROM ethereum.aggregate_transaction_metrics_by_block 48 | USING ethereum.aggregate_transaction_metrics_by_block_incr_tmp 49 | WHERE 50 | ethereum.aggregate_transaction_metrics_by_block.block_number = 51 | ethereum.aggregate_transaction_metrics_by_block_incr_tmp.block_number; 52 | 53 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block 54 | SELECT * FROM ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 55 | 56 | END TRANSACTION; 57 | 58 | DROP TABLE ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 59 | -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/refresh/aggregate_metrics_by_day.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Join at the block level and aggregate by day. 
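-- Full refresh: recompute the aggregate from the entire history into a _tmp table,
-- then swap it in with the DROP + RENAME transaction at the bottom of this file.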
4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_metrics_by_day_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_metrics_by_day_tmp 9 | (LIKE ethereum.aggregate_metrics_by_day); 10 | 11 | INSERT INTO ethereum.aggregate_metrics_by_day_tmp 12 | SELECT 13 | DATE_TRUNC('day', TIMESTAMP 'epoch' + blocks.timestamp * INTERVAL '1 second') 14 | AS day, 15 | 16 | COUNT(DISTINCT blocks.hash) AS blocks_cnt, 17 | COUNT(DISTINCT blocks.miner) AS unique_miners, 18 | AVG(blocks.difficulty) AS median_difficulty, 19 | MAX(blocks.total_difficulty) AS cumulative_difficulty, 20 | SUM(blocks.size) AS total_blocksize_bytes, 21 | SUM(blocks.gas_used) AS gas_used, 22 | SUM(transaction_count) AS transactions_cnt_from_blocks, 23 | SUM(transactions_cnt) AS transactions_cnt, 24 | SUM(new_addresses) AS new_addresses, 25 | SUM(unique_senders) AS unique_senders, 26 | SUM(unique_receivers) AS unique_receivers, 27 | SUM(value_transferred_wei) AS value_transferred_wei, 28 | SUM(value_transferred_eth) AS value_transferred_eth, 29 | SUM(total_gas_provided) AS total_gas_provided 30 | FROM 31 | ethereum.blocks AS blocks 32 | INNER JOIN 33 | ethereum.aggregate_transaction_metrics_by_block AS transactions 34 | ON 35 | blocks.number = transactions.block_number 36 | GROUP BY 37 | day 38 | ORDER BY 39 | day DESC; 40 | 41 | BEGIN; 42 | DROP TABLE ethereum.aggregate_metrics_by_day; 43 | ALTER TABLE ethereum.aggregate_metrics_by_day_tmp RENAME TO aggregate_metrics_by_day; 44 | COMMIT; -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/refresh/aggregate_transaction_metrics_by_block.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Rollup our transactions to the block level. 4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_transaction_metrics_by_block_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_transaction_metrics_by_block_tmp 9 | (LIKE ethereum.aggregate_transaction_metrics_by_block); 10 | 11 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block_tmp 12 | SELECT 13 | transactions.block_number AS block_number, 14 | COUNT(DISTINCT transactions.hash) AS transactions_cnt, 15 | 16 | COUNT(DISTINCT 17 | CASE 18 | WHEN transactions.nonce = 0 THEN from_address 19 | ELSE NULL 20 | END 21 | ) AS new_addresses, 22 | 23 | COUNT(DISTINCT transactions.from_address) AS unique_senders, 24 | COUNT(DISTINCT transactions.to_address) AS unique_receivers, 25 | SUM(transactions.value) AS value_transferred_wei, 26 | 27 | SUM(transactions.value)::NUMERIC(32, 6) / POWER(10, 18)::NUMERIC(32, 6) 28 | AS value_transferred_eth, 29 | 30 | SUM(transactions.gas) AS total_gas_provided, 31 | AVG(transactions.gas_price) AS avg_gas_price 32 | FROM 33 | ethereum.transactions AS transactions 34 | GROUP BY 35 | transactions.block_number 36 | ORDER BY 37 | transactions.block_number ASC; 38 | 39 | BEGIN; 40 | DROP TABLE ethereum.aggregate_transaction_metrics_by_block; 41 | ALTER TABLE ethereum.aggregate_transaction_metrics_by_block_tmp RENAME TO aggregate_transaction_metrics_by_block; 42 | COMMIT; 43 | -------------------------------------------------------------------------------- /airflow/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD="airflow" 4 | TRY_LOOP="${TRY_LOOP:-10}" 5 | POSTGRES_HOST="${POSTGRES_HOST:-postgres}" 6 | POSTGRES_PORT=5432 7 | POSTGRES_CREDS="${POSTGRES_CREDS:-airflow:airflow}" 8 | AIRFLOW_URL_PREFIX="${AIRFLOW_URL_PREFIX:-}" 9 | 10 | sed -i "s/{{ 
POSTGRES_HOST }}/${POSTGRES_HOST}/" ${AIRFLOW_HOME}/airflow.cfg 11 | sed -i "s/{{ POSTGRES_CREDS }}/${POSTGRES_CREDS}/" ${AIRFLOW_HOME}/airflow.cfg 12 | sed -i "s#{{ AIRFLOW_URL_PREFIX }}#${AIRFLOW_URL_PREFIX}#" ${AIRFLOW_HOME}/airflow.cfg 13 | 14 | # ethereum-etl 15 | export CLOUD_PROVIDER="aws" 16 | export OUTPUT_BUCKET="insight-prod-ethereum-etl-output" 17 | export EXPORT_BLOCKS_AND_TRANSACTIONS=True 18 | export EXPORT_RECEIPTS_AND_LOGS=True 19 | export EXTRACT_TOKEN_TRANSFERS=True 20 | export EXPORT_CONTRACTS=True 21 | export EXPORT_TOKENS=True 22 | export EXPORT_TRACES=False 23 | export NOTIFICATION_EMAILS="webster@iteriodata.com" 24 | export EXPORT_MAX_WORKERS=4 25 | export EXPORT_BATCH_SIZE=10 26 | export WEB3_PROVIDER_URI_BACKUP="https://mainnet.infura.io" 27 | export WEB3_PROVIDER_URI_ARCHIVAL="https://mainnet.infura.io" 28 | export DESTINATION_DATASET_PROJECT_ID="test" 29 | export DAGS_FOLDER="/usr/local/airflow/dags/ethereum-etl-airflow/dags" 30 | export PYTHONPATH="${PYTHONPATH}:/${DAGS_FOLDER}" 31 | 32 | 33 | # Install custom python package if requirements.txt is present 34 | if [ -e "/requirements.txt" ]; then 35 | $(which pip) install --user -r /requirements.txt 36 | fi 37 | 38 | # wait for postgres 39 | sleep 60 40 | #if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then 41 | # i=0 42 | # while ! nc $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do 43 | # i=`expr $i + 1` 44 | # if [ $i -ge $TRY_LOOP ]; then 45 | # echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up" 46 | # exit 1 47 | # fi 48 | # echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP" 49 | # sleep 5 50 | # done 51 | 52 | case "$1" in 53 | webserver) 54 | sleep 10 55 | echo "Initialize database..." 56 | # # TODO: move to a Helm hook 57 | # https://github.com/kubernetes/helm/blob/master/docs/charts_hooks.md 58 | $CMD initdb 59 | # https://github.com/apache/airflow/blob/master/docs/security.rst 60 | airflow users --create --username admin --password password --role Admin --email webster@iteriodata.com --firstname Webster --lastname Cook 61 | exec $CMD webserver 62 | ;; 63 | worker) 64 | # To give the webserver time to run initdb. 65 | sleep 30 66 | exec $CMD "$@" 67 | ;; 68 | scheduler) 69 | # To give the webserver time to run initdb. 70 | sleep 30 71 | # Via Tobias Kaymak 72 | # https://github.com/puckel/docker-airflow/issues/55 73 | while echo "Running Scheduler"; do 74 | # See https://airflow.apache.org/cli.html#scheduler 75 | airflow scheduler 76 | exitcode=$? 77 | if [ $exitcode -ne 0 ]; then 78 | echo "ERROR: Scheduler exited with exit code $exitcode." 79 | echo $(date) 80 | exit $exitcode 81 | fi 82 | sleep 30 83 | done 84 | ;; 85 | *) 86 | # The command is something like bash, not an airflow subcommand. Just run it in the right environment. 87 | exec "$@" 88 | ;; 89 | esac 90 | -------------------------------------------------------------------------------- /docs/challenges.md: -------------------------------------------------------------------------------- 1 | 2 | ## Engineering Challenges 3 | 4 | #### Scalability / Resource Utilization 5 | 6 | Exporting historical blockchain data requires scaling the Airflow / Kubernetes 7 | cluster. On our first run, utilizing 3 x m4.large instances, it took 4 days to 8 | export the Ethereum blockchain at a rate of 10 GB / hour.
The Kubernetes nodes 9 | were fully utilizing CPU resources during that time: 10 | 11 | ![eks_worker_no_autoscaling](img/eks_worker_no_autoscaling.png) 12 | 13 | This was handled by scaling the Kubernetes cluster and increasing concurrency 14 | of Airflow workers. The following parameters were modified in the Airflow 15 | configuration: 16 | 17 | ``` 18 | # The amount of parallelism as a setting to the executor. This defines 19 | # the max number of task instances that should run simultaneously 20 | # on this airflow installation 21 | parallelism = 128 22 | 23 | # The number of task instances allowed to run concurrently by the scheduler 24 | dag_concurrency = 128 25 | 26 | # The maximum number of active DAG runs per DAG 27 | max_active_runs_per_dag = 128 28 | ``` 29 | 30 | In addition, a "target tracking" EC2 autoscaling policy was implemented using Terraform: 31 | 32 | ``` 33 | # 34 | # Scales the EKS Worker autoscaling group based on CPU utilization 35 | # 36 | resource "aws_autoscaling_policy" "eks_workers" { 37 | name = "${var.project}-${var.environment}-eks-worker-cpu-target-tracking" 38 | autoscaling_group_name = "${element(module.eks.workers_asg_names, 0)}" 39 | adjustment_type = "ChangeInCapacity" 40 | policy_type = "TargetTrackingScaling" 41 | estimated_instance_warmup = 300 42 | 43 | target_tracking_configuration { 44 | predefined_metric_specification { 45 | predefined_metric_type = "ASGAverageCPUUtilization" 46 | } 47 | target_value = 70.0 48 | } 49 | } 50 | 51 | ``` 52 | 53 | This mechanism was used to temporarily scale from 3 to 10 EC2 instances while 54 | exporting the historical data. In this case it took 10 hours at 100 GB / hour. 55 | This chart shows the ramp-up in CPU usage followed by a drop when the scaling 56 | occurs: 57 | 58 | ![eks_worker_autoscaling](img/eks_worker_autoscaling.png) 59 | 60 | Once the historical data is exported, the additional nodes are removed from the 61 | cluster to reduce costs. 62 | 63 | The latency of the scheduler made it difficult to reach the CPU target of 70% 64 | (i.e. the scheduler couldn't launch tasks quickly enough). Apparently the latency 65 | can be decreased by changing config settings such as `scheduler_heartbeat_sec`, 66 | but this requires additional testing. 67 | 68 | Another open problem is having the Airflow scheduler back off if spawning 69 | additional tasks will exceed the cluster's resource capacity. Running a large 70 | number of idle nodes will lead to unacceptable costs. Furthermore, the 71 | Kubernetes API appears to drop requests when cluster resources are exhausted. 72 | This can cause issues when additional nodes are attempting to join the cluster. 73 | 74 | 75 | #### DAG deployments 76 | 77 | There are operational trade-offs associated with the DAG deployment process. 78 | The following methodologies are most prevalent: 79 | 80 | 1. Syncing a shared volume with remote storage such as git or S3 81 | 2. "Pre-baked" DAGs deployed w/ the airflow container. 82 | 83 | Our implementation uses "pre-baked" DAGs and the `dags_in_image` configuration 84 | option. The motivation here is to avoid errors related to syncing files at 85 | runtime. The downside is that this approach is unlikely to scale as well in 86 | large organizations with thousands of DAGs.
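One way to sanity-check a pre-baked image (a hypothetical snippet, not part of this repo) is to load the DagBag inside the container and fail fast on import errors:

```
# Hypothetical check, run inside the Airflow image, e.g.:
#   docker run --rm <image> python /tmp/check_dags.py
# It confirms that the DAGs baked into the image import cleanly.
from airflow.models import DagBag

dag_bag = DagBag(dag_folder="/usr/local/airflow/dags", include_examples=False)

if dag_bag.import_errors:
    raise SystemExit("DAG import errors: {}".format(dag_bag.import_errors))

print("Loaded DAGs: {}".format(sorted(dag_bag.dag_ids)))
```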
87 | 88 | 89 | #### DAG task visibility 90 | 91 | The initial version of this project used the 92 | [KubernetesPodOperator](https://airflow.apache.org/kubernetes.html?highlight=kubernetes%20pod%20operator#airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator) 93 | to spin up ethereum-etl containers. The intention was to avoid mixing code for 94 | orchestration and blockchain data export: with a specialized container for 95 | exporting data from each blockchain (Bitcoin, Ethereum, Monero, etc.), 96 | dependencies for each blockchain export would be managed separately and debugging 97 | would be simplified. 98 | 99 | Although this approach could allow a large development team to collaborate more 100 | easily, the downside is that Airflow has limited visibility into the processes 101 | running within the container. For example, a container could run 3 steps of 102 | the data export and the 2nd could fail, but it could still show up in Airflow 103 | as successful and not trigger any alerts. 104 | 105 | Since the ethereum-etl container didn't have clear success/failure exit 106 | statuses that would bubble up to Airflow, it made more sense to 107 | [fork](https://github.com/iter-io/ethereum-etl-airflow) the ethereum-etl-airflow 108 | project and run tasks using the PythonOperator / KubernetesExecutor. This 109 | task visibility trade-off also motivated the use of raw SQL for updating Redshift 110 | tables instead of using [DBT](https://www.getdbt.com/). 111 | 112 | 113 | #### Incremental updates in Redshift 114 | 115 | Our use case requires joining and aggregating time series data to create metrics 116 | for end users. For example, we need to aggregate blocks and transactions then 117 | join the result. This is expensive because the transactions table is ~400 million 118 | rows and growing. Using a single "dense storage" Redshift node (ds2.xlarge), it 119 | takes about an hour to aggregate and join the full historical dataset. 120 | 121 | We cannot cost-effectively do a full refresh on these data models many times per 122 | day, so it's necessary to incrementally update them as the data is ingested. 123 | 124 | Initially, the incremental update on one day of data had a runtime of ~20 minutes 125 | due to a sequential scan on the transactions table. This would grow linearly 126 | with the size of the table. The performance of this query was improved by selecting 127 | appropriate sortkeys for the blocks and transactions tables. The `timestamp` column 128 | is used as a sortkey on the blocks table and `block_number` is used as a sortkey 129 | on the transactions table. After reloading the tables with these sortkeys in place, 130 | the incremental update on one day of data had a runtime of 2 minutes (10x improvement). 131 | 132 | ![runtime daily incremental update](img/runtime_of_daily_incremental_update.png) 133 | 134 | ![range restricted scan](img/range_restricted_scan.png) 135 | 136 | 137 | #### Data security 138 | 139 | There is a growing demand for joining public blockchain data on proprietary 140 | datasets. As a result, security features must be sufficient for protecting 141 | proprietary data stored on the platform. 142 | 143 | During the initial setup of Kubernetes / EKS, we identified the need to 144 | place some Kubernetes services in a public subnet with load balancers 145 | accessible from the public internet (e.g. ethereum nodes), while other 146 | services should run within a private subnet with internal load balancers 147 | (e.g. Airflow).
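As a rough sketch (not code from this repo; the service name and annotation value are assumptions), such an internal-load-balancer annotation can be applied with the kubernetes Python client that is already installed in the Airflow image:

```
# Hedged sketch: annotate a Service so AWS provisions an internal load balancer.
# "airflow-webserver" and the annotation value are assumptions; the real manifests
# live under k8s/services/.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod

patch = {"metadata": {"annotations": {
    "service.beta.kubernetes.io/aws-load-balancer-internal": "0.0.0.0/0"
}}}

client.CoreV1Api().patch_namespaced_service(
    name="airflow-webserver", namespace="default", body=patch
)
```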
148 | 
149 | The following references were used when setting up the annotations that
150 | select a subnet:
151 | 
152 | * https://github.com/kubernetes/kubernetes/pull/22064/files#diff-07ba008af9c76b0539556ff7fde3105eR62
153 | * https://github.com/terraform-aws-modules/terraform-aws-eks/issues/15
154 | 
155 | We specifically expect security issues related to protecting our network
156 | while receiving data on our blockchain nodes. Running these nodes will
157 | announce our endpoints to potential adversaries. In the event one of our
158 | nodes is compromised, we must control access to services such as S3 and
159 | the Kubernetes cluster management API. These policies are implemented
160 | using Kubernetes RBAC and AWS IAM role configuration.
161 | 
162 | 
163 | 
-------------------------------------------------------------------------------- /docs/getting_started.md: --------------------------------------------------------------------------------
1 | ## Getting Started
2 | 
3 | 
4 | #### Step #1 - Install Docker
5 | 
6 | Mac OS X:
7 | 1. `brew install docker`
8 | 
9 | Ubuntu Linux:
10 | 1. Install [Docker CE](https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/)
11 | 
12 | 2. To use docker without root privileges, create a docker group:
13 | 
14 | ```bash
15 | sudo groupadd docker
16 | ```
17 | 
18 | Then add your user to this group:
19 | ```bash
20 | sudo usermod -aG docker $USER
21 | ```
22 | 
23 | You can read more in the [official docker documentation](https://docs.docker.com/install/linux/linux-postinstall/#manage-docker-as-a-non-root-user).
24 | 
25 | 3. Log out and then log in again to re-evaluate your groups.
26 | 
27 | 
28 | #### Step #2 - Install the aws-cli
29 | 
30 | 
31 | 1. Install the [AWS-CLI](http://docs.aws.amazon.com/cli/latest/userguide/awscli-install-linux.html).
32 | 
33 | 2. Download your AWS access key ID and secret access key from the IAM console.
34 | 
35 | 3. Run `aws configure` and input your credentials.
36 | 
37 | 4. Now you can use your Docker client to authenticate with [AWS ECR](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html):
38 | 
39 | ```bash
40 | eval $(aws ecr get-login --no-include-email)
41 | ```
42 | 
43 | You should also add this alias to your bash profile:
44 | ```bash
45 | alias ecrlogin='eval $(aws ecr get-login --no-include-email)'
46 | ```
47 | 
48 | In the future you can authenticate yourself with ECR using this command:
49 | ```bash
50 | ecrlogin
51 | ```
52 | 
53 | 
54 | #### Step #3 - Install Terraform
55 | 
56 | Follow the [getting started guide](https://learn.hashicorp.com/terraform/getting-started/install.html#installing-terraform)
57 | from Hashicorp.
58 | 
59 | 
60 | #### Step #4 - Install kubectl
61 | 
62 | Follow the [task doc](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
63 | on kubernetes.io.
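
Before moving on, it is worth confirming that each tool is actually on your `PATH`; a quick sanity check (reported versions will vary) looks like this:

```bash
# Verify the toolchain installed in Steps 1-4
docker --version
aws --version
terraform version
kubectl version --client
```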
64 | 
65 | 
66 | #### Step #5 - Use Terraform to provision the base environment and prod network
67 | 
68 | Apply the following Terraform configs:
69 | 
70 | * environments
71 | * base
72 | * ops
73 | * [ecr](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/base/ops/ecr/ecr.tf)
74 | * prod
75 | * compute
76 | * [eks](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/compute/eks/eks.tf)
77 | * network
78 | * [bastion](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/bastion/bastion.tf)
79 | * [vpc](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/vpc/vpc.tf)
80 | 
81 | Here are example commands for the EKS config:
82 | 
83 | ```
84 | cd terraform/environments/prod/compute/eks
85 | terraform init
86 | terraform apply
87 | ```
88 | 
89 | Repeat this for each of the configs above.
90 | 
91 | 
92 | #### Step #6 - Set up EKS / Kubernetes
93 | 
94 | Follow the [setup guide](https://github.com/iter-io/security-token-analytics/blob/master/k8s/README.md)
95 | in this repo.
96 | 
97 | 
98 | #### Step #7 - Build and deploy the Airflow container
99 | 
100 | Run `make build_docker` in the project root.
101 | 
102 | 
103 | #### Step #8 - Use Terraform to provision the prod databases and services
104 | 
105 | * environments
106 | * prod
107 | * data
108 | * [rds-airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-airflow/rds-airflow.tf)
109 | * [rds-grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-grafana/rds-grafana.tf)
110 | * [rds-superset](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-superset/rds-superset.tf)
111 | * [redshift](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/redshift/redshift.tf)
112 | * services
113 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/airflow/airflow.tf)
114 | * [grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/grafana/grafana.tf)
115 | 
116 | 
117 | #### Step #9 - Create the schema and users in Redshift
118 | 
119 | 1. Use psql to create the Redshift schemas:
120 | 
121 | * [ethereum schema](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/resources/stages/raw/schemas_redshift/schema.sql)
122 | * [3rd party schemas](https://github.com/iter-io/security-token-analytics/tree/master/redshift/schema)
123 | 
124 | 2. Use psql to run [users.sql](https://github.com/iter-io/security-token-analytics/blob/master/redshift/users.sql)
125 | to set up the Redshift user accounts.
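
For example, the 3rd party schemas and user accounts can be applied from the repo root as shown below. The cluster endpoint, database name, and master user are placeholders; use the values from your Redshift Terraform config and Parameter Store, and psql will prompt for the password:

```bash
# Placeholders: substitute your own Redshift endpoint, database, and master user
psql -h <redshift-cluster-endpoint> -p 5439 -U <master-username> -d <database> \
    -f redshift/schema/coinmarketcap.sql
psql -h <redshift-cluster-endpoint> -p 5439 -U <master-username> -d <database> \
    -f redshift/users.sql
```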
-------------------------------------------------------------------------------- /docs/img/dags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/dags.png -------------------------------------------------------------------------------- /docs/img/eks_worker_autoscaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/eks_worker_autoscaling.png -------------------------------------------------------------------------------- /docs/img/eks_worker_no_autoscaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/eks_worker_no_autoscaling.png -------------------------------------------------------------------------------- /docs/img/grafana_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/grafana_dashboard.png -------------------------------------------------------------------------------- /docs/img/kubernetes_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/kubernetes_dashboard.png -------------------------------------------------------------------------------- /docs/img/range_restricted_scan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/range_restricted_scan.png -------------------------------------------------------------------------------- /docs/img/runtime_of_daily_incremental_update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/runtime_of_daily_incremental_update.png -------------------------------------------------------------------------------- /docs/img/tech_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/tech_stack.png -------------------------------------------------------------------------------- /docs/tech_stack.md: -------------------------------------------------------------------------------- 1 | 2 | ## Tech Stack 3 | 4 | #### Kubernetes / EKS 5 | 6 | Kubernetes provides cluster orchestration services for running the apps in our 7 | data pipeline. This allows us to scale out ETL workloads and maintain availability 8 | of services such as the Airflow scheduler and webserver. 9 | 10 | We utilize [Amazon's EKS service](https://aws.amazon.com/eks/) for a managed 11 | Kubernetes control plane. 
The Kubernetes cluster is scaled using an [EC2
12 | Autoscaling Group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/AutoScalingGroup.html)
13 | in conjunction with a [scaling policy](https://docs.aws.amazon.com/autoscaling/ec2/userguide/scaling_plan.html).
14 | Our cluster scales in response to high CPU usage and attempts to maintain a target
15 | utilization of 70% by adding and removing nodes from the cluster.
16 | 
17 | 
18 | #### Airflow
19 | 
20 | Airflow serves as a scheduler for coordinating the following pipeline tasks:
21 | 
22 | 1. Running tasks that export data from Ethereum nodes using the JSON RPC API and
23 | then upload the output to S3.
24 | 2. Loading output data from S3 into Redshift using the COPY command.
25 | 3. Executing SQL in Redshift to update our data models.
26 | 
27 | The Dockerfile in the root of this repository is used to build a Docker image
28 | for Airflow that contains all dependencies and DAGs. This same image is used for
29 | the scheduler, webserver, and workers.
30 | 
31 | The Airflow scheduler and webserver are deployed on Kubernetes to make them
32 | highly available. In the event one of these processes fails, Kubernetes will
33 | launch new pods in an attempt to keep them running. This setup could also be
34 | used to scale out the webserver service if necessary in a large organization.
35 | 
36 | A Postgres RDS instance is used as the backend database for Airflow, so our
37 | scheduler and webserver containers are stateless and can be redeployed as
38 | needed.
39 | 
40 | Airflow workers are each launched in their own pod using the [Kubernetes
41 | Executor](https://airflow.readthedocs.io/en/stable/kubernetes.html). Due to
42 | the experimental status of Airflow's Kubernetes functionality, we are building
43 | our image directly from the Airflow master branch (instead of a tagged release).
44 | 
45 | 
46 | #### go-ethereum
47 | 
48 | [go-ethereum](https://github.com/ethereum/go-ethereum/wiki/Geth) is an Ethereum
49 | implementation written in Go. We run a "full archive node" containing a complete
50 | history of blockchain transactions. Then we export data via the
51 | [JSON RPC API](https://github.com/ethereum/wiki/wiki/JSON-RPC). The goal of
52 | using Kubernetes to run go-ethereum is to scale the data export process by storing
53 | multiple copies of the blockchain and load balancing the JSON RPC API.
54 | 
55 | 
56 | #### blockchain-etl
57 | 
58 | We utilize code from the [ethereum-etl project](https://github.com/blockchain-etl)
59 | to export raw data from the blockchain. We [forked](https://github.com/iter-io/ethereum-etl-airflow)
60 | the project's Airflow DAGs for GCP and added support for using them with S3 and
61 | Redshift on AWS. This minimized time to market by leveraging an existing
62 | high-quality codebase (credit goes to [Evgeny Medvedev](https://github.com/medvedev1088)
63 | and [Allen Day](https://github.com/allenday)).
64 | 
65 | 
66 | #### S3
67 | S3 serves as a ["data lake"](https://aws.amazon.com/big-data/datalakes-and-analytics/what-is-a-data-lake/)
68 | for storing exported blockchain data. Data is loaded from S3 into Redshift using
69 | the [COPY command](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html).
70 | 
71 | 
72 | #### Redshift
73 | 
74 | Redshift is used as a [data warehouse](https://aws.amazon.com/data-warehouse/).
75 | This allows us to build useful data models and run interactive queries on
76 | them. Redshift was chosen because the columnar storage format is well-suited
77 | for aggregate queries on the historical data. We expect this will be an
78 | important access pattern.
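
The aggregate data models themselves are defined as raw SQL under `airflow/dags/redshift/sql/`. As an illustrative sketch only (the table and column names below follow the ethereum-etl layout but are simplified, not the exact production schema), the kind of daily rollup this design targets looks like:

```sql
-- Illustrative daily rollup joining blocks and transactions.
-- The real models live in airflow/dags/redshift/sql/refresh/ and .../increment/.
SELECT
    DATE_TRUNC('day', b."timestamp") AS day,
    COUNT(t.hash)                    AS transaction_count,
    SUM(t.value)                     AS total_value_transferred
FROM ethereum.blocks b
JOIN ethereum.transactions t
    ON t.block_number = b.number
GROUP BY 1
ORDER BY 1;
```

Because `timestamp` and `block_number` are the sortkeys on the blocks and transactions tables, the incremental variant of this query can restrict both scans to a narrow range instead of reading the full history.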
79 | 
80 | 
81 | #### Grafana
82 | 
83 | Grafana is used for data visualization and presentation. We expect our end
84 | users to be familiar with SQL, so, starting from our examples, they should be able
85 | to combine simple SQL queries into shareable dashboards. The goal is to build a
86 | "self-service" system where end users can assist us in identifying important
87 | metrics and improving the Redshift data models.
88 | 
89 | 
90 | #### Terraform
91 | 
92 | Terraform modules are used to provision the following AWS resources:
93 | 
94 | 1. VPC components
95 | 2. Bastion server
96 | 3. EKS cluster
97 | 4. S3 buckets
98 | 5. RDS instances
99 | 6. Redshift cluster
100 | 
101 | 
102 | #### Security
103 | 
104 | AWS IAM, VPC security groups, and Kubernetes RBAC Authorization are used to
105 | protect AWS and Kubernetes resources. The [AWS Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-paramstore.html)
106 | and [Kubernetes secrets](https://kubernetes.io/docs/concepts/configuration/secret/)
107 | are used to protect credentials and provide them to containers as environment
108 | variables at runtime.
-------------------------------------------------------------------------------- /k8s/README.md: --------------------------------------------------------------------------------
1 | ![high level architecture](../docs/img/kubernetes_dashboard.png)
2 | 
3 | ## Kubernetes setup
4 | 
5 | 1. Set up the AWS IAM authenticator:
6 | 
7 | https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html
8 | 
9 | 2. Create a kubeconfig:
10 | 
11 | https://docs.aws.amazon.com/eks/latest/userguide/create-kubeconfig.html
12 | 
13 | 3. Download the aws-auth config map template:
14 | 
15 | `curl -O https://amazon-eks.s3-us-west-2.amazonaws.com/cloudformation/2019-01-09/aws-auth-cm.yaml`
16 | 
17 | Edit the aws-auth-cm.yaml file by adding the Role ARN of the EC2 worker instances.
18 | 
19 | 4. Run our k8s setup shell script:
20 | 
21 | `source k8s/setup.sh`
22 | 
23 | 5. Run the proxy:
24 | 
25 | `kubectl --context= proxy`
26 | 
27 | 6. Open the Kubernetes dashboard login URL in your browser:
28 | 
29 | http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/#!/login
30 | 
31 | 7. Insert the token printed by setup.sh and click login.
32 | 
33 | 8. To access internal services such as the Airflow web UI, set up a SOCKS proxy
34 | using an alias such as the following:
35 | 
36 | `alias insocks='ssh -D 8123 -f -C -q -N inprodbastion'`
37 | 
38 | 9.
For debugging purposes you can ssh from your local system through the bastion 39 | server to access any of the k8s nodes: 40 | 41 | `ssh -J inprodbastion ec2-user@ip-10-0-102-76.ec2.internal` 42 | 43 | For ease of use setup an alias such as: 44 | 45 | `function jumpto() { ssh -J inprodbastion ec2-user@$1; }` 46 | -------------------------------------------------------------------------------- /k8s/config_maps/aws-auth-cm.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: aws-auth 5 | namespace: kube-system 6 | data: 7 | mapRoles: | 8 | - rolearn: 9 | username: system:node:{{EC2PrivateDNSName}} 10 | groups: 11 | - system:bootstrappers 12 | - system:nodes 13 | -------------------------------------------------------------------------------- /k8s/service_accounts/eks-admin-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: eks-admin 5 | namespace: kube-system 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1beta1 8 | kind: ClusterRoleBinding 9 | metadata: 10 | name: eks-admin 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: cluster-admin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: eks-admin 18 | namespace: kube-system -------------------------------------------------------------------------------- /k8s/services/airflow-webserver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prod-airflow-webserver 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: prod-airflow-webserver 11 | ports: 12 | - name: webserver 13 | protocol: TCP 14 | port: 8080 15 | targetPort: webserver 16 | nodePort: 32080 17 | type: LoadBalancer 18 | -------------------------------------------------------------------------------- /k8s/services/go-ethereum.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: go-ethereum 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: go-ethereum 11 | tier: go-ethereum 12 | ports: 13 | - protocol: TCP 14 | port: 8545 15 | targetPort: 8545 16 | type: LoadBalancer 17 | -------------------------------------------------------------------------------- /k8s/services/grafana.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: grafana 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-type: nlb 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: grafana 11 | tier: grafana 12 | ports: 13 | - protocol: TCP 14 | port: 3000 15 | targetPort: 3000 16 | type: LoadBalancer 17 | -------------------------------------------------------------------------------- /k8s/services/superset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: superset 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: superset 11 | tier: superset 12 | ports: 13 | - protocol: TCP 14 | port: 80 15 | targetPort: 8088 16 
| type: LoadBalancer
-------------------------------------------------------------------------------- /k8s/setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Update the local kube config
4 | aws eks update-kubeconfig --name insight-prod-cluster
5 | 
6 | # Apply the auth config map so EC2 instances can join our cluster as worker nodes
7 | kubectl apply -f aws-auth-cm.yaml
8 | 
9 | # Kubernetes Dashboard
10 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v1.10.1/src/deploy/recommended/kubernetes-dashboard.yaml
11 | 
12 | # Heapster
13 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/heapster/master/deploy/kube-config/influxdb/heapster.yaml
14 | 
15 | # InfluxDB
16 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/heapster/master/deploy/kube-config/influxdb/influxdb.yaml
17 | 
18 | # Admin service account
19 | kubectl apply -f eks-admin-service-account.yaml
20 | 
21 | # TODO: Figure out how we can parse the token from this command
22 | #EKS_ADMIN_AUTH_TOKEN=$()
23 | kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep eks-admin | awk '{print $1}')
24 | 
25 | # Create the superset config map
26 | kubectl create configmap superset-config --from-file=superset/superset_config.py
27 | 
-------------------------------------------------------------------------------- /redshift/check_load_errors.sql: --------------------------------------------------------------------------------
1 | 
2 | select
3 |   starttime,
4 |   query,
5 |   filename as filename,
6 |   line_number as line,
7 |   colname as column,
8 |   type,
9 |   position as pos,
10 |   raw_line as line_text,
11 |   raw_field_value as field_text,
12 |   err_reason as reason
13 | from stl_load_errors
14 | order by starttime desc
15 | limit 200;
16 | 
-------------------------------------------------------------------------------- /redshift/schema/coinmarketcap.sql: --------------------------------------------------------------------------------
1 | 
2 | --
3 | -- Data Source: https://coinmarketcap.com/currencies/ethereum/historical-data/
4 | --
5 | 
6 | CREATE SCHEMA IF NOT EXISTS coinmarketcap;
7 | 
8 | DROP TABLE IF EXISTS coinmarketcap.ethereum_usd_price_history;
9 | 
10 | CREATE TABLE coinmarketcap.ethereum_usd_price_history (
11 |   day TIMESTAMP NOT NULL,
12 |   "open" NUMERIC(38, 6) NOT NULL,
13 |   high NUMERIC(38, 6) NOT NULL,
14 |   low NUMERIC(38, 6) NOT NULL,
15 |   close NUMERIC(38, 6) NOT NULL,
16 |   volume NUMERIC(38, 6) NOT NULL,
17 |   market_cap NUMERIC(38, 6) NOT NULL,
18 |   PRIMARY KEY (day)
19 | )
20 | DISTSTYLE ALL
21 | SORTKEY (day);
22 | 
23 | 
-------------------------------------------------------------------------------- /redshift/schema/coinmetrics.sql: --------------------------------------------------------------------------------
1 | 
2 | --
3 | -- Data Source: https://coinmetrics.io/data-downloads/
4 | --
5 | 
6 | CREATE SCHEMA IF NOT EXISTS coinmetrics;
7 | 
8 | DROP TABLE IF EXISTS coinmetrics.ethereum_usd_price_history;
9 | 
10 | CREATE TABLE coinmetrics.ethereum_usd_price_history (
11 |   day TIMESTAMP NOT NULL,
12 |   tx_volume_usd NUMERIC(38, 6) NOT NULL,
13 |   adjusted_tx_volume_usd NUMERIC(38, 6) NOT NULL,
14 |   tx_count BIGINT NOT NULL,
15 |   marketcap_usd NUMERIC(38, 6) NOT NULL,
16 |   price_usd NUMERIC(38, 6) NOT NULL,
17 |   exchange_volume_usd NUMERIC(38, 6) NOT NULL,
18 |   generated_coins NUMERIC(38, 6) NOT NULL,
19 |   fees NUMERIC(38, 6) NOT NULL,
20 |   active_addresses BIGINT NOT NULL,
21 |   median_tx_value_usd NUMERIC(38, 6) NOT NULL, 22
median_fee NUMERIC(38, 6) NOT NULL, 23 | average_difficulty NUMERIC(38, 6) NOT NULL, 24 | payment_count BIGINT NOT NULL, 25 | block_size BIGINT NOT NULL, 26 | block_count BIGINT NOT NULL, 27 | nvt NUMERIC(38, 6) NOT NULL, 28 | PRIMARY KEY (day) 29 | ) 30 | DISTSTYLE ALL 31 | SORTKEY (day); -------------------------------------------------------------------------------- /redshift/schema/multipl.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Data Source: http://www.multpl.com/shiller-pe/ 4 | -- 5 | -- Stock Market Data Used in "Irrational Exuberance" Princeton University Press, 2000, 2005, 2015, updated 6 | -- Robert J. Shiller 7 | -- 8 | 9 | CREATE SCHEMA IF NOT EXISTS multpl; 10 | 11 | DROP TABLE IF EXISTS multpl.shiller_pe; 12 | 13 | CREATE TABLE multpl.shiller_pe ( 14 | month TIMESTAMP NOT NULL, 15 | value NUMERIC(8, 2) NOT NULL, 16 | PRIMARY KEY (month) 17 | ) 18 | DISTSTYLE ALL 19 | SORTKEY (month); 20 | -------------------------------------------------------------------------------- /redshift/users.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE SCHEMA IF NOT EXISTS ethereum; 3 | 4 | -- Users with write privileges 5 | CREATE USER airflow WITH PASSWORD 'md5cf665ef3f22dbdbac3d814f411289983'; 6 | 7 | GRANT ALL ON SCHEMA coinmarketcap TO airflow; 8 | GRANT ALL ON ALL TABLES IN SCHEMA coinmarketcap TO airflow; 9 | 10 | GRANT ALL ON SCHEMA coinmetrics TO airflow; 11 | GRANT ALL ON ALL TABLES IN SCHEMA coinmetrics TO airflow; 12 | 13 | GRANT ALL ON SCHEMA ethereum TO airflow; 14 | GRANT ALL ON ALL TABLES IN SCHEMA ethereum TO airflow; 15 | 16 | GRANT ALL ON SCHEMA multpl TO airflow; 17 | GRANT ALL ON ALL TABLES IN SCHEMA multpl TO airflow; 18 | 19 | GRANT ALL ON SCHEMA public TO airflow; 20 | GRANT ALL ON ALL TABLES IN SCHEMA public TO airflow; 21 | 22 | -- Group with read-only privileges 23 | CREATE GROUP read_only; 24 | 25 | REVOKE ALL ON SCHEMA coinmarketcap FROM GROUP read_only; 26 | REVOKE ALL ON SCHEMA coinmetrics FROM GROUP read_only; 27 | REVOKE ALL ON SCHEMA ethereum FROM GROUP read_only; 28 | REVOKE ALL ON SCHEMA multpl FROM GROUP read_only; 29 | REVOKE ALL ON SCHEMA public FROM GROUP read_only; 30 | 31 | GRANT SELECT ON ALL TABLES IN SCHEMA coinmarketcap TO GROUP read_only; 32 | GRANT SELECT ON ALL TABLES IN SCHEMA coinmetrics TO GROUP read_only; 33 | GRANT SELECT ON ALL TABLES IN SCHEMA ethereum TO GROUP read_only; 34 | GRANT SELECT ON ALL TABLES IN SCHEMA multpl TO GROUP read_only; 35 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO GROUP read_only; 36 | 37 | GRANT USAGE ON SCHEMA coinmarketcap TO GROUP read_only; 38 | GRANT USAGE ON SCHEMA coinmetrics TO GROUP read_only; 39 | GRANT USAGE ON SCHEMA ethereum TO GROUP read_only; 40 | GRANT USAGE ON SCHEMA multpl TO GROUP read_only; 41 | GRANT USAGE ON SCHEMA public TO GROUP read_only; 42 | 43 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA coinmarketcap GRANT SELECT ON TABLES TO GROUP read_only; 44 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA coinmetrics GRANT SELECT ON TABLES TO GROUP read_only; 45 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA ethereum GRANT SELECT ON TABLES TO GROUP read_only; 46 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA multpl GRANT SELECT ON TABLES TO GROUP read_only; 47 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA public GRANT SELECT ON TABLES TO GROUP read_only; 48 | 49 | -- Read-only users for the data team; passwords are MD5 hashed 50 | CREATE USER emily WITH PASSWORD 
'md51ef5fec399320fe29b433cefd0b947b9'; 51 | CREATE USER grafana WITH PASSWORD 'md54e0282ddf5fe1a6a490cafec948374cb'; 52 | CREATE USER jared WITH PASSWORD 'md56973f062ac9ff074c44728cf5933219f'; 53 | CREATE USER louis WITH PASSWORD 'md5f33e04a12adccd1d65ef2cf6cf389c23'; 54 | CREATE USER mitchell WITH PASSWORD 'md574a9cff949c98e0ac39bb59fb85fa62b'; 55 | CREATE USER superset WITH PASSWORD 'md537f2206b4fd0aeb36b896c8bafcaec9b'; 56 | CREATE USER webster WITH PASSWORD 'md5cf665ef3f22dbdbac3d814f411289983'; 57 | 58 | ALTER GROUP read_only ADD USER 59 | emily, 60 | grafana, 61 | jared, 62 | louis, 63 | mitchell, 64 | superset, 65 | webster; 66 | -------------------------------------------------------------------------------- /terraform/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Terraform 3 | 4 | The terraform configs are split into separate directories for each environment 5 | and resource type. [Modules](https://www.terraform.io/docs/modules/index.html) 6 | are reusable components that can be deployed to multiple environments. Most of 7 | our config files leverage modules from the [Terraform Module Registry](https://registry.terraform.io/) instead 8 | of custom modules. The goal of this approach was to minimize time to market. 9 | 10 | * environments 11 | * base 12 | * ops 13 | * [ecr](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/base/ops/ecr/ecr.tf) 14 | * prod 15 | * compute 16 | * [eks](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/compute/eks/eks.tf) 17 | * data 18 | * [rds-airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-airflow/rds-airflow.tf) 19 | * [rds-grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-grafana/rds-grafana.tf) 20 | * [rds-superset](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-superset/rds-superset.tf) 21 | * [redshift](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/redshift/redshift.tf) 22 | * network 23 | * [bastion](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/bastion/bastion.tf) 24 | * [vpc](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/vpc/vpc.tf) 25 | * services 26 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/airflow/airflow.tf) 27 | * [grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/grafana/grafana.tf) 28 | * modules 29 | * services 30 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/modules/services/airflow/main.tf) 31 | -------------------------------------------------------------------------------- /terraform/environments/base/ops/ecr/ecr.tf: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------------- 2 | # Shared "base" ECR components (repositories for docker images) 3 | #-------------------------------------------------------------- 4 | 5 | variable "region" {} 6 | 7 | provider "aws" { 8 | region = "${var.region}" 9 | } 10 | 11 | terraform { 12 | backend "s3" { 13 | bucket = "insight-base-terraform" 14 | key = "ops/ecr/ecr.tfstate" 
15 | region = "us-east-1" 16 | } 17 | } 18 | 19 | # We are creating ECR as separate resources to be able to remove 20 | # any ECR at any point, using terraform. With ECR passed as list 21 | # it is impossible. 22 | 23 | resource "aws_ecr_repository" "security_token_analytics" { 24 | name = "security-token-analytics" 25 | } 26 | 27 | resource "aws_ecr_repository" "ethereum-etl" { 28 | name = "ethereum-etl" 29 | } 30 | -------------------------------------------------------------------------------- /terraform/environments/base/ops/ecr/terraform.tfvars: -------------------------------------------------------------------------------- 1 | region = "us-east-1" 2 | -------------------------------------------------------------------------------- /terraform/environments/prod/compute/eks/eks.tf: -------------------------------------------------------------------------------- 1 | variable "project" { } 2 | variable "environment" { } 3 | variable "region" { } 4 | variable "asg_desired_capacity" { } 5 | variable "asg_max_size" { } 6 | variable "asg_min_size" { } 7 | variable "instance_type" { } 8 | 9 | provider "aws" { 10 | region = "${var.region}" 11 | } 12 | 13 | terraform { 14 | required_version = "> 0.7.0" 15 | 16 | backend "s3" { 17 | bucket = "insight-prod-terraform" 18 | key = "compute/eks/eks.tfstate" 19 | region = "us-east-1" 20 | } 21 | } 22 | 23 | data "terraform_remote_state" "vpc_state" { 24 | backend = "s3" 25 | 26 | config { 27 | bucket = "insight-prod-terraform" 28 | region = "${var.region}" 29 | key = "network/vpc/vpc.tfstate" 30 | } 31 | } 32 | 33 | data "terraform_remote_state" "bastion_state" { 34 | backend = "s3" 35 | 36 | config { 37 | bucket = "insight-prod-terraform" 38 | region = "${var.region}" 39 | key = "network/bastion/bastion.tfstate" 40 | } 41 | } 42 | 43 | resource "aws_security_group" "worker_group_bastion_ingress" { 44 | name_prefix = "worker_group_mgmt_one" 45 | description = "Bastion ingress to be applied to all EKS worker nodes" 46 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 47 | 48 | ingress { 49 | from_port = 0 50 | to_port = 65535 51 | protocol = "tcp" 52 | security_groups = ["${data.terraform_remote_state.bastion_state.security_group_id}"] 53 | } 54 | } 55 | 56 | # 57 | # Scales the EKS Worker autoscaling group based on CPU utilization 58 | # 59 | resource "aws_autoscaling_policy" "eks_workers" { 60 | name = "${var.project}-${var.environment}-eks-worker-cpu-target-tracking" 61 | autoscaling_group_name = "${element(module.eks.workers_asg_names, 0)}" 62 | adjustment_type = "ChangeInCapacity" 63 | policy_type = "TargetTrackingScaling" 64 | estimated_instance_warmup = 300 65 | 66 | target_tracking_configuration { 67 | predefined_metric_specification { 68 | predefined_metric_type = "ASGAverageCPUUtilization" 69 | } 70 | target_value = 70.0 71 | } 72 | } 73 | 74 | module "eks" { 75 | source = "terraform-aws-modules/eks/aws" 76 | cluster_name = "${var.project}-${var.environment}-cluster" 77 | 78 | subnets = "${concat( 79 | data.terraform_remote_state.vpc_state.public_subnets, 80 | data.terraform_remote_state.vpc_state.private_subnets 81 | )}" 82 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 83 | 84 | write_kubeconfig = false 85 | manage_aws_auth = false 86 | 87 | tags = { 88 | environment = "${var.environment}" 89 | } 90 | 91 | worker_groups = [ 92 | { 93 | asg_desired_capacity = "${var.asg_desired_capacity}" # Desired worker capacity in the autoscaling group. 
94 | asg_max_size = "${var.asg_max_size}" # Maximum worker capacity in the autoscaling group. 95 | asg_min_size = "${var.asg_min_size}" # Minimum worker capacity in the autoscaling group. 96 | instance_type = "${var.instance_type}" # Size of the workers instances. 97 | spot_price = "" # Cost of spot instance. 98 | placement_tenancy = "" # The tenancy of the instance. Valid values are "default" or "dedicated". 99 | root_volume_size = "100" # root volume size of workers instances. 100 | root_volume_type = "gp2" # root volume type of workers instances, can be 'standard', 'gp2', or 'io1' 101 | root_iops = "0" # The amount of provisioned IOPS. This must be set with a volume_type of "io1". 102 | key_name = "ops" # The key name that should be used for the instances in the autoscaling group 103 | pre_userdata = "" # userdata to pre-append to the default userdata. 104 | additional_userdata = "" # userdata to append to the default userdata. 105 | ebs_optimized = true # sets whether to use ebs optimization on supported types. 106 | enable_monitoring = true # Enables/disables detailed monitoring. 107 | public_ip = false # Associate a public ip address with a worker 108 | kubelet_extra_args = "" # This string is passed directly to kubelet if set. Useful for adding labels or taints. 109 | autoscaling_enabled = false # Sets whether policy and matching tags will be added to allow autoscaling. 110 | additional_security_group_ids = "${aws_security_group.worker_group_bastion_ingress.id}" # A comma delimited list of additional security group ids to include in worker launch config 111 | protect_from_scale_in = false # Prevent AWS from scaling in, so that cluster-autoscaler is solely responsible. 112 | suspended_processes = "" # A comma delimited string of processes to to suspend. i.e. 
AZRebalance,HealthCheck,ReplaceUnhealthy 113 | target_group_arns = "" # A comma delimited list of ALB target group ARNs to be associated to the ASG 114 | } 115 | ] 116 | } 117 | -------------------------------------------------------------------------------- /terraform/environments/prod/compute/eks/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project = "insight" 2 | environment = "prod" 3 | region = "us-east-1" 4 | asg_desired_capacity = 3 5 | asg_max_size = 10 6 | asg_min_size = 3 7 | instance_type = "c5.xlarge" 8 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-airflow/rds-airflow.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-airflow/rds-airflow" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 | bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-airflow/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-airflow/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to airflow service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = 
"${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = "${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | monitoring_role_name = "${var.environment}-${var.project}-${var.application}-monitoring" 157 | multi_az = "${var.multi_az}" 158 | name = "${var.environment}_${var.application}" 159 | option_group_name = "${aws_db_option_group.default.name}" 160 | parameter_group_name = "${aws_db_parameter_group.default.name}" 161 | password = "${data.aws_ssm_parameter.password.value}" 162 | port = "${var.port}" 163 | publicly_accessible = "${var.publicly_accessible}" 164 | skip_final_snapshot = "${var.skip_final_snapshot}" 165 | storage_encrypted = "${var.storage_encrypted}" 166 | storage_type = "${var.storage_type}" 167 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 168 | 169 | tags = { 170 | Application = "${var.application}" 171 | Environment = "${var.environment}" 172 | Project = "${var.project}" 173 | } 174 | 175 | username = "${data.aws_ssm_parameter.username.value}" 176 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 177 | } 178 | 179 | output "db_instance_address" { 180 | 
value = "${module.rds.this_db_instance_address}" 181 | } 182 | 183 | output "db_instance_arn" { 184 | value = "${module.rds.this_db_instance_arn}" 185 | } 186 | 187 | output "tdb_instance_endpoint" { 188 | value = "${module.rds.this_db_instance_endpoint}" 189 | } 190 | 191 | output "db_instance_id" { 192 | value = "${module.rds.this_db_instance_id}" 193 | } 194 | 195 | output "db_instance_name" { 196 | value = "${module.rds.this_db_instance_name}" 197 | } 198 | 199 | output "db_instance_resource_id" { 200 | value = "${module.rds.this_db_instance_resource_id}" 201 | } 202 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-airflow/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "airflow" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-grafana/rds-grafana.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-grafana/rds-grafana" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 
| bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-grafana/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-grafana/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to grafana service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = "${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = "${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 
152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | monitoring_role_name = "${var.environment}-rds-${var.application}-monitoring-role" 157 | multi_az = "${var.multi_az}" 158 | name = "${var.environment}_${var.application}" 159 | option_group_name = "${aws_db_option_group.default.name}" 160 | parameter_group_name = "${aws_db_parameter_group.default.name}" 161 | password = "${data.aws_ssm_parameter.password.value}" 162 | port = "${var.port}" 163 | publicly_accessible = "${var.publicly_accessible}" 164 | skip_final_snapshot = "${var.skip_final_snapshot}" 165 | storage_encrypted = "${var.storage_encrypted}" 166 | storage_type = "${var.storage_type}" 167 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 168 | 169 | tags = { 170 | Application = "${var.application}" 171 | Environment = "${var.environment}" 172 | Project = "${var.project}" 173 | } 174 | 175 | username = "${data.aws_ssm_parameter.username.value}" 176 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 177 | } 178 | 179 | output "db_instance_address" { 180 | value = "${module.rds.this_db_instance_address}" 181 | } 182 | 183 | output "db_instance_arn" { 184 | value = "${module.rds.this_db_instance_arn}" 185 | } 186 | 187 | output "tdb_instance_endpoint" { 188 | value = "${module.rds.this_db_instance_endpoint}" 189 | } 190 | 191 | output "db_instance_id" { 192 | value = "${module.rds.this_db_instance_id}" 193 | } 194 | 195 | output "db_instance_name" { 196 | value = "${module.rds.this_db_instance_name}" 197 | } 198 | 199 | output "db_instance_resource_id" { 200 | value = "${module.rds.this_db_instance_resource_id}" 201 | } 202 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-grafana/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "grafana" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-superset/rds-superset.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | 
variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-superset/rds-superset" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 | bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-superset/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-superset/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to superset service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = "${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = 
"${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | multi_az = "${var.multi_az}" 157 | name = "${var.environment}_${var.application}" 158 | option_group_name = "${aws_db_option_group.default.name}" 159 | parameter_group_name = "${aws_db_parameter_group.default.name}" 160 | password = "${data.aws_ssm_parameter.password.value}" 161 | port = "${var.port}" 162 | publicly_accessible = "${var.publicly_accessible}" 163 | skip_final_snapshot = "${var.skip_final_snapshot}" 164 | storage_encrypted = "${var.storage_encrypted}" 165 | storage_type = "${var.storage_type}" 166 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 167 | 168 | tags = { 169 | Application = "${var.application}" 170 | Environment = "${var.environment}" 171 | Project = "${var.project}" 172 | } 173 | 174 | username = "${data.aws_ssm_parameter.username.value}" 175 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 176 | } 177 | 178 | output "db_instance_address" { 179 | value = "${module.rds.this_db_instance_address}" 180 | } 181 | 182 | output "db_instance_arn" { 183 | value = "${module.rds.this_db_instance_arn}" 184 | } 185 | 186 | output "tdb_instance_endpoint" { 187 | value = "${module.rds.this_db_instance_endpoint}" 188 | } 189 | 190 | output "db_instance_id" { 191 | value = "${module.rds.this_db_instance_id}" 192 | } 193 | 194 | output "db_instance_name" { 195 | value = "${module.rds.this_db_instance_name}" 196 | } 197 | 198 | output "db_instance_resource_id" { 199 | value = "${module.rds.this_db_instance_resource_id}" 200 | } 201 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-superset/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "superset" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | 
create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/redshift/redshift.tf: -------------------------------------------------------------------------------- 1 | variable "project" { } 2 | variable "environment" { } 3 | variable "region" { } 4 | variable "allow_version_upgrade" { } 5 | variable "automated_snapshot_retention_period" { } 6 | variable "cluster_node_type" { } 7 | variable "cluster_number_of_nodes" { } 8 | variable "cluster_parameter_group" { } 9 | variable "cluster_port" { } 10 | variable "cluster_version" { } 11 | variable "enable_logging" { } 12 | variable "encrypted" { } 13 | variable "enhanced_vpc_routing" { } 14 | variable "logging_bucket_name" { } 15 | variable "preferred_maintenance_window" { } 16 | variable "publicly_accessible" { } 17 | variable "skip_final_snapshot" { } 18 | variable "wlm_json_configuration" { } 19 | 20 | provider "aws" { 21 | region = "${var.region}" 22 | } 23 | 24 | terraform { 25 | required_version = "> 0.7.0" 26 | 27 | backend "s3" { 28 | bucket = "insight-prod-terraform" 29 | key = "data/redshift/redshift.tfstate" 30 | region = "us-east-1" 31 | } 32 | } 33 | 34 | data "terraform_remote_state" "vpc_state" { 35 | backend = "s3" 36 | 37 | config { 38 | bucket = "insight-prod-terraform" 39 | region = "${var.region}" 40 | key = "network/vpc/vpc.tfstate" 41 | } 42 | } 43 | 44 | data "terraform_remote_state" "airflow_state" { 45 | backend = "s3" 46 | 47 | config { 48 | bucket = "insight-prod-terraform" 49 | region = "${var.region}" 50 | key = "services/airflow/airflow.tfstate" 51 | } 52 | } 53 | 54 | data "aws_ssm_parameter" "cluster_master_username" { 55 | name = "/prod/redshift/CLUSTER_MASTER_USERNAME" 56 | } 57 | 58 | 59 | data "aws_ssm_parameter" "cluster_master_password" { 60 | name = "/prod/redshift/CLUSTER_MASTER_PASSWORD" 61 | } 62 | 63 | # 64 | # IAM role that allows our Redshift cluster to load data from S3. 65 | # 66 | resource "aws_iam_service_linked_role" "redshift_service_role" { 67 | aws_service_name = "redshift.amazonaws.com" 68 | } 69 | 70 | resource "aws_iam_policy" "s3_read_write_policy" { 71 | name = "${var.project}-${var.environment}-s3-policy-airflow-output" 72 | 73 | policy = <