├── .dockerignore ├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── airflow ├── README.md ├── config │ └── airflow.cfg ├── dags │ └── redshift │ │ ├── dags │ │ ├── increment_aggregates.py │ │ └── refresh_aggregates.py │ │ └── sql │ │ ├── increment │ │ ├── aggregate_metrics_by_day.sql │ │ └── aggregate_transaction_metrics_by_block.sql │ │ └── refresh │ │ ├── aggregate_metrics_by_day.sql │ │ └── aggregate_transaction_metrics_by_block.sql └── entrypoint.sh ├── docs ├── challenges.md ├── getting_started.md ├── img │ ├── dags.png │ ├── eks_worker_autoscaling.png │ ├── eks_worker_no_autoscaling.png │ ├── grafana_dashboard.png │ ├── kubernetes_dashboard.png │ ├── range_restricted_scan.png │ ├── runtime_of_daily_incremental_update.png │ └── tech_stack.png └── tech_stack.md ├── k8s ├── README.md ├── config_maps │ └── aws-auth-cm.yaml ├── service_accounts │ └── eks-admin-service-account.yaml ├── services │ ├── airflow-webserver.yaml │ ├── go-ethereum.yaml │ ├── grafana.yaml │ └── superset.yaml └── setup.sh ├── redshift ├── check_load_errors.sql ├── schema │ ├── coinmarketcap.sql │ ├── coinmetrics.sql │ └── multipl.sql └── users.sql └── terraform ├── README.md ├── environments ├── base │ └── ops │ │ └── ecr │ │ ├── ecr.tf │ │ └── terraform.tfvars └── prod │ ├── compute │ └── eks │ │ ├── eks.tf │ │ └── terraform.tfvars │ ├── data │ ├── rds-airflow │ │ ├── rds-airflow.tf │ │ └── terraform.tfvars │ ├── rds-grafana │ │ ├── rds-grafana.tf │ │ └── terraform.tfvars │ ├── rds-superset │ │ ├── rds-superset.tf │ │ └── terraform.tfvars │ └── redshift │ │ ├── redshift.tf │ │ └── terraform.tfvars │ ├── network │ ├── bastion │ │ ├── bastion.tf │ │ └── terraform.tfvars │ └── vpc │ │ ├── terraform.tfvars │ │ └── vpc.tf │ └── services │ ├── airflow │ ├── airflow.tf │ └── terraform.tfvars │ └── grafana │ ├── grafana.tf │ └── terraform.tfvars └── modules └── services └── airflow ├── main.tf ├── outputs.tf └── variables.tf /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Terraform state 2 | .terraform/* 3 | **/.terraform/* 4 | **/terraform.tfstate.backup 5 | **/errored.tfstate 6 | tfplan 7 | 8 | # Secrets 9 | k8s/secrets/ 10 | 11 | # VIM swap files 12 | *.swp 13 | 14 | # Temp files 15 | # tmp/* 16 | 17 | *.zip 18 | 19 | # mypy 20 | .mypy_cache 21 | 22 | # pid file created during tests 23 | *.pid 24 | 25 | # Jetbrains IDE files 26 | *.iml 27 | .idea/ 28 | 29 | # Temp sql 30 | redshift/scratch.sql 31 | 32 | # VIM swap files 33 | *.swp 34 | 35 | # Byte-compiled / optimized / DLL files 36 | __pycache__/ 37 | *.py[cod] 38 | 39 | # C extensions 40 | *.so 41 | 42 | # Distribution / packaging 43 | .Python 44 | env/ 45 | build/ 46 | develop-eggs/ 47 | dist/ 48 | downloads/ 49 | eggs/ 50 | .eggs/ 51 | lib/ 52 | lib64/ 53 | parts/ 54 | sdist/ 55 | var/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | 60 | # PyInstaller 61 | # Usually these files are written by a python script from a template 62 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
63 | *.manifest 64 | *.spec 65 | 66 | # Installer logs 67 | pip-log.txt 68 | pip-delete-this-directory.txt 69 | 70 | # Unit test / coverage reports 71 | htmlcov/ 72 | .tox/ 73 | .coverage 74 | .coverage.* 75 | .cache 76 | nosetests.xml 77 | coverage.xml 78 | *,cover 79 | .pytest_cache 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | target/ 86 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "airflow/dags/ethereum-etl-airflow"] 2 | path = airflow/dags/ethereum-etl-airflow 3 | url = https://github.com/iter-io/ethereum-etl-airflow.git 4 | branch = feature-aws 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | # This Dockerfile is used to build a Docker image for Airflow that contains all 3 | # dependencies and DAGs. This same image is used for the scheduler, webserver, 4 | # and workers. 5 | # 6 | # It's based on the ones in the Airflow repository: 7 | # 8 | # https://github.com/apache/airflow/blob/master/Dockerfile 9 | # https://github.com/apache/airflow/blob/master/scripts/ci/kubernetes/docker/Dockerfile 10 | # 11 | # It also borrows from Puckel's popular image: 12 | # 13 | # https://github.com/puckel/docker-airflow 14 | # 15 | # The major differences: 16 | # 17 | # 1. Airflow is installed directly from the master branch instead of the tagged 18 | # releases. This involves cloning the code from Github into the image and 19 | # building the frontend with npm. 20 | # 21 | # 2. Dependencies were added for ethereum-etl and bitcoin-etl. 22 | # 23 | # 24 | 25 | FROM python:3.6-slim 26 | 27 | # Never prompts the user for choices on installation/configuration of packages 28 | ENV DEBIAN_FRONTEND noninteractive 29 | ENV TERM linux 30 | 31 | # Dependences required for the build but not at runtime 32 | ARG buildDeps="\ 33 | freetds-dev \ 34 | libczmq-dev \ 35 | libkrb5-dev \ 36 | libsasl2-dev \ 37 | libssl-dev \ 38 | libffi-dev \ 39 | libpq-dev \ 40 | git \ 41 | nodejs" 42 | 43 | # Dependencies required by Airflow at runtime 44 | ARG APT_DEPS="\ 45 | $buildDeps \ 46 | bind9utils \ 47 | libsasl2-dev \ 48 | freetds-bin \ 49 | build-essential \ 50 | default-libmysqlclient-dev \ 51 | inetutils-telnet \ 52 | apt-utils \ 53 | curl \ 54 | rsync \ 55 | netcat \ 56 | locales \ 57 | wget \ 58 | zip \ 59 | unzip" 60 | 61 | # Dependencies installed via pip 62 | ARG PYTHON_DEPS="\ 63 | pytz \ 64 | cryptography \ 65 | requests \ 66 | pyOpenSSL \ 67 | ndg-httpsclient \ 68 | pyasn1 \ 69 | psycopg2-binary \ 70 | Flask-Bcrypt \ 71 | Flask-WTF==0.14 \ 72 | click \ 73 | kubernetes \ 74 | setuptools \ 75 | wheel" 76 | 77 | # http://airflow.apache.org/installation.html 78 | ARG AIRFLOW_DEPS="postgres,s3,devel" 79 | ARG AIRFLOW_HOME=/usr/local/airflow 80 | 81 | # Required by ethereum-etl 82 | ARG ETHEREUM_ETL_DEPS="\ 83 | google-api-python-client \ 84 | httplib2 \ 85 | bitcoin-etl \ 86 | ethereum-etl \ 87 | mythril \ 88 | pyetherchain \ 89 | pandas \ 90 | pandas-gbq" 91 | 92 | ENV AIRFLOW_GPL_UNIDECODE yes 93 | 94 | # Define en_US. 
95 | ENV LANGUAGE en_US.UTF-8 96 | ENV LANG en_US.UTF-8 97 | ENV LC_ALL en_US.UTF-8 98 | ENV LC_CTYPE en_US.UTF-8 99 | ENV LC_MESSAGES en_US.UTF-8 100 | 101 | WORKDIR /opt/ 102 | 103 | RUN set -ex \ 104 | # Update our currently installed packages 105 | && apt-get update -yqq \ 106 | && apt-get upgrade -yqq \ 107 | # Install Airflow dependencies 108 | && apt install -y $APT_DEPS \ 109 | && pip install --upgrade pip \ 110 | && pip install --no-cache-dir ${PYTHON_DEPS} \ 111 | # Get the master branch of Airflow from Github 112 | && git clone --depth=1 https://github.com/apache/airflow.git \ 113 | # Build the Airflow frontend 114 | && curl -sL https://deb.nodesource.com/setup_11.x | bash - \ 115 | && apt-get install -y nodejs \ 116 | && npm --prefix /opt/airflow/airflow/www install /opt/airflow/airflow/www \ 117 | && npm --prefix /opt/airflow/airflow/www run-script build \ 118 | # Install Airflow from source 119 | && pip install --no-cache-dir -e /opt/airflow[$AIRFLOW_DEPS] \ 120 | # Required by Airflow S3 Hook 121 | && useradd -ms /bin/bash -d ${AIRFLOW_HOME} airflow \ 122 | && pip install boto3 \ 123 | # Change the local to UTF-8 124 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 125 | && locale-gen \ 126 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 127 | # Required by ethereum-etl-airflow 128 | && pip install --no-cache-dir ${ETHEREUM_ETL_DEPS} \ 129 | # Remove unncessary files from this layer 130 | && apt-get purge --auto-remove -yqq $buildDeps \ 131 | && apt-get autoremove -yqq --purge \ 132 | && apt-get clean \ 133 | && rm -rf \ 134 | /var/lib/apt/lists/* \ 135 | /tmp/* \ 136 | /var/tmp/* \ 137 | /usr/share/man \ 138 | /usr/share/doc \ 139 | /usr/share/doc-base 140 | 141 | WORKDIR ${AIRFLOW_HOME} 142 | 143 | COPY airflow/entrypoint.sh /entrypoint.sh 144 | COPY airflow/config/airflow.cfg ${AIRFLOW_HOME}/airflow.cfg 145 | 146 | COPY ./airflow/dags ${AIRFLOW_HOME}/dags 147 | 148 | # Trying to get Kubernetes workers to load our dags 149 | COPY ./airflow/dags /tmp/dags 150 | 151 | RUN chown -R airflow: ${AIRFLOW_HOME} 152 | 153 | EXPOSE 8080 5555 8793 154 | 155 | USER airflow 156 | ENTRYPOINT ["/entrypoint.sh"] 157 | 158 | # sets default arg for entrypoint 159 | CMD ["webserver"] 160 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, ITERIO INC. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build_venv build_docker ensure_image lint_code lint_tests run run_tests test \ 2 | update_dependencies upload_docker venv_activate deploy 3 | 4 | DOCKER_IMAGE = 772681551441.dkr.ecr.us-east-1.amazonaws.com/security-token-analytics 5 | KUBE = kubectl --context="arn:aws:eks:us-east-1:772681551441:cluster/insight-prod-cluster" 6 | VERSION = $(shell git rev-parse --short HEAD) 7 | IMAGE_FOUND = $(shell docker images \ 8 | --format "{{ .Repository }}" --filter "reference=security_token_analytics") 9 | AIRFLOW_TF_CONFIG_DIR = terraform/environments/prod/services/airflow 10 | 11 | test: build_venv 12 | 13 | # 14 | # Uploads our Docker image to ECR with both latest and version tags. 15 | # 16 | upload_docker: build_docker 17 | 18 | # Tag our image 19 | docker tag $(DOCKER_IMAGE):latest $(DOCKER_IMAGE):$(VERSION) 20 | 21 | # Authenticate with ECR and push the image (note: $$ escapes the shell substitution from make) 22 | eval $$(aws ecr get-login --no-include-email) && docker push $(DOCKER_IMAGE):$(VERSION) 23 | eval $$(aws ecr get-login --no-include-email) && docker push $(DOCKER_IMAGE):latest 24 | 25 | 26 | deploy: upload_docker 27 | echo "Redeploying the Airflow webserver" 28 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform taint --module=airflow kubernetes_deployment.airflow_webserver 29 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform plan -target=module.airflow.kubernetes_deployment.airflow_webserver -out=tfplan 30 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform apply "tfplan" 31 | 32 | echo "Redeploying the Airflow scheduler" 33 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform taint --module=airflow kubernetes_deployment.airflow_scheduler 34 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform plan -target=module.airflow.kubernetes_deployment.airflow_scheduler -out=tfplan 35 | cd $(AIRFLOW_TF_CONFIG_DIR) && terraform apply "tfplan" 36 | 37 | 38 | venv_activate: 39 | pipenv shell 40 | 41 | 42 | update_dependencies: 43 | pipenv update --dev 44 | make test 45 | 46 | 47 | # ========================== 48 | # Internal targets 49 | # ========================== 50 | 51 | build_docker: 52 | docker build --build-arg VERSION=$(VERSION) -t $(DOCKER_IMAGE) . 53 | 54 | build_venv: 55 | pipenv sync --dev 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # security-token-analytics 3 | 4 | ### Overview 5 | 6 | This project uses blockchain data to provide a platform for financial analysis 7 | of crypto assets. In particular, we are researching 8 | emerging standards for [security tokens](https://github.com/ethereum/EIPs/issues/1411) 9 | and potential methods of fundamental analysis. More broadly, this project aims 10 | to serve as an example for how to build a data pipeline for analyzing public 11 | blockchains.
12 | 13 | ## Documentation 14 | 15 | ### Table of Contents 16 | 1. [Getting Started](/docs/getting_started.md) 17 | 2. [Terraform Configs](/terraform/README.md) 18 | 3. [Setting up Kubernetes / EKS](/k8s/README.md) 19 | 4. [Airflow DAGs](/airflow/README.md) 20 | 5. [Tech Stack](/docs/tech_stack.md) 21 | 6. [Engineering Challenges](/docs/challenges.md) 22 | 23 | 24 | ### High-level Architecture 25 | ![high level architecture](docs/img/tech_stack.png) 26 | 27 | ### Example Dashboard 28 | ![example dashboard](docs/img/grafana_dashboard.png) 29 | -------------------------------------------------------------------------------- /airflow/README.md: -------------------------------------------------------------------------------- 1 | ![dags](../docs/img/dags.png) 2 | 3 | #### DAGs Overview 4 | 5 | 1. [ethereum_etl_export_dag](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/export_dag.py) - 6 | Runs the [ethereum-etl library](https://github.com/blockchain-etl/ethereum-etl) 7 | to export blockchain data into CSV and JSON files. These files are then uploaded 8 | to S3. 9 | 10 | 2. [ethereum_etl_load_dag_redshift](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/load_dag_redshift.py) - 11 | Uses the Redshift [COPY command](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) 12 | to load files from S3 into Redshift. 13 | 14 | 3. [redshift_increment_aggregates](https://github.com/iter-io/security-token-analytics/blob/master/airflow/dags/redshift/dags/increment_aggregates.py) - 15 | Executes [SQL queries](https://github.com/iter-io/security-token-analytics/tree/master/airflow/dags/redshift/sql/increment) 16 | in Redshift for incrementally updating the aggregate data models. 17 | 18 | 4. [redshift_refresh_aggregates](https://github.com/iter-io/security-token-analytics/blob/master/airflow/dags/redshift/dags/refresh_aggregates.py) - 19 | Executes [SQL queries](https://github.com/iter-io/security-token-analytics/tree/master/airflow/dags/redshift/sql/refresh) 20 | in Redshift for doing a full refresh of the aggregate data models. We keep 21 | this DAG off but can manually trigger it to rebuild the tables if necessary. 22 | 23 | 24 | #### Why the DAGs are not combined 25 | 26 | The original export and load DAGs are separated because the export 27 | DAG produces single partitions while the load DAG imports all partitions into 28 | BigQuery. A secondary goal of this project is to contribute back to the 29 | blockchain-etl project as opposed to forking it. So it made sense to keep them 30 | separated for now. 31 | 32 | DAGs #1-2 will be included in the pull request while DAGs #3-4 will remain in this 33 | project. After getting feedback on the pull request, DAGs #1-3 will probably 34 | be combined into one (or run as subdags). 35 | 36 | 37 | #### Airflow build 38 | 39 | The [Dockerfile](https://github.com/iter-io/security-token-analytics/blob/master/Dockerfile) 40 | in the root of this repository is used to build a Docker image for Airflow that 41 | contains all dependencies and DAGs. This same image is used for the scheduler, 42 | webserver, and workers. In order to have the latest Kubernetes features, this 43 | build uses the [master branch](https://github.com/apache/airflow/tree/master) of 44 | Airflow instead of a tagged release.
45 | 46 | Info on the frontend npm build: 47 | 48 | https://github.com/apache/airflow/blob/master/CONTRIBUTING.md#setting-up-the-node--npm-javascript-environment 49 | -------------------------------------------------------------------------------- /airflow/config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The home folder for airflow, default is ~/airflow 3 | airflow_home = /usr/local/airflow 4 | 5 | # The folder where your airflow pipelines live, most likely a 6 | # subfolder in a code repository 7 | dags_folder = /usr/local/airflow/dags 8 | 9 | # The folder where airflow should store its log files. 10 | base_log_folder = /usr/local/airflow/logs 11 | 12 | # https://github.com/apache/airflow/blob/master/docs/howto/write-logs.rst 13 | remote_logging = True 14 | #remote_base_log_folder = ENVRIONMENT VARIABLE 15 | #remote_log_conn_id = ENVRIONMENT VARIABLE 16 | #encrypt_s3_logs = ENVRIONMENT VARIABLE 17 | 18 | # Logging level 19 | logging_level = DEBUG 20 | fab_logging_level = WARN 21 | 22 | # Logging class 23 | # Specify the class that will specify the logging configuration 24 | # This class has to be on the python classpath 25 | # logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 26 | logging_config_class = 27 | 28 | # Log format 29 | log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s 30 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 31 | 32 | # Log filename format 33 | #log_filename_template = {{{{ ti.dag_id }}}}/{{{{ ti.task_id }}}}/{{{{ ts }}}}/{{{{ try_number }}}}.log 34 | #log_processor_filename_template = {{{{ filename }}}}.log 35 | #dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log 36 | 37 | # Hostname by providing a path to a callable, which will resolve the hostname 38 | #hostname_callable = socket:getfqdn 39 | 40 | # Default timezone in case supplied date times are naive 41 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 42 | default_timezone = utc 43 | 44 | # The executor class that airflow should use. Choices include 45 | # SequentialExecutor, LocalExecutor, CeleryExecutor 46 | #executor = LocalExecutor 47 | executor = KubernetesExecutor 48 | 49 | # The SqlAlchemy connection string to the metadata database. 50 | # SqlAlchemy supports many different database engine, more information 51 | # their website 52 | #sql_alchemy_conn = ENVIRONMENT VARIABLE 53 | 54 | # The encoding for the databases 55 | sql_engine_encoding = utf-8 56 | 57 | # The SqlAlchemy pool size is the maximum number of database connections 58 | # in the pool. 59 | # TODO: Does this affect our max concurrency? 60 | sql_alchemy_pool_size = 32 61 | 62 | # The SqlAlchemy pool recycle is the number of seconds a connection 63 | # can be idle in the pool before it is invalidated. This config does 64 | # not apply to sqlite. 65 | sql_alchemy_pool_recycle = 3600 66 | 67 | # How many seconds to retry re-establishing a DB connection after 68 | # disconnects. Setting this to 0 disables retries. 69 | sql_alchemy_reconnect_timeout = 300 70 | 71 | # The schema to use for the metadata database 72 | # SqlAlchemy supports databases with the concept of multiple schemas. 73 | sql_alchemy_schema = 74 | 75 | # The amount of parallelism as a setting to the executor. 
This defines 76 | # the max number of task instances that should run simultaneously 77 | # on this airflow installation 78 | parallelism = 128 79 | 80 | # The number of task instances allowed to run concurrently by the scheduler 81 | dag_concurrency = 128 82 | 83 | # Are DAGs paused by default at creation 84 | dags_are_paused_at_creation = True 85 | 86 | # The maximum number of active DAG runs per DAG 87 | max_active_runs_per_dag = 128 88 | 89 | load_examples = False 90 | 91 | # Where your Airflow plugins are stored 92 | plugins_folder = /usr/local/airflow/plugins 93 | 94 | # Secret key to save connection passwords in the db 95 | #fernet_key = ENVRIONMENT VARIABLE 96 | 97 | # Whether to disable pickling dags 98 | donot_pickle = False 99 | 100 | # How long before timing out a python file import while filling the DagBag 101 | # Increased this from 60 due to dag_id could not be found errors 102 | dagbag_import_timeout = 180 103 | 104 | # The class to use for running task instances in a subprocess 105 | task_runner = StandardTaskRunner 106 | 107 | # If set, tasks without a `run_as_user` argument will be run with this user 108 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 109 | default_impersonation = 110 | 111 | # What security module to use (for example kerberos): 112 | security = 113 | 114 | # If set to False enables some unsecure features like Charts and Ad Hoc Queries. 115 | # In 2.0 will default to True. 116 | secure_mode = False 117 | 118 | # Turn unit test mode on (overwrites many configuration options with test 119 | # values at runtime) 120 | unit_test_mode = False 121 | 122 | # Name of handler to read task instance logs. 123 | # Default to use task handler. 124 | task_log_reader = task 125 | 126 | # Whether to enable pickling for xcom (note that this is insecure and allows for 127 | # RCE exploits). This will be deprecated in Airflow 2.0 (be forced to False). 128 | enable_xcom_pickling = True 129 | 130 | # When a task is killed forcefully, this is the amount of time in seconds that 131 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 132 | killed_task_cleanup_time = 60 133 | 134 | # Whether to override params with dag_run.conf. If you pass some key-value pairs through `airflow backfill -c` or 135 | # `airflow trigger_dag -c`, the key-value pairs will override the existing ones in params. 136 | dag_run_conf_overrides_params = False 137 | 138 | # Worker initialisation check to validate Metadata Database connection 139 | worker_precheck = False 140 | 141 | [webserver] 142 | # The base url of your website as airflow cannot guess what domain or 143 | # cname you are using. This is used in automated emails that 144 | # airflow sends to point links to the right web server 145 | base_url = http://localhost:8080 146 | 147 | # The ip specified when starting the web server 148 | web_server_host = 0.0.0.0 149 | 150 | # The port on which to run the web server 151 | web_server_port = 8080 152 | 153 | # Paths to the SSL certificate and key for the web server. When both are 154 | # provided SSL will be enabled. This does not change the web server port. 155 | web_server_ssl_cert = 156 | web_server_ssl_key = 157 | 158 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 159 | web_server_master_timeout = 120 160 | 161 | # Number of seconds the gunicorn webserver waits before timing out on a worker 162 | web_server_worker_timeout = 120 163 | 164 | # Number of workers to refresh at a time. 
When set to 0, worker refresh is 165 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 166 | # bringing up new ones and killing old ones. 167 | worker_refresh_batch_size = 1 168 | 169 | # Number of seconds to wait before refreshing a batch of workers. 170 | worker_refresh_interval = 30 171 | 172 | # Secret key used to run your flask app 173 | # It should be as random as possible 174 | secret_key = {SECRET_KEY} 175 | 176 | # Number of workers to run the Gunicorn web server 177 | workers = 4 178 | 179 | # The worker class gunicorn should use. Choices include 180 | # sync (default), eventlet, gevent 181 | worker_class = sync 182 | 183 | # Log files for the gunicorn webserver. '-' means log to stderr. 184 | access_logfile = - 185 | error_logfile = - 186 | 187 | # Expose the configuration file in the web server 188 | # This is only applicable for the flask-admin based web UI (non FAB-based). 189 | # In the FAB-based web UI with RBAC feature, 190 | # access to configuration is controlled by role permissions. 191 | expose_config = True 192 | 193 | # Set to true to turn on authentication: 194 | # https://airflow.apache.org/security.html#web-authentication 195 | authenticate = False 196 | rbac = False 197 | 198 | # Filter the list of dags by owner name (requires authentication to be enabled) 199 | filter_by_owner = False 200 | 201 | # Filtering mode. Choices include user (default) and ldapgroup. 202 | # Ldap group filtering requires using the ldap backend 203 | # 204 | # Note that the ldap server needs the "memberOf" overlay to be set up 205 | # in order to user the ldapgroup mode. 206 | owner_mode = user 207 | 208 | # Default DAG view. Valid values are: 209 | # tree, graph, duration, gantt, landing_times 210 | dag_default_view = tree 211 | 212 | # Default DAG orientation. Valid values are: 213 | # LR (Left->Right), TB (Top->Bottom), RL (Right->Left), BT (Bottom->Top) 214 | dag_orientation = LR 215 | 216 | # Puts the webserver in demonstration mode; blurs the names of Operators for 217 | # privacy. 218 | demo_mode = False 219 | 220 | # The amount of time (in secs) webserver will wait for initial handshake 221 | # while fetching logs from other worker machine 222 | log_fetch_timeout_sec = 5 223 | 224 | # By default, the webserver shows paused DAGs. Flip this to hide paused 225 | # DAGs by default 226 | hide_paused_dags_by_default = False 227 | 228 | # Consistent page size across all listing views in the UI 229 | page_size = 100 230 | 231 | # Define the color of navigation bar 232 | navbar_color = #007A87 233 | 234 | # Default dagrun to show in UI 235 | default_dag_run_display_number = 25 236 | 237 | # Enable werkzeug `ProxyFix` middleware 238 | enable_proxy_fix = False 239 | 240 | [aws_default] 241 | aws_default_region = us-east-1 242 | 243 | [email] 244 | email_backend = airflow.utils.email.send_email_smtp 245 | 246 | [smtp] 247 | # If you want airflow to send emails on retries, failure, and you want to 248 | # the airflow.utils.send_email function, you have to configure an smtp 249 | # server here 250 | smtp_host = localhost 251 | smtp_starttls = True 252 | smtp_ssl = False 253 | smtp_user = airflow 254 | smtp_port = 25 255 | smtp_password = airflow 256 | smtp_mail_from = airflow@airflow.local 257 | 258 | 259 | [scheduler] 260 | # Task instances listen for external kill signal (when you clear tasks 261 | # from the CLI or the UI), this defines the frequency at which they should 262 | # listen (in seconds). 
263 | job_heartbeat_sec = 5 264 | 265 | # The scheduler constantly tries to trigger new tasks (look at the 266 | # scheduler section in the docs for more information). This defines 267 | # how often the scheduler should run (in seconds). 268 | # 269 | # We increased this value from 5 to 60 in an attempt to reduce airflow 270 | # scheduling latency. 271 | # 272 | scheduler_heartbeat_sec = 60 273 | 274 | # after how much time (seconds) a new DAGs should be picked up from the filesystem 275 | min_file_process_interval = 0 276 | 277 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 278 | dag_dir_list_interval = 300 279 | 280 | # How often should stats be printed to the logs 281 | print_stats_interval = 30 282 | 283 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold ago (in seconds), 284 | # scheduler is considered unhealthy. 285 | # This is used by the health check in the "/health" endpoint 286 | scheduler_health_check_threshold = 30 287 | 288 | child_process_log_directory = /usr/local/airflow/logs/scheduler 289 | 290 | # Local task jobs periodically heartbeat to the DB. If the job has 291 | # not heartbeat in this many seconds, the scheduler will mark the 292 | # associated task instance as failed and will re-schedule the task. 293 | scheduler_zombie_task_threshold = 300 294 | 295 | # Turn off scheduler catchup by setting this to False. 296 | # Default behavior is unchanged and 297 | # Command Line Backfills still work, but the scheduler 298 | # will not do scheduler catchup if this is False, 299 | # however it can be set on a per DAG basis in the 300 | # DAG definition (catchup) 301 | catchup_by_default = True 302 | 303 | # This changes the batch size of queries in the scheduling main loop. 304 | # If this is too high, SQL query performance may be impacted by one 305 | # or more of the following: 306 | # - reversion to full table scan 307 | # - complexity of query predicate 308 | # - excessive locking 309 | # 310 | # Additionally, you may hit the maximum allowable query length for your db. 311 | # 312 | # Set this to 0 for no limit (not advised) 313 | max_tis_per_query = 512 314 | 315 | # Statsd (https://github.com/etsy/statsd) integration settings 316 | #statsd_on = False 317 | #statsd_host = localhost 318 | #statsd_port = 8125 319 | #statsd_prefix = airflow 320 | 321 | # The scheduler can run multiple threads in parallel to schedule dags. 322 | # This defines how many threads will run. 323 | max_threads = 8 324 | 325 | authenticate = False 326 | 327 | # Turn off scheduler use of cron intervals by setting this to False. 328 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 329 | use_job_schedule = True 330 | 331 | 332 | [kubernetes] 333 | worker_container_repository = 772681551441.dkr.ecr.us-east-1.amazonaws.com/security-token-analytics 334 | worker_container_tag = latest 335 | worker_container_image_pull_policy = Always 336 | worker_dags_folder = /usr/local/airflow/dags 337 | 338 | # If True (default), worker pods will be deleted upon termination 339 | delete_worker_pods = True 340 | 341 | # Number of Kubernetes Worker Pod creation calls per scheduler loop 342 | worker_pods_creation_batch_size = 1 343 | 344 | # The Kubernetes namespace where airflow workers should be created. 
Defaults to `default` 345 | namespace = default 346 | 347 | # The name of the Kubernetes ConfigMap Containing the Airflow Configuration (this file) 348 | #airflow_configmap = /usr/local/airflow/airflow.cfg 349 | airflow_configmap = 350 | 351 | # For docker image already contains DAGs, this is set to `True`, and the worker will search for dags in dags_folder, 352 | # otherwise use git sync or dags volume claim to mount DAGs 353 | dags_in_image = True 354 | 355 | # The name of the Kubernetes service account to be associated with airflow workers, if any. 356 | # Service accounts are required for workers that require access to secrets or cluster resources. 357 | # See the Kubernetes RBAC documentation for more: 358 | # https://kubernetes.io/docs/admin/authorization/rbac/ 359 | worker_service_account_name = default 360 | 361 | # Any image pull secrets to be given to worker pods, If more than one secret is 362 | # required, provide a comma separated list: secret_a,secret_b 363 | image_pull_secrets = 364 | 365 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 366 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 367 | # It will raise an exception if called from a process not running in a kubernetes environment. 368 | in_cluster = True 369 | 370 | # Affinity configuration as a single line formatted JSON object. 371 | # See the affinity model for top-level key names (e.g. `nodeAffinity`, etc.): 372 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core 373 | affinity = 374 | 375 | # A list of toleration objects as a single line formatted JSON array 376 | # See: 377 | # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core 378 | tolerations = 379 | 380 | 381 | [kubernetes_node_selectors] 382 | # The Key-value pairs to be given to worker pods. 383 | # The worker pods will be scheduled to the nodes of the specified key-value pairs. 384 | # Should be supplied in the format: key = value 385 | 386 | [kubernetes_secrets] 387 | # The scheduler mounts the following secrets into your workers as they are launched by the 388 | # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the 389 | # defined secrets and mount them as secret environment variables in the launched workers. 
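# Each entry below uses the KubernetesExecutor secret format (inferred from the entries that follow):
#   ENV_VAR_NAME = <kubernetes-secret-name>=<key-within-that-secret>
# i.e. every variable here is read from a key of the same name in the `prod-airflow` secret.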
390 | 391 | # Airflow settings 392 | AIRFLOW__CORE__AIRFLOW_HOME = prod-airflow=AIRFLOW__CORE__AIRFLOW_HOME 393 | AIRFLOW__CORE__DAGS_FOLDER = prod-airflow=AIRFLOW__CORE__DAGS_FOLDER 394 | AIRFLOW__CORE__FERNET_KEY = prod-airflow=AIRFLOW__CORE__FERNET_KEY 395 | AIRFLOW__CORE__REMOTE_LOG_CONN_ID = prod-airflow=AIRFLOW__CORE__REMOTE_LOG_CONN_ID 396 | AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER = prod-airflow=AIRFLOW__CORE__REMOTE_BASE_LOG_FOLDER 397 | AIRFLOW__CORE__ENCRYPT_S3_LOGS = prod-airflow=AIRFLOW__CORE__ENCRYPT_S3_LOGS 398 | AIRFLOW__CORE__SQL_ALCHEMY_CONN = prod-airflow=AIRFLOW__CORE__SQL_ALCHEMY_CONN 399 | AIRFLOW__KUBERNETES__DAGS_IN_IMAGE = prod-airflow=AIRFLOW__KUBERNETES__DAGS_IN_IMAGE 400 | AIRFLOW_CONN_AWS_DEFAULT = prod-airflow=AIRFLOW_CONN_AWS_DEFAULT 401 | PYTHONPATH = prod-airflow=PYTHONPATH 402 | 403 | 404 | # ethereum-etl env vars 405 | CLOUD_PROVIDER = prod-airflow=CLOUD_PROVIDER 406 | OUTPUT_BUCKET = prod-airflow=OUTPUT_BUCKET 407 | AWS_ACCESS_KEY_ID = prod-airflow=AWS_ACCESS_KEY_ID 408 | AWS_SECRET_ACCESS_KEY = prod-airflow=AWS_SECRET_ACCESS_KEY 409 | DAGS_FOLDER = prod-airflow=DAGS_FOLDER 410 | REDSHIFT_SQL_FOLDER = prod-airflow=REDSHIFT_SQL_FOLDER 411 | EXPORT_BLOCKS_AND_TRANSACTIONS = prod-airflow=EXPORT_BLOCKS_AND_TRANSACTIONS 412 | EXPORT_RECEIPTS_AND_LOGS = prod-airflow=EXPORT_RECEIPTS_AND_LOGS 413 | EXTRACT_TOKEN_TRANSFERS = prod-airflow=EXTRACT_TOKEN_TRANSFERS 414 | EXPORT_CONTRACTS = prod-airflow=EXPORT_CONTRACTS 415 | EXPORT_TOKENS = prod-airflow=EXPORT_TOKENS 416 | EXPORT_TRACES = prod-airflow=EXPORT_TRACES 417 | NOTIFICATION_EMAILS = prod-airflow=NOTIFICATION_EMAILS 418 | EXPORT_MAX_WORKERS = prod-airflow=EXPORT_MAX_WORKERS 419 | EXPORT_BATCH_SIZE = prod-airflow=EXPORT_BATCH_SIZE 420 | WEB3_PROVIDER_URI_BACKUP = prod-airflow=WEB3_PROVIDER_URI_BACKUP 421 | WEB3_PROVIDER_URI_ARCHIVAL = prod-airflow=WEB3_PROVIDER_URI_ARCHIVAL 422 | DESTINATION_DATASET_PROJECT_ID = prod-airflow=DESTINATION_DATASET_PROJECT_ID -------------------------------------------------------------------------------- /airflow/dags/redshift/dags/increment_aggregates.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from airflow import models 4 | from airflow.hooks.postgres_hook import PostgresHook 5 | from airflow.operators.python_operator import PythonOperator 6 | from datetime import datetime, timedelta 7 | from time import mktime 8 | 9 | import logging 10 | import os 11 | 12 | logging.basicConfig() 13 | logging.getLogger().setLevel(logging.DEBUG) 14 | 15 | default_dag_args = { 16 | 'depends_on_past': False, 17 | 'start_date': datetime(2015, 7, 30), 18 | 'email_on_failure': True, 19 | 'email_on_retry': True, 20 | 'retries': 5, 21 | 'retry_delay': timedelta(minutes=5) 22 | } 23 | 24 | notification_emails = os.environ.get('NOTIFICATION_EMAILS') 25 | if notification_emails and len(notification_emails) > 0: 26 | default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')] 27 | 28 | dag = models.DAG( 29 | dag_id='redshift_increment_aggregates', 30 | # Daily at 2:00am 31 | schedule_interval='0 2 * * *', 32 | concurrency=1, 33 | max_active_runs=1, 34 | default_args=default_dag_args 35 | ) 36 | 37 | sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER', "/usr/local/airflow/dags/redshift/sql") 38 | if sql_folder is None: 39 | raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable") 40 | 41 | 42 | def run_sql(ds, **kwargs): 43 | conn_id = kwargs.get('conn_id') 44 | sql_file_path = 
kwargs.get('sql_file_path') 45 | pg_hook = PostgresHook(conn_id) 46 | 47 | # Get inclusive timestamp bounds of the execution day 48 | year, month, day = map(int, ds.split('-')) 49 | start_timestamp = int(mktime(datetime(year, month, day, 0, 0, 0).timetuple())) 50 | end_timestamp = int(mktime(datetime(year, month, day, 23, 59, 59).timetuple())) 51 | 52 | with open(sql_file_path, 'r') as sql_file: 53 | sql = sql_file.read().format( 54 | start_timestamp=start_timestamp, 55 | end_timestamp=end_timestamp 56 | ) 57 | pg_hook.run(sql) 58 | 59 | 60 | def add_refresh_task(task_id, sql_file_path, dependencies=None): 61 | 62 | operator = PythonOperator( 63 | task_id=task_id, 64 | dag = dag, 65 | python_callable=run_sql, 66 | provide_context=True, 67 | op_kwargs={ 68 | 'conn_id' : 'redshift', 69 | 'sql_file_path' : sql_file_path 70 | }, 71 | ) 72 | if dependencies is not None and len(dependencies) > 0: 73 | for dependency in dependencies: 74 | if dependency is not None: 75 | dependency >> operator 76 | return operator 77 | 78 | 79 | transaction_metrics_operator = add_refresh_task( 80 | 'aggregate_transaction_metrics_by_block', 81 | sql_folder + '/increment/aggregate_transaction_metrics_by_block.sql' 82 | ) 83 | 84 | transaction_metrics_operator = add_refresh_task( 85 | 'aggregate_metrics_by_day', 86 | sql_folder + '/increment/aggregate_metrics_by_day.sql', 87 | dependencies=[transaction_metrics_operator] 88 | ) 89 | -------------------------------------------------------------------------------- /airflow/dags/redshift/dags/refresh_aggregates.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | from airflow import models 4 | from airflow.hooks.postgres_hook import PostgresHook 5 | from airflow.operators.python_operator import PythonOperator 6 | from datetime import datetime, timedelta 7 | 8 | import logging 9 | import os 10 | 11 | logging.basicConfig() 12 | logging.getLogger().setLevel(logging.DEBUG) 13 | 14 | default_dag_args = { 15 | 'depends_on_past': False, 16 | 'start_date': datetime(2019, 2, 1), 17 | 'email_on_failure': True, 18 | 'email_on_retry': True, 19 | 'retries': 5, 20 | 'retry_delay': timedelta(minutes=5) 21 | } 22 | 23 | notification_emails = os.environ.get('NOTIFICATION_EMAILS') 24 | if notification_emails and len(notification_emails) > 0: 25 | default_dag_args['email'] = [email.strip() for email in notification_emails.split(',')] 26 | 27 | dag = models.DAG( 28 | dag_id='redshift_refresh_aggregates', 29 | schedule_interval=None, 30 | concurrency=1, 31 | max_active_runs=1, 32 | default_args=default_dag_args 33 | ) 34 | 35 | sql_folder = os.environ.get('REDSHIFT_SQL_FOLDER', "/usr/local/airflow/dags/redshift/sql") 36 | if sql_folder is None: 37 | raise ValueError("You must set REDSHIFT_SQL_FOLDER environment variable") 38 | 39 | 40 | def run_sql(**kwargs): 41 | conn_id = kwargs.get('conn_id') 42 | sql_file_path = kwargs.get('sql_file_path') 43 | pg_hook = PostgresHook(conn_id) 44 | 45 | with open(sql_file_path, 'r') as sql_file: 46 | sql = sql_file.read() 47 | pg_hook.run(sql) 48 | 49 | 50 | def add_refresh_task(task_id, sql_file_path, dependencies=None): 51 | 52 | operator = PythonOperator( 53 | task_id=task_id, 54 | dag = dag, 55 | python_callable=run_sql, 56 | provide_context=True, 57 | op_kwargs={ 58 | 'conn_id' : 'redshift', 59 | 'sql_file_path' : sql_file_path 60 | }, 61 | ) 62 | if dependencies is not None and len(dependencies) > 0: 63 | for dependency in dependencies: 64 | if dependency is not None: 
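                # '>>' sets Airflow task ordering: the dependency runs upstream of this operator.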
65 | dependency >> operator 66 | return operator 67 | 68 | 69 | transaction_metrics_operator = add_refresh_task( 70 | 'aggregate_transaction_metrics_by_block', 71 | sql_folder + '/refresh/aggregate_transaction_metrics_by_block.sql' 72 | ) 73 | 74 | transaction_metrics_operator = add_refresh_task( 75 | 'aggregate_metrics_by_day', 76 | sql_folder + '/refresh/aggregate_metrics_by_day.sql', 77 | dependencies=[transaction_metrics_operator] 78 | ) 79 | -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/increment/aggregate_metrics_by_day.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Join at the block level and aggregate by day. 4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_metrics_by_day_incr_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_metrics_by_day_incr_tmp 9 | (LIKE ethereum.aggregate_metrics_by_day); 10 | 11 | INSERT INTO ethereum.aggregate_metrics_by_day_incr_tmp 12 | SELECT 13 | DATE_TRUNC('day', TIMESTAMP 'epoch' + blocks.timestamp * INTERVAL '1 second') 14 | AS day, 15 | 16 | COUNT(DISTINCT blocks.hash) AS blocks_cnt, 17 | COUNT(DISTINCT blocks.miner) AS unique_miners, 18 | AVG(blocks.difficulty) AS median_difficulty, 19 | MAX(blocks.total_difficulty) AS cumulative_difficulty, 20 | SUM(blocks.size) AS total_blocksize_bytes, 21 | SUM(blocks.gas_used) AS gas_used, 22 | SUM(transaction_count) AS transactions_cnt_from_blocks, 23 | SUM(transactions_cnt) AS transactions_cnt, 24 | SUM(new_addresses) AS new_addresses, 25 | SUM(unique_senders) AS unique_senders, 26 | SUM(unique_receivers) AS unique_receivers, 27 | SUM(value_transferred_wei) AS value_transferred_wei, 28 | SUM(value_transferred_eth) AS value_transferred_eth, 29 | SUM(total_gas_provided) AS total_gas_provided 30 | FROM 31 | ethereum.blocks AS blocks 32 | INNER JOIN 33 | ethereum.aggregate_transaction_metrics_by_block AS transactions 34 | ON 35 | blocks.number = transactions.block_number 36 | WHERE 37 | timestamp BETWEEN {start_timestamp} AND {end_timestamp} 38 | GROUP BY 39 | day 40 | ORDER BY 41 | day DESC; 42 | 43 | 44 | BEGIN TRANSACTION; 45 | 46 | DELETE FROM ethereum.aggregate_metrics_by_day 47 | USING ethereum.aggregate_metrics_by_day_incr_tmp 48 | WHERE 49 | ethereum.aggregate_metrics_by_day.day = ethereum.aggregate_metrics_by_day_incr_tmp.day; 50 | 51 | INSERT INTO ethereum.aggregate_metrics_by_day 52 | SELECT * FROM ethereum.aggregate_metrics_by_day_incr_tmp; 53 | 54 | END TRANSACTION; 55 | 56 | DROP TABLE ethereum.aggregate_metrics_by_day_incr_tmp; -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/increment/aggregate_transaction_metrics_by_block.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Rollup our transactions to the block level. 
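-- The {start_timestamp} and {end_timestamp} placeholders below are filled in by run_sql()
-- in increment_aggregates.py with the inclusive timestamp bounds of the DAG execution day.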
4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_transaction_metrics_by_block_incr_tmp 9 | (LIKE ethereum.aggregate_transaction_metrics_by_block); 10 | 11 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block_incr_tmp 12 | SELECT 13 | transactions.block_number AS block_number, 14 | COUNT(DISTINCT transactions.hash) AS transactions_cnt, 15 | 16 | COUNT(DISTINCT 17 | CASE 18 | WHEN transactions.nonce = 0 THEN from_address 19 | ELSE NULL 20 | END 21 | ) AS new_addresses, 22 | 23 | COUNT(DISTINCT transactions.from_address) AS unique_senders, 24 | COUNT(DISTINCT transactions.to_address) AS unique_receivers, 25 | SUM(transactions.value) AS value_transferred_wei, 26 | 27 | SUM(transactions.value)::NUMERIC(32, 6) / POWER(10, 18)::NUMERIC(32, 6) 28 | AS value_transferred_eth, 29 | 30 | SUM(transactions.gas) AS total_gas_provided, 31 | AVG(transactions.gas_price) AS avg_gas_price 32 | FROM 33 | ethereum.transactions AS transactions 34 | WHERE 35 | transactions.block_number BETWEEN 36 | (SELECT MIN(number) FROM ethereum.blocks WHERE timestamp >= {start_timestamp}) 37 | AND 38 | (SELECT MAX(number) FROM ethereum.blocks WHERE timestamp <= {end_timestamp}) 39 | GROUP BY 40 | transactions.block_number 41 | ORDER BY 42 | transactions.block_number ASC; 43 | 44 | 45 | BEGIN TRANSACTION; 46 | 47 | DELETE FROM ethereum.aggregate_transaction_metrics_by_block 48 | USING ethereum.aggregate_transaction_metrics_by_block_incr_tmp 49 | WHERE 50 | ethereum.aggregate_transaction_metrics_by_block.block_number = 51 | ethereum.aggregate_transaction_metrics_by_block_incr_tmp.block_number; 52 | 53 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block 54 | SELECT * FROM ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 55 | 56 | END TRANSACTION; 57 | 58 | DROP TABLE ethereum.aggregate_transaction_metrics_by_block_incr_tmp; 59 | -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/refresh/aggregate_metrics_by_day.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Join at the block level and aggregate by day. 
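-- Full refresh: recompute the aggregate from the entire history into a _tmp table,
-- then swap it in with the DROP + RENAME transaction at the bottom of this file.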
4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_metrics_by_day_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_metrics_by_day_tmp 9 | (LIKE ethereum.aggregate_metrics_by_day); 10 | 11 | INSERT INTO ethereum.aggregate_metrics_by_day_tmp 12 | SELECT 13 | DATE_TRUNC('day', TIMESTAMP 'epoch' + blocks.timestamp * INTERVAL '1 second') 14 | AS day, 15 | 16 | COUNT(DISTINCT blocks.hash) AS blocks_cnt, 17 | COUNT(DISTINCT blocks.miner) AS unique_miners, 18 | AVG(blocks.difficulty) AS median_difficulty, 19 | MAX(blocks.total_difficulty) AS cumulative_difficulty, 20 | SUM(blocks.size) AS total_blocksize_bytes, 21 | SUM(blocks.gas_used) AS gas_used, 22 | SUM(transaction_count) AS transactions_cnt_from_blocks, 23 | SUM(transactions_cnt) AS transactions_cnt, 24 | SUM(new_addresses) AS new_addresses, 25 | SUM(unique_senders) AS unique_senders, 26 | SUM(unique_receivers) AS unique_receivers, 27 | SUM(value_transferred_wei) AS value_transferred_wei, 28 | SUM(value_transferred_eth) AS value_transferred_eth, 29 | SUM(total_gas_provided) AS total_gas_provided 30 | FROM 31 | ethereum.blocks AS blocks 32 | INNER JOIN 33 | ethereum.aggregate_transaction_metrics_by_block AS transactions 34 | ON 35 | blocks.number = transactions.block_number 36 | GROUP BY 37 | day 38 | ORDER BY 39 | day DESC; 40 | 41 | BEGIN; 42 | DROP TABLE ethereum.aggregate_metrics_by_day; 43 | ALTER TABLE ethereum.aggregate_metrics_by_day_tmp RENAME TO aggregate_metrics_by_day; 44 | COMMIT; -------------------------------------------------------------------------------- /airflow/dags/redshift/sql/refresh/aggregate_transaction_metrics_by_block.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Rollup our transactions to the block level. 4 | -- 5 | 6 | DROP TABLE IF EXISTS ethereum.aggregate_transaction_metrics_by_block_tmp; 7 | 8 | CREATE TABLE ethereum.aggregate_transaction_metrics_by_block_tmp 9 | (LIKE ethereum.aggregate_transaction_metrics_by_block); 10 | 11 | INSERT INTO ethereum.aggregate_transaction_metrics_by_block_tmp 12 | SELECT 13 | transactions.block_number AS block_number, 14 | COUNT(DISTINCT transactions.hash) AS transactions_cnt, 15 | 16 | COUNT(DISTINCT 17 | CASE 18 | WHEN transactions.nonce = 0 THEN from_address 19 | ELSE NULL 20 | END 21 | ) AS new_addresses, 22 | 23 | COUNT(DISTINCT transactions.from_address) AS unique_senders, 24 | COUNT(DISTINCT transactions.to_address) AS unique_receivers, 25 | SUM(transactions.value) AS value_transferred_wei, 26 | 27 | SUM(transactions.value)::NUMERIC(32, 6) / POWER(10, 18)::NUMERIC(32, 6) 28 | AS value_transferred_eth, 29 | 30 | SUM(transactions.gas) AS total_gas_provided, 31 | AVG(transactions.gas_price) AS avg_gas_price 32 | FROM 33 | ethereum.transactions AS transactions 34 | GROUP BY 35 | transactions.block_number 36 | ORDER BY 37 | transactions.block_number ASC; 38 | 39 | BEGIN; 40 | DROP TABLE ethereum.aggregate_transaction_metrics_by_block; 41 | ALTER TABLE ethereum.aggregate_transaction_metrics_by_block_tmp RENAME TO aggregate_transaction_metrics_by_block; 42 | COMMIT; 43 | -------------------------------------------------------------------------------- /airflow/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CMD="airflow" 4 | TRY_LOOP="${TRY_LOOP:-10}" 5 | POSTGRES_HOST="${POSTGRES_HOST:-postgres}" 6 | POSTGRES_PORT=5432 7 | POSTGRES_CREDS="${POSTGRES_CREDS:-airflow:airflow}" 8 | AIRFLOW_URL_PREFIX="${AIRFLOW_URL_PREFIX:-}" 9 | 10 | sed -i "s/{{ 
POSTGRES_HOST }}/${POSTGRES_HOST}/" ${AIRFLOW_HOME}/airflow.cfg 11 | sed -i "s/{{ POSTGRES_CREDS }}/${POSTGRES_CREDS}/" ${AIRFLOW_HOME}/airflow.cfg 12 | sed -i "s#{{ AIRFLOW_URL_PREFIX }}#${AIRFLOW_URL_PREFIX}#" ${AIRFLOW_HOME}/airflow.cfg 13 | 14 | # ethereum-etl 15 | export CLOUD_PROVIDER="aws" 16 | export OUTPUT_BUCKET="insight-prod-ethereum-etl-output" 17 | export EXPORT_BLOCKS_AND_TRANSACTIONS=True 18 | export EXPORT_RECEIPTS_AND_LOGS=True 19 | export EXTRACT_TOKEN_TRANSFERS=True 20 | export EXPORT_CONTRACTS=True 21 | export EXPORT_TOKENS=True 22 | export EXPORT_TRACES=False 23 | export NOTIFICATION_EMAILS="webster@iteriodata.com" 24 | export EXPORT_MAX_WORKERS=4 25 | export EXPORT_BATCH_SIZE=10 26 | export WEB3_PROVIDER_URI_BACKUP="https://mainnet.infura.io" 27 | export WEB3_PROVIDER_URI_ARCHIVAL="https://mainnet.infura.io" 28 | export DESTINATION_DATASET_PROJECT_ID="test" 29 | export DAGS_FOLDER="/usr/local/airflow/dags/ethereum-etl-airflow/dags" 30 | export PYTHONPATH="${PYTHONPATH}:/${DAGS_FOLDER}" 31 | 32 | 33 | # Install custom python package if requirements.txt is present 34 | if [ -e "/requirements.txt" ]; then 35 | $(which pip) install --user -r /requirements.txt 36 | fi 37 | 38 | # wait for postgres 39 | sleep 60 40 | #if [ "$1" = "webserver" ] || [ "$1" = "worker" ] || [ "$1" = "scheduler" ] ; then 41 | # i=0 42 | # while ! nc $POSTGRES_HOST $POSTGRES_PORT >/dev/null 2>&1 < /dev/null; do 43 | # i=`expr $i + 1` 44 | # if [ $i -ge $TRY_LOOP ]; then 45 | # echo "$(date) - ${POSTGRES_HOST}:${POSTGRES_PORT} still not reachable, giving up" 46 | # exit 1 47 | # fi 48 | # echo "$(date) - waiting for ${POSTGRES_HOST}:${POSTGRES_PORT}... $i/$TRY_LOOP" 49 | # sleep 5 50 | # done 51 | 52 | case "$1" in 53 | webserver) 54 | sleep 10 55 | echo "Initialize database..." 56 | # # TODO: move to a Helm hook 57 | # https://github.com/kubernetes/helm/blob/master/docs/charts_hooks.md 58 | $CMD initdb 59 | # https://github.com/apache/airflow/blob/master/docs/security.rst 60 | airflow users --create --username admin --password password --role Admin --email webster@iteriodata.com --firstname Webster --lastname Cook 61 | exec $CMD webserver 62 | ;; 63 | worker) 64 | # To give the webserver time to run initdb. 65 | sleep 30 66 | exec $CMD "$@" 67 | ;; 68 | scheduler) 69 | # To give the webserver time to run initdb. 70 | sleep 30 71 | # Via Tobias Kaymak 72 | # https://github.com/puckel/docker-airflow/issues/55 73 | while echo "Running Scheduler"; do 74 | # See https://airflow.apache.org/cli.html#scheduler 75 | airflow scheduler 76 | exitcode=$? 77 | if [ $exitcode -ne 0 ]; then 78 | echo "ERROR: Scheduler exited with exit code $exitcode." 79 | echo $(date) 80 | exit $exitcode 81 | fi 82 | sleep 30 83 | done 84 | ;; 85 | *) 86 | # The command is something like bash, not an airflow subcommand. Just run it in the right environment. 87 | exec "$@" 88 | ;; 89 | esac 90 | -------------------------------------------------------------------------------- /docs/challenges.md: -------------------------------------------------------------------------------- 1 | 2 | ## Engineering Challenges 3 | 4 | #### Scalability / Resource Utilization 5 | 6 | Exporting historical blockchain data requires scaling the Airflow / Kubernetes 7 | cluster. On our first run, utilizing 3 x m4.large instances, it took 4 days to 8 | export the Ethereum blockchain at a rate of 10 GB / hour.
The Kubernetes nodes 9 | were fully utilizing CPU resources during that time: 10 | 11 | ![eks_worker_no_autoscaling](img/eks_worker_no_autoscaling.png) 12 | 13 | This was handled by scaling the Kubernetes cluster and increasing concurrency 14 | of Airflow workers. The following parameters were modified in the Airflow 15 | configuration: 16 | 17 | ``` 18 | # The amount of parallelism as a setting to the executor. This defines 19 | # the max number of task instances that should run simultaneously 20 | # on this airflow installation 21 | parallelism = 128 22 | 23 | # The number of task instances allowed to run concurrently by the scheduler 24 | dag_concurrency = 128 25 | 26 | # The maximum number of active DAG runs per DAG 27 | max_active_runs_per_dag = 128 28 | ``` 29 | 30 | In addition, a "target tracking" EC2 autoscaling policy was implemented using Terraform: 31 | 32 | ``` 33 | # 34 | # Scales the EKS Worker autoscaling group based on CPU utilization 35 | # 36 | resource "aws_autoscaling_policy" "eks_workers" { 37 | name = "${var.project}-${var.environment}-eks-worker-cpu-target-tracking" 38 | autoscaling_group_name = "${element(module.eks.workers_asg_names, 0)}" 39 | adjustment_type = "ChangeInCapacity" 40 | policy_type = "TargetTrackingScaling" 41 | estimated_instance_warmup = 300 42 | 43 | target_tracking_configuration { 44 | predefined_metric_specification { 45 | predefined_metric_type = "ASGAverageCPUUtilization" 46 | } 47 | target_value = 70.0 48 | } 49 | } 50 | 51 | ``` 52 | 53 | This mechanism was used to temporarily scale from 3 to 10 EC2 instances while 54 | exporting the historical data. In this case it took 10 hours at 100 GB / hour. 55 | This chart shows the ramp-up in CPU usage followed by a drop when the scaling 56 | occurs: 57 | 58 | ![eks_worker_autoscaling](img/eks_worker_autoscaling.png) 59 | 60 | Once the historical data is exported, the additional nodes are removed from the 61 | cluster to reduce costs. 62 | 63 | The latency of the scheduler made it difficult to reach the CPU target of 70% 64 | (i.e. the scheduler couldn't launch tasks quickly enough). Apparently the latency 65 | can be decreased by changing config settings such as `scheduler_heartbeat_sec`, 66 | but this requires additional testing. 67 | 68 | Another open problem is having the Airflow scheduler back off if spawning 69 | additional tasks will exceed the cluster's resource capacity. Running a large 70 | number of idle nodes will lead to unacceptable costs. Furthermore, the 71 | Kubernetes API appears to drop requests when cluster resources are exhausted. 72 | This can cause issues when additional nodes are attempting to join the cluster. 73 | 74 | 75 | #### DAG deployments 76 | 77 | There are operational trade-offs associated with the DAG deployment process. 78 | The following methodologies are most prevalent: 79 | 80 | 1. Syncing a shared volume with remote storage such as git or S3 81 | 2. "Pre-baked" DAGs deployed w/ the airflow container. 82 | 83 | Our implementation uses "pre-baked" DAGs and the `dags_in_image` configuration 84 | option. The motivation here is to avoid errors related to syncing files at 85 | runtime. The downside is that this approach is unlikely to scale as well in 86 | large organizations with thousands of DAGs.
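One way to sanity-check a pre-baked image (a hypothetical snippet, not part of this repo) is to load the DagBag inside the container and fail fast on import errors:

```
# Hypothetical check, run inside the Airflow image, e.g.:
#   docker run --rm <image> python /tmp/check_dags.py
# It confirms that the DAGs baked into the image import cleanly.
from airflow.models import DagBag

dag_bag = DagBag(dag_folder="/usr/local/airflow/dags", include_examples=False)

if dag_bag.import_errors:
    raise SystemExit("DAG import errors: {}".format(dag_bag.import_errors))

print("Loaded DAGs: {}".format(sorted(dag_bag.dag_ids)))
```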
87 | 88 | 89 | #### DAG task visibility 90 | 91 | The initial version of this project used the 92 | [KubernetesPodOperator](https://airflow.apache.org/kubernetes.html?highlight=kubernetes%20pod%20operator#airflow.contrib.operators.kubernetes_pod_operator.KubernetesPodOperator) 93 | to spin up ethereum-etl containers. The intention was to avoid mixing code for 94 | orchestration and blockchain data export: with a specialized container for 95 | exporting data from each blockchain (Bitcoin, Ethereum, Monero, etc.), 96 | dependencies for each blockchain export would be managed separately and debugging 97 | would be simplified. 98 | 99 | Although this approach could allow a large development team to collaborate more 100 | easily, the downside is that Airflow has limited visibility into the processes 101 | running within the container. For example, a container could run 3 steps of 102 | the data export and the 2nd could fail, but it could still show up in Airflow 103 | as successful and not trigger any alerts. 104 | 105 | Since the ethereum-etl container didn't have clear success/failure exit 106 | statuses that would bubble up to Airflow, it made more sense to 107 | [fork](https://github.com/iter-io/ethereum-etl-airflow) the ethereum-etl-airflow 108 | project and run tasks using the PythonOperator / KubernetesExecutor. This 109 | task visibility trade-off also motivated the use of raw SQL for updating Redshift 110 | tables instead of using [DBT](https://www.getdbt.com/). 111 | 112 | 113 | #### Incremental updates in Redshift 114 | 115 | Our use case requires joining and aggregating time series data to create metrics 116 | for end users. For example, we need to aggregate blocks and transactions then 117 | join the result. This is expensive because the transactions table is ~400 million 118 | rows and growing. Using a single "dense storage" Redshift node (ds2.xlarge), it 119 | takes about an hour to aggregate and join the full historical dataset. 120 | 121 | We cannot cost-effectively do a full refresh on these data models many times per 122 | day, so it's necessary to incrementally update them as the data is ingested. 123 | 124 | Initially, the incremental update on one day of data had a runtime of ~20 minutes 125 | due to a sequential scan on the transactions table. This would grow linearly 126 | with the size of the table. The performance of this query was improved by selecting 127 | appropriate sortkeys for the blocks and transactions tables. The `timestamp` column 128 | is used as a sortkey on the blocks table and `block_number` is used as a sortkey 129 | on the transactions table. After reloading the tables with these sortkeys in place, 130 | the incremental update on one day of data had a runtime of 2 minutes (10x improvement). 131 | 132 | ![runtime daily incremental update](img/runtime_of_daily_incremental_update.png) 133 | 134 | ![range restricted scan](img/range_restricted_scan.png) 135 | 136 | 137 | #### Data security 138 | 139 | There is a growing demand for joining public blockchain data on proprietary 140 | datasets. As a result, security features must be sufficient for protecting 141 | proprietary data stored on the platform. 142 | 143 | During the initial setup of Kubernetes / EKS, we identified the need to 144 | place some Kubernetes services in a public subnet with load balancers 145 | accessible from the public internet (e.g. ethereum nodes), while other 146 | services should run within a private subnet with internal load balancers 147 | (e.g. Airflow).
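As a rough sketch (not code from this repo; the service name and annotation value are assumptions), such an internal-load-balancer annotation can be applied with the kubernetes Python client that is already installed in the Airflow image:

```
# Hedged sketch: annotate a Service so AWS provisions an internal load balancer.
# "airflow-webserver" and the annotation value are assumptions; the real manifests
# live under k8s/services/.
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod

patch = {"metadata": {"annotations": {
    "service.beta.kubernetes.io/aws-load-balancer-internal": "0.0.0.0/0"
}}}

client.CoreV1Api().patch_namespaced_service(
    name="airflow-webserver", namespace="default", body=patch
)
```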
148 | 
149 | The following references were used when setting up the annotations that
150 | select a subnet:
151 | 
152 | * https://github.com/kubernetes/kubernetes/pull/22064/files#diff-07ba008af9c76b0539556ff7fde3105eR62
153 | * https://github.com/terraform-aws-modules/terraform-aws-eks/issues/15
154 | 
155 | We specifically expect security issues related to protecting our network
156 | while receiving data on our blockchain nodes. Running these nodes will
157 | announce our endpoints to potential adversaries. In the event one of our
158 | nodes is compromised, we must control access to services such as S3 and
159 | the Kubernetes cluster management API. These policies are implemented
160 | using Kubernetes RBAC and AWS IAM role configuration.
161 | 
162 | 
163 | 
-------------------------------------------------------------------------------- /docs/getting_started.md: --------------------------------------------------------------------------------
1 | ## Getting Started
2 | 
3 | 
4 | #### Step #1 - Install Docker
5 | 
6 | Mac OS X:
7 | 1. `brew install docker`
8 | 
9 | Ubuntu Linux:
10 | 1. Install [Docker CE](https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/)
11 | 
12 | 2. To use docker without root privileges, create a docker group:
13 | 
14 | ```bash
15 | sudo groupadd docker
16 | ```
17 | 
18 | Then add your user to this group:
19 | ```bash
20 | sudo usermod -aG docker $USER
21 | ```
22 | 
23 | You can read more in the [official docker documentation](https://docs.docker.com/install/linux/linux-postinstall/#manage-docker-as-a-non-root-user).
24 | 
25 | 3. Log out and then log in again to re-evaluate your groups.
26 | 
27 | 
28 | #### Step #2 - Install the aws-cli
29 | 
30 | 
31 | 1. Install the [AWS-CLI](http://docs.aws.amazon.com/cli/latest/userguide/awscli-install-linux.html).
32 | 
33 | 2. Download your AWS access key ID and secret access key from the IAM console.
34 | 
35 | 3. Run `aws configure` and input your credentials.
36 | 
37 | 4. Now you can use your Docker client to authenticate with [AWS ECR](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html):
38 | 
39 | ```bash
40 | eval $(aws ecr get-login --no-include-email)
41 | ```
42 | 
43 | You should also add this alias to your bash profile:
44 | ```bash
45 | alias ecrlogin='eval $(aws ecr get-login --no-include-email)'
46 | ```
47 | 
48 | In the future you can authenticate yourself with ECR using this command:
49 | ```bash
50 | ecrlogin
51 | ```
52 | 
53 | 
54 | #### Step #3 - Install Terraform
55 | 
56 | Follow the [getting started guide](https://learn.hashicorp.com/terraform/getting-started/install.html#installing-terraform)
57 | from Hashicorp.
58 | 
59 | 
60 | #### Step #4 - Install kubectl
61 | 
62 | Follow the [task doc](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
63 | on kubernetes.io.
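
Before moving on, it is worth confirming that each tool is actually on your `PATH`; a quick sanity check (reported versions will vary) looks like this:

```bash
# Verify the toolchain installed in Steps 1-4
docker --version
aws --version
terraform version
kubectl version --client
```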
64 | 
65 | 
66 | #### Step #5 - Use Terraform to provision the base environment and prod network
67 | 
68 | Apply the following Terraform configs:
69 | 
70 | * environments
71 | * base
72 | * ops
73 | * [ecr](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/base/ops/ecr/ecr.tf)
74 | * prod
75 | * compute
76 | * [eks](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/compute/eks/eks.tf)
77 | * network
78 | * [bastion](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/bastion/bastion.tf)
79 | * [vpc](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/vpc/vpc.tf)
80 | 
81 | Here are example commands for the EKS config:
82 | 
83 | ```
84 | cd terraform/environments/prod/compute/eks
85 | terraform init
86 | terraform apply
87 | ```
88 | 
89 | Repeat this for each of the configs above.
90 | 
91 | 
92 | #### Step #6 - Set up EKS / Kubernetes
93 | 
94 | Follow the [setup guide](https://github.com/iter-io/security-token-analytics/blob/master/k8s/README.md)
95 | in this repo.
96 | 
97 | 
98 | #### Step #7 - Build and deploy the Airflow container
99 | 
100 | Run `make build_docker` in the project root.
101 | 
102 | 
103 | #### Step #8 - Use Terraform to provision the prod databases and services
104 | 
105 | * environments
106 | * prod
107 | * data
108 | * [rds-airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-airflow/rds-airflow.tf)
109 | * [rds-grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-grafana/rds-grafana.tf)
110 | * [rds-superset](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-superset/rds-superset.tf)
111 | * [redshift](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/redshift/redshift.tf)
112 | * services
113 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/airflow/airflow.tf)
114 | * [grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/grafana/grafana.tf)
115 | 
116 | 
117 | #### Step #9 - Create the schema and users in Redshift
118 | 
119 | 1. Use psql to create the Redshift schemas:
120 | 
121 | * [ethereum schema](https://github.com/iter-io/ethereum-etl-airflow/blob/feature-aws/dags/resources/stages/raw/schemas_redshift/schema.sql)
122 | * [3rd party schemas](https://github.com/iter-io/security-token-analytics/tree/master/redshift/schema)
123 | 
124 | 2. Use psql to run [users.sql](https://github.com/iter-io/security-token-analytics/blob/master/redshift/users.sql)
125 | to set up the Redshift user accounts.
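
For example, the 3rd party schemas and user accounts can be applied from the repo root as shown below. The cluster endpoint, database name, and master user are placeholders; use the values from your Redshift Terraform config and Parameter Store, and psql will prompt for the password:

```bash
# Placeholders: substitute your own Redshift endpoint, database, and master user
psql -h <redshift-cluster-endpoint> -p 5439 -U <master-username> -d <database> \
    -f redshift/schema/coinmarketcap.sql
psql -h <redshift-cluster-endpoint> -p 5439 -U <master-username> -d <database> \
    -f redshift/users.sql
```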
-------------------------------------------------------------------------------- /docs/img/dags.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/dags.png -------------------------------------------------------------------------------- /docs/img/eks_worker_autoscaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/eks_worker_autoscaling.png -------------------------------------------------------------------------------- /docs/img/eks_worker_no_autoscaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/eks_worker_no_autoscaling.png -------------------------------------------------------------------------------- /docs/img/grafana_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/grafana_dashboard.png -------------------------------------------------------------------------------- /docs/img/kubernetes_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/kubernetes_dashboard.png -------------------------------------------------------------------------------- /docs/img/range_restricted_scan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/range_restricted_scan.png -------------------------------------------------------------------------------- /docs/img/runtime_of_daily_incremental_update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/runtime_of_daily_incremental_update.png -------------------------------------------------------------------------------- /docs/img/tech_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iter-io/security-token-analytics/2cb855916ae5dc3ae77912835cb7884ef6532419/docs/img/tech_stack.png -------------------------------------------------------------------------------- /docs/tech_stack.md: -------------------------------------------------------------------------------- 1 | 2 | ## Tech Stack 3 | 4 | #### Kubernetes / EKS 5 | 6 | Kubernetes provides cluster orchestration services for running the apps in our 7 | data pipeline. This allows us to scale out ETL workloads and maintain availability 8 | of services such as the Airflow scheduler and webserver. 9 | 10 | We utilize [Amazon's EKS service](https://aws.amazon.com/eks/) for a managed 11 | Kubernetes control plane. 
The Kubernetes cluster is scaled using an [EC2
12 | Autoscaling Group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/AutoScalingGroup.html)
13 | in conjunction with a [scaling policy](https://docs.aws.amazon.com/autoscaling/ec2/userguide/scaling_plan.html).
14 | Our cluster scales in response to high CPU usage and attempts to maintain a target
15 | utilization of 70% by adding and removing nodes from the cluster.
16 | 
17 | 
18 | #### Airflow
19 | 
20 | Airflow serves as a scheduler for coordinating the following pipeline tasks:
21 | 
22 | 1. Running tasks that export data from Ethereum nodes using the JSON RPC API and
23 | then upload the output to S3.
24 | 2. Loading output data from S3 into Redshift using the COPY command.
25 | 3. Executing SQL in Redshift to update our data models.
26 | 
27 | The Dockerfile in the root of this repository is used to build a Docker image
28 | for Airflow that contains all dependencies and DAGs. This same image is used for
29 | the scheduler, webserver, and workers.
30 | 
31 | The Airflow scheduler and webserver are deployed on Kubernetes to make them
32 | highly available. In the event one of these processes fails, Kubernetes will
33 | launch new pods in an attempt to keep them running. This setup could also be
34 | used to scale out the webserver service if necessary in a large organization.
35 | 
36 | A Postgres RDS instance is used as the backend database for Airflow, so our
37 | scheduler and webserver containers are stateless and can be redeployed as
38 | needed.
39 | 
40 | Airflow workers are each launched in their own pod using the [Kubernetes
41 | Executor](https://airflow.readthedocs.io/en/stable/kubernetes.html). Due to
42 | the experimental status of Airflow's Kubernetes functionality, we are building
43 | our image directly from the Airflow master branch (instead of a tagged release).
44 | 
45 | 
46 | #### go-ethereum
47 | 
48 | [go-ethereum](https://github.com/ethereum/go-ethereum/wiki/Geth) is an Ethereum
49 | implementation written in Go. We run a "full archive node" containing a complete
50 | history of blockchain transactions. Then we export data via the
51 | [JSON RPC API](https://github.com/ethereum/wiki/wiki/JSON-RPC). The goal of
52 | using Kubernetes to run go-ethereum is to scale the data export process by storing
53 | multiple copies of the blockchain and load balancing the JSON RPC API.
54 | 
55 | 
56 | #### blockchain-etl
57 | 
58 | We utilize code from the [ethereum-etl project](https://github.com/blockchain-etl)
59 | to export raw data from the blockchain. We [forked](https://github.com/iter-io/ethereum-etl-airflow)
60 | the project's Airflow DAGs for GCP and added support for using them with S3 and
61 | Redshift on AWS. This minimized time to market by leveraging an existing
62 | high-quality codebase (credit goes to [Evgeny Medvedev](https://github.com/medvedev1088)
63 | and [Allen Day](https://github.com/allenday)).
64 | 
65 | 
66 | #### S3
67 | S3 serves as a ["data lake"](https://aws.amazon.com/big-data/datalakes-and-analytics/what-is-a-data-lake/)
68 | for storing exported blockchain data. Data is loaded from S3 into Redshift using
69 | the [COPY command](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html).
70 | 
71 | 
72 | #### Redshift
73 | 
74 | Redshift is used as a [data warehouse](https://aws.amazon.com/data-warehouse/).
75 | This allows us to build useful data models and run interactive queries on
76 | them. Redshift was chosen because the columnar storage format is well-suited
77 | for aggregate queries on the historical data. We expect this will be an
78 | important access pattern.
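
The aggregate data models themselves are defined as raw SQL under `airflow/dags/redshift/sql/`. As an illustrative sketch only (the table and column names below follow the ethereum-etl layout but are simplified, not the exact production schema), the kind of daily rollup this design targets looks like:

```sql
-- Illustrative daily rollup joining blocks and transactions.
-- The real models live in airflow/dags/redshift/sql/refresh/ and .../increment/.
SELECT
    DATE_TRUNC('day', b."timestamp") AS day,
    COUNT(t.hash)                    AS transaction_count,
    SUM(t.value)                     AS total_value_transferred
FROM ethereum.blocks b
JOIN ethereum.transactions t
    ON t.block_number = b.number
GROUP BY 1
ORDER BY 1;
```

Because `timestamp` and `block_number` are the sortkeys on the blocks and transactions tables, the incremental variant of this query can restrict both scans to a narrow range instead of reading the full history.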
79 | 
80 | 
81 | #### Grafana
82 | 
83 | Grafana is used for data visualization and presentation. We expect our end
84 | users to be familiar with SQL, so, starting from our examples, they should be able
85 | to combine simple SQL queries into shareable dashboards. The goal is to build a
86 | "self-service" system where end users can assist us in identifying important
87 | metrics and improving the Redshift data models.
88 | 
89 | 
90 | #### Terraform
91 | 
92 | Terraform modules are used to provision the following AWS resources:
93 | 
94 | 1. VPC components
95 | 2. Bastion server
96 | 3. EKS cluster
97 | 4. S3 buckets
98 | 5. RDS instances
99 | 6. Redshift cluster
100 | 
101 | 
102 | #### Security
103 | 
104 | AWS IAM, VPC security groups, and Kubernetes RBAC Authorization are used to
105 | protect AWS and Kubernetes resources. The [AWS Parameter Store](https://docs.aws.amazon.com/systems-manager/latest/userguide/systems-manager-paramstore.html)
106 | and [Kubernetes secrets](https://kubernetes.io/docs/concepts/configuration/secret/)
107 | are used to protect credentials and provide them to containers as environment
108 | variables at runtime.
-------------------------------------------------------------------------------- /k8s/README.md: --------------------------------------------------------------------------------
1 | ![high level architecture](../docs/img/kubernetes_dashboard.png)
2 | 
3 | ## Kubernetes setup
4 | 
5 | 1. Set up the AWS IAM authenticator:
6 | 
7 | https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html
8 | 
9 | 2. Create a kubeconfig:
10 | 
11 | https://docs.aws.amazon.com/eks/latest/userguide/create-kubeconfig.html
12 | 
13 | 3. Download the aws-auth config map template:
14 | 
15 | `curl -O https://amazon-eks.s3-us-west-2.amazonaws.com/cloudformation/2019-01-09/aws-auth-cm.yaml`
16 | 
17 | Edit the aws-auth-cm.yaml file by adding the Role ARN of the EC2 worker instances.
18 | 
19 | 4. Run our k8s setup shell script:
20 | 
21 | `source k8s/setup.sh`
22 | 
23 | 5. Run the proxy:
24 | 
25 | `kubectl --context= proxy`
26 | 
27 | 6. Open the Kubernetes dashboard login URL in your browser:
28 | 
29 | http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/#!/login
30 | 
31 | 7. Insert the token printed by setup.sh and click login.
32 | 
33 | 8. To access internal services such as the Airflow web UI, set up a SOCKS proxy
34 | using an alias such as the following:
35 | 
36 | `alias insocks='ssh -D 8123 -f -C -q -N inprodbastion'`
37 | 
38 | 9.
For debugging purposes you can ssh from your local system through the bastion 39 | server to access any of the k8s nodes: 40 | 41 | `ssh -J inprodbastion ec2-user@ip-10-0-102-76.ec2.internal` 42 | 43 | For ease of use setup an alias such as: 44 | 45 | `function jumpto() { ssh -J inprodbastion ec2-user@$1; }` 46 | -------------------------------------------------------------------------------- /k8s/config_maps/aws-auth-cm.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: aws-auth 5 | namespace: kube-system 6 | data: 7 | mapRoles: | 8 | - rolearn: 9 | username: system:node:{{EC2PrivateDNSName}} 10 | groups: 11 | - system:bootstrappers 12 | - system:nodes 13 | -------------------------------------------------------------------------------- /k8s/service_accounts/eks-admin-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: eks-admin 5 | namespace: kube-system 6 | --- 7 | apiVersion: rbac.authorization.k8s.io/v1beta1 8 | kind: ClusterRoleBinding 9 | metadata: 10 | name: eks-admin 11 | roleRef: 12 | apiGroup: rbac.authorization.k8s.io 13 | kind: ClusterRole 14 | name: cluster-admin 15 | subjects: 16 | - kind: ServiceAccount 17 | name: eks-admin 18 | namespace: kube-system -------------------------------------------------------------------------------- /k8s/services/airflow-webserver.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: prod-airflow-webserver 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: prod-airflow-webserver 11 | ports: 12 | - name: webserver 13 | protocol: TCP 14 | port: 8080 15 | targetPort: webserver 16 | nodePort: 32080 17 | type: LoadBalancer 18 | -------------------------------------------------------------------------------- /k8s/services/go-ethereum.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: go-ethereum 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: go-ethereum 11 | tier: go-ethereum 12 | ports: 13 | - protocol: TCP 14 | port: 8545 15 | targetPort: 8545 16 | type: LoadBalancer 17 | -------------------------------------------------------------------------------- /k8s/services/grafana.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: grafana 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-type: nlb 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: grafana 11 | tier: grafana 12 | ports: 13 | - protocol: TCP 14 | port: 3000 15 | targetPort: 3000 16 | type: LoadBalancer 17 | -------------------------------------------------------------------------------- /k8s/services/superset.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: superset 5 | annotations: 6 | service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0 7 | spec: 8 | type: NodePort 9 | selector: 10 | app: superset 11 | tier: superset 12 | ports: 13 | - protocol: TCP 14 | port: 80 15 | targetPort: 8088 16 
| type: LoadBalancer
-------------------------------------------------------------------------------- /k8s/setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Update the local kube config
4 | aws eks update-kubeconfig --name insight-prod-cluster
5 | 
6 | # Apply the auth config map so EC2 instances can join our cluster as worker nodes
7 | kubectl apply -f aws-auth-cm.yaml
8 | 
9 | # Kubernetes Dashboard
10 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v1.10.1/src/deploy/recommended/kubernetes-dashboard.yaml
11 | 
12 | # Heapster
13 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/heapster/master/deploy/kube-config/influxdb/heapster.yaml
14 | 
15 | # InfluxDB
16 | kubectl apply -f https://raw.githubusercontent.com/kubernetes/heapster/master/deploy/kube-config/influxdb/influxdb.yaml
17 | 
18 | # Admin service account
19 | kubectl apply -f eks-admin-service-account.yaml
20 | 
21 | # TODO: Figure out how we can parse the token from this command
22 | #EKS_ADMIN_AUTH_TOKEN=$()
23 | kubectl -n kube-system describe secret $(kubectl -n kube-system get secret | grep eks-admin | awk '{print $1}')
24 | 
25 | # Create the superset config map
26 | kubectl create configmap superset-config --from-file=superset/superset_config.py
27 | 
-------------------------------------------------------------------------------- /redshift/check_load_errors.sql: --------------------------------------------------------------------------------
1 | 
2 | select
3 |   starttime,
4 |   query,
5 |   filename as filename,
6 |   line_number as line,
7 |   colname as column,
8 |   type,
9 |   position as pos,
10 |   raw_line as line_text,
11 |   raw_field_value as field_text,
12 |   err_reason as reason
13 | from stl_load_errors
14 | order by starttime desc
15 | limit 200;
16 | 
-------------------------------------------------------------------------------- /redshift/schema/coinmarketcap.sql: --------------------------------------------------------------------------------
1 | 
2 | --
3 | -- Data Source: https://coinmarketcap.com/currencies/ethereum/historical-data/
4 | --
5 | 
6 | CREATE SCHEMA IF NOT EXISTS coinmarketcap;
7 | 
8 | DROP TABLE IF EXISTS coinmarketcap.ethereum_usd_price_history;
9 | 
10 | CREATE TABLE coinmarketcap.ethereum_usd_price_history (
11 |   day TIMESTAMP NOT NULL,
12 |   "open" NUMERIC(38, 6) NOT NULL,
13 |   high NUMERIC(38, 6) NOT NULL,
14 |   low NUMERIC(38, 6) NOT NULL,
15 |   close NUMERIC(38, 6) NOT NULL,
16 |   volume NUMERIC(38, 6) NOT NULL,
17 |   market_cap NUMERIC(38, 6) NOT NULL,
18 |   PRIMARY KEY (day)
19 | )
20 | DISTSTYLE ALL
21 | SORTKEY (day);
22 | 
23 | 
-------------------------------------------------------------------------------- /redshift/schema/coinmetrics.sql: --------------------------------------------------------------------------------
1 | 
2 | --
3 | -- Data Source: https://coinmetrics.io/data-downloads/
4 | --
5 | 
6 | CREATE SCHEMA IF NOT EXISTS coinmetrics;
7 | 
8 | DROP TABLE IF EXISTS coinmetrics.ethereum_usd_price_history;
9 | 
10 | CREATE TABLE coinmetrics.ethereum_usd_price_history (
11 |   day TIMESTAMP NOT NULL,
12 |   tx_volume_usd NUMERIC(38, 6) NOT NULL,
13 |   adjusted_tx_volume_usd NUMERIC(38, 6) NOT NULL,
14 |   tx_count BIGINT NOT NULL,
15 |   marketcap_usd NUMERIC(38, 6) NOT NULL,
16 |   price_usd NUMERIC(38, 6) NOT NULL,
17 |   exchange_volume_usd NUMERIC(38, 6) NOT NULL,
18 |   generated_coins NUMERIC(38, 6) NOT NULL,
19 |   fees NUMERIC(38, 6) NOT NULL,
20 |   active_addresses BIGINT NOT NULL,
21 |   median_tx_value_usd NUMERIC(38, 6) NOT NULL, 22
median_fee NUMERIC(38, 6) NOT NULL, 23 | average_difficulty NUMERIC(38, 6) NOT NULL, 24 | payment_count BIGINT NOT NULL, 25 | block_size BIGINT NOT NULL, 26 | block_count BIGINT NOT NULL, 27 | nvt NUMERIC(38, 6) NOT NULL, 28 | PRIMARY KEY (day) 29 | ) 30 | DISTSTYLE ALL 31 | SORTKEY (day); -------------------------------------------------------------------------------- /redshift/schema/multipl.sql: -------------------------------------------------------------------------------- 1 | 2 | -- 3 | -- Data Source: http://www.multpl.com/shiller-pe/ 4 | -- 5 | -- Stock Market Data Used in "Irrational Exuberance" Princeton University Press, 2000, 2005, 2015, updated 6 | -- Robert J. Shiller 7 | -- 8 | 9 | CREATE SCHEMA IF NOT EXISTS multpl; 10 | 11 | DROP TABLE IF EXISTS multpl.shiller_pe; 12 | 13 | CREATE TABLE multpl.shiller_pe ( 14 | month TIMESTAMP NOT NULL, 15 | value NUMERIC(8, 2) NOT NULL, 16 | PRIMARY KEY (month) 17 | ) 18 | DISTSTYLE ALL 19 | SORTKEY (month); 20 | -------------------------------------------------------------------------------- /redshift/users.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE SCHEMA IF NOT EXISTS ethereum; 3 | 4 | -- Users with write privileges 5 | CREATE USER airflow WITH PASSWORD 'md5cf665ef3f22dbdbac3d814f411289983'; 6 | 7 | GRANT ALL ON SCHEMA coinmarketcap TO airflow; 8 | GRANT ALL ON ALL TABLES IN SCHEMA coinmarketcap TO airflow; 9 | 10 | GRANT ALL ON SCHEMA coinmetrics TO airflow; 11 | GRANT ALL ON ALL TABLES IN SCHEMA coinmetrics TO airflow; 12 | 13 | GRANT ALL ON SCHEMA ethereum TO airflow; 14 | GRANT ALL ON ALL TABLES IN SCHEMA ethereum TO airflow; 15 | 16 | GRANT ALL ON SCHEMA multpl TO airflow; 17 | GRANT ALL ON ALL TABLES IN SCHEMA multpl TO airflow; 18 | 19 | GRANT ALL ON SCHEMA public TO airflow; 20 | GRANT ALL ON ALL TABLES IN SCHEMA public TO airflow; 21 | 22 | -- Group with read-only privileges 23 | CREATE GROUP read_only; 24 | 25 | REVOKE ALL ON SCHEMA coinmarketcap FROM GROUP read_only; 26 | REVOKE ALL ON SCHEMA coinmetrics FROM GROUP read_only; 27 | REVOKE ALL ON SCHEMA ethereum FROM GROUP read_only; 28 | REVOKE ALL ON SCHEMA multpl FROM GROUP read_only; 29 | REVOKE ALL ON SCHEMA public FROM GROUP read_only; 30 | 31 | GRANT SELECT ON ALL TABLES IN SCHEMA coinmarketcap TO GROUP read_only; 32 | GRANT SELECT ON ALL TABLES IN SCHEMA coinmetrics TO GROUP read_only; 33 | GRANT SELECT ON ALL TABLES IN SCHEMA ethereum TO GROUP read_only; 34 | GRANT SELECT ON ALL TABLES IN SCHEMA multpl TO GROUP read_only; 35 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO GROUP read_only; 36 | 37 | GRANT USAGE ON SCHEMA coinmarketcap TO GROUP read_only; 38 | GRANT USAGE ON SCHEMA coinmetrics TO GROUP read_only; 39 | GRANT USAGE ON SCHEMA ethereum TO GROUP read_only; 40 | GRANT USAGE ON SCHEMA multpl TO GROUP read_only; 41 | GRANT USAGE ON SCHEMA public TO GROUP read_only; 42 | 43 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA coinmarketcap GRANT SELECT ON TABLES TO GROUP read_only; 44 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA coinmetrics GRANT SELECT ON TABLES TO GROUP read_only; 45 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA ethereum GRANT SELECT ON TABLES TO GROUP read_only; 46 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA multpl GRANT SELECT ON TABLES TO GROUP read_only; 47 | ALTER DEFAULT PRIVILEGES FOR USER root IN SCHEMA public GRANT SELECT ON TABLES TO GROUP read_only; 48 | 49 | -- Read-only users for the data team; passwords are MD5 hashed 50 | CREATE USER emily WITH PASSWORD 
'md51ef5fec399320fe29b433cefd0b947b9'; 51 | CREATE USER grafana WITH PASSWORD 'md54e0282ddf5fe1a6a490cafec948374cb'; 52 | CREATE USER jared WITH PASSWORD 'md56973f062ac9ff074c44728cf5933219f'; 53 | CREATE USER louis WITH PASSWORD 'md5f33e04a12adccd1d65ef2cf6cf389c23'; 54 | CREATE USER mitchell WITH PASSWORD 'md574a9cff949c98e0ac39bb59fb85fa62b'; 55 | CREATE USER superset WITH PASSWORD 'md537f2206b4fd0aeb36b896c8bafcaec9b'; 56 | CREATE USER webster WITH PASSWORD 'md5cf665ef3f22dbdbac3d814f411289983'; 57 | 58 | ALTER GROUP read_only ADD USER 59 | emily, 60 | grafana, 61 | jared, 62 | louis, 63 | mitchell, 64 | superset, 65 | webster; 66 | -------------------------------------------------------------------------------- /terraform/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Terraform 3 | 4 | The terraform configs are split into separate directories for each environment 5 | and resource type. [Modules](https://www.terraform.io/docs/modules/index.html) 6 | are reusable components that can be deployed to multiple environments. Most of 7 | our config files leverage modules from the [Terraform Module Registry](https://registry.terraform.io/) instead 8 | of custom modules. The goal of this approach was to minimize time to market. 9 | 10 | * environments 11 | * base 12 | * ops 13 | * [ecr](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/base/ops/ecr/ecr.tf) 14 | * prod 15 | * compute 16 | * [eks](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/compute/eks/eks.tf) 17 | * data 18 | * [rds-airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-airflow/rds-airflow.tf) 19 | * [rds-grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-grafana/rds-grafana.tf) 20 | * [rds-superset](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/rds-superset/rds-superset.tf) 21 | * [redshift](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/data/redshift/redshift.tf) 22 | * network 23 | * [bastion](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/bastion/bastion.tf) 24 | * [vpc](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/network/vpc/vpc.tf) 25 | * services 26 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/airflow/airflow.tf) 27 | * [grafana](https://github.com/iter-io/security-token-analytics/blob/master/terraform/environments/prod/services/grafana/grafana.tf) 28 | * modules 29 | * services 30 | * [airflow](https://github.com/iter-io/security-token-analytics/blob/master/terraform/modules/services/airflow/main.tf) 31 | -------------------------------------------------------------------------------- /terraform/environments/base/ops/ecr/ecr.tf: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------------- 2 | # Shared "base" ECR components (repositories for docker images) 3 | #-------------------------------------------------------------- 4 | 5 | variable "region" {} 6 | 7 | provider "aws" { 8 | region = "${var.region}" 9 | } 10 | 11 | terraform { 12 | backend "s3" { 13 | bucket = "insight-base-terraform" 14 | key = "ops/ecr/ecr.tfstate" 
15 | region = "us-east-1" 16 | } 17 | } 18 | 19 | # We are creating ECR as separate resources to be able to remove 20 | # any ECR at any point, using terraform. With ECR passed as list 21 | # it is impossible. 22 | 23 | resource "aws_ecr_repository" "security_token_analytics" { 24 | name = "security-token-analytics" 25 | } 26 | 27 | resource "aws_ecr_repository" "ethereum-etl" { 28 | name = "ethereum-etl" 29 | } 30 | -------------------------------------------------------------------------------- /terraform/environments/base/ops/ecr/terraform.tfvars: -------------------------------------------------------------------------------- 1 | region = "us-east-1" 2 | -------------------------------------------------------------------------------- /terraform/environments/prod/compute/eks/eks.tf: -------------------------------------------------------------------------------- 1 | variable "project" { } 2 | variable "environment" { } 3 | variable "region" { } 4 | variable "asg_desired_capacity" { } 5 | variable "asg_max_size" { } 6 | variable "asg_min_size" { } 7 | variable "instance_type" { } 8 | 9 | provider "aws" { 10 | region = "${var.region}" 11 | } 12 | 13 | terraform { 14 | required_version = "> 0.7.0" 15 | 16 | backend "s3" { 17 | bucket = "insight-prod-terraform" 18 | key = "compute/eks/eks.tfstate" 19 | region = "us-east-1" 20 | } 21 | } 22 | 23 | data "terraform_remote_state" "vpc_state" { 24 | backend = "s3" 25 | 26 | config { 27 | bucket = "insight-prod-terraform" 28 | region = "${var.region}" 29 | key = "network/vpc/vpc.tfstate" 30 | } 31 | } 32 | 33 | data "terraform_remote_state" "bastion_state" { 34 | backend = "s3" 35 | 36 | config { 37 | bucket = "insight-prod-terraform" 38 | region = "${var.region}" 39 | key = "network/bastion/bastion.tfstate" 40 | } 41 | } 42 | 43 | resource "aws_security_group" "worker_group_bastion_ingress" { 44 | name_prefix = "worker_group_mgmt_one" 45 | description = "Bastion ingress to be applied to all EKS worker nodes" 46 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 47 | 48 | ingress { 49 | from_port = 0 50 | to_port = 65535 51 | protocol = "tcp" 52 | security_groups = ["${data.terraform_remote_state.bastion_state.security_group_id}"] 53 | } 54 | } 55 | 56 | # 57 | # Scales the EKS Worker autoscaling group based on CPU utilization 58 | # 59 | resource "aws_autoscaling_policy" "eks_workers" { 60 | name = "${var.project}-${var.environment}-eks-worker-cpu-target-tracking" 61 | autoscaling_group_name = "${element(module.eks.workers_asg_names, 0)}" 62 | adjustment_type = "ChangeInCapacity" 63 | policy_type = "TargetTrackingScaling" 64 | estimated_instance_warmup = 300 65 | 66 | target_tracking_configuration { 67 | predefined_metric_specification { 68 | predefined_metric_type = "ASGAverageCPUUtilization" 69 | } 70 | target_value = 70.0 71 | } 72 | } 73 | 74 | module "eks" { 75 | source = "terraform-aws-modules/eks/aws" 76 | cluster_name = "${var.project}-${var.environment}-cluster" 77 | 78 | subnets = "${concat( 79 | data.terraform_remote_state.vpc_state.public_subnets, 80 | data.terraform_remote_state.vpc_state.private_subnets 81 | )}" 82 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 83 | 84 | write_kubeconfig = false 85 | manage_aws_auth = false 86 | 87 | tags = { 88 | environment = "${var.environment}" 89 | } 90 | 91 | worker_groups = [ 92 | { 93 | asg_desired_capacity = "${var.asg_desired_capacity}" # Desired worker capacity in the autoscaling group. 
94 | asg_max_size = "${var.asg_max_size}" # Maximum worker capacity in the autoscaling group. 95 | asg_min_size = "${var.asg_min_size}" # Minimum worker capacity in the autoscaling group. 96 | instance_type = "${var.instance_type}" # Size of the workers instances. 97 | spot_price = "" # Cost of spot instance. 98 | placement_tenancy = "" # The tenancy of the instance. Valid values are "default" or "dedicated". 99 | root_volume_size = "100" # root volume size of workers instances. 100 | root_volume_type = "gp2" # root volume type of workers instances, can be 'standard', 'gp2', or 'io1' 101 | root_iops = "0" # The amount of provisioned IOPS. This must be set with a volume_type of "io1". 102 | key_name = "ops" # The key name that should be used for the instances in the autoscaling group 103 | pre_userdata = "" # userdata to pre-append to the default userdata. 104 | additional_userdata = "" # userdata to append to the default userdata. 105 | ebs_optimized = true # sets whether to use ebs optimization on supported types. 106 | enable_monitoring = true # Enables/disables detailed monitoring. 107 | public_ip = false # Associate a public ip address with a worker 108 | kubelet_extra_args = "" # This string is passed directly to kubelet if set. Useful for adding labels or taints. 109 | autoscaling_enabled = false # Sets whether policy and matching tags will be added to allow autoscaling. 110 | additional_security_group_ids = "${aws_security_group.worker_group_bastion_ingress.id}" # A comma delimited list of additional security group ids to include in worker launch config 111 | protect_from_scale_in = false # Prevent AWS from scaling in, so that cluster-autoscaler is solely responsible. 112 | suspended_processes = "" # A comma delimited string of processes to to suspend. i.e. 
AZRebalance,HealthCheck,ReplaceUnhealthy 113 | target_group_arns = "" # A comma delimited list of ALB target group ARNs to be associated to the ASG 114 | } 115 | ] 116 | } 117 | -------------------------------------------------------------------------------- /terraform/environments/prod/compute/eks/terraform.tfvars: -------------------------------------------------------------------------------- 1 | project = "insight" 2 | environment = "prod" 3 | region = "us-east-1" 4 | asg_desired_capacity = 3 5 | asg_max_size = 10 6 | asg_min_size = 3 7 | instance_type = "c5.xlarge" 8 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-airflow/rds-airflow.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-airflow/rds-airflow" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 | bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-airflow/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-airflow/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to airflow service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = 
"${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = "${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | monitoring_role_name = "${var.environment}-${var.project}-${var.application}-monitoring" 157 | multi_az = "${var.multi_az}" 158 | name = "${var.environment}_${var.application}" 159 | option_group_name = "${aws_db_option_group.default.name}" 160 | parameter_group_name = "${aws_db_parameter_group.default.name}" 161 | password = "${data.aws_ssm_parameter.password.value}" 162 | port = "${var.port}" 163 | publicly_accessible = "${var.publicly_accessible}" 164 | skip_final_snapshot = "${var.skip_final_snapshot}" 165 | storage_encrypted = "${var.storage_encrypted}" 166 | storage_type = "${var.storage_type}" 167 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 168 | 169 | tags = { 170 | Application = "${var.application}" 171 | Environment = "${var.environment}" 172 | Project = "${var.project}" 173 | } 174 | 175 | username = "${data.aws_ssm_parameter.username.value}" 176 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 177 | } 178 | 179 | output "db_instance_address" { 180 | 
value = "${module.rds.this_db_instance_address}" 181 | } 182 | 183 | output "db_instance_arn" { 184 | value = "${module.rds.this_db_instance_arn}" 185 | } 186 | 187 | output "tdb_instance_endpoint" { 188 | value = "${module.rds.this_db_instance_endpoint}" 189 | } 190 | 191 | output "db_instance_id" { 192 | value = "${module.rds.this_db_instance_id}" 193 | } 194 | 195 | output "db_instance_name" { 196 | value = "${module.rds.this_db_instance_name}" 197 | } 198 | 199 | output "db_instance_resource_id" { 200 | value = "${module.rds.this_db_instance_resource_id}" 201 | } 202 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-airflow/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "airflow" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-grafana/rds-grafana.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-grafana/rds-grafana" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 
| bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-grafana/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-grafana/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to grafana service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = "${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = "${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 
152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | monitoring_role_name = "${var.environment}-rds-${var.application}-monitoring-role" 157 | multi_az = "${var.multi_az}" 158 | name = "${var.environment}_${var.application}" 159 | option_group_name = "${aws_db_option_group.default.name}" 160 | parameter_group_name = "${aws_db_parameter_group.default.name}" 161 | password = "${data.aws_ssm_parameter.password.value}" 162 | port = "${var.port}" 163 | publicly_accessible = "${var.publicly_accessible}" 164 | skip_final_snapshot = "${var.skip_final_snapshot}" 165 | storage_encrypted = "${var.storage_encrypted}" 166 | storage_type = "${var.storage_type}" 167 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 168 | 169 | tags = { 170 | Application = "${var.application}" 171 | Environment = "${var.environment}" 172 | Project = "${var.project}" 173 | } 174 | 175 | username = "${data.aws_ssm_parameter.username.value}" 176 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 177 | } 178 | 179 | output "db_instance_address" { 180 | value = "${module.rds.this_db_instance_address}" 181 | } 182 | 183 | output "db_instance_arn" { 184 | value = "${module.rds.this_db_instance_arn}" 185 | } 186 | 187 | output "tdb_instance_endpoint" { 188 | value = "${module.rds.this_db_instance_endpoint}" 189 | } 190 | 191 | output "db_instance_id" { 192 | value = "${module.rds.this_db_instance_id}" 193 | } 194 | 195 | output "db_instance_name" { 196 | value = "${module.rds.this_db_instance_name}" 197 | } 198 | 199 | output "db_instance_resource_id" { 200 | value = "${module.rds.this_db_instance_resource_id}" 201 | } 202 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-grafana/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "grafana" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-superset/rds-superset.tf: -------------------------------------------------------------------------------- 1 | variable "application" { } 2 | variable "project" { } 3 | variable "environment" { } 4 | variable "region" { } 5 | 6 | variable "allocated_storage" { } 7 | variable "allow_major_version_upgrade" { } 8 | variable "apply_immediately" { } 9 | variable "auto_minor_version_upgrade" { } 10 | 
variable "backup_retention_period" { } 11 | variable "backup_window" { } 12 | variable "create_db_instance" { } 13 | variable "create_db_option_group" { } 14 | variable "create_db_parameter_group" { } 15 | variable "create_db_subnet_group" { } 16 | variable "create_monitoring_role" { } 17 | variable "deletion_protection" { } 18 | variable "engine" { } 19 | variable "engine_version" { } 20 | variable "family" { } 21 | variable "iam_database_authentication_enabled" { } 22 | variable "instance_class" { } 23 | variable "iops" { } 24 | variable "maintenance_window" { } 25 | variable "major_engine_version" { } 26 | variable "monitoring_interval" { } 27 | variable "multi_az" { } 28 | variable "port" { } 29 | variable "publicly_accessible" { } 30 | variable "skip_final_snapshot" { } 31 | variable "storage_encrypted" { } 32 | variable "storage_type" { } 33 | 34 | 35 | provider "aws" { 36 | region = "${var.region}" 37 | } 38 | 39 | terraform { 40 | required_version = "> 0.7.0" 41 | 42 | backend "s3" { 43 | bucket = "insight-prod-terraform" 44 | key = "data/rds-superset/rds-superset" 45 | region = "us-east-1" 46 | } 47 | } 48 | 49 | data "terraform_remote_state" "vpc_state" { 50 | backend = "s3" 51 | 52 | config { 53 | bucket = "insight-prod-terraform" 54 | region = "${var.region}" 55 | key = "network/vpc/vpc.tfstate" 56 | } 57 | } 58 | 59 | data "aws_ssm_parameter" "username" { 60 | name = "/prod/rds-superset/USERNAME" 61 | } 62 | 63 | data "aws_ssm_parameter" "password" { 64 | name = "/prod/rds-superset/PASSWORD" 65 | } 66 | 67 | # TODO: Lock down to superset service security group; protected from public access for now though 68 | resource "aws_security_group" "rds" { 69 | name = "${var.project}-${var.environment}-${var.application}-rds" 70 | vpc_id = "${data.terraform_remote_state.vpc_state.vpc_id}" 71 | description = "RDS security group" 72 | 73 | tags { 74 | Name = "${var.project}-${var.environment}-${var.application}-rds" 75 | Application = "${var.application}" 76 | Environment = "${var.environment}" 77 | Project = "${var.project}" 78 | } 79 | lifecycle { create_before_destroy = true } 80 | 81 | ingress { 82 | protocol = "tcp" 83 | from_port = "${var.port}" 84 | to_port = "${var.port}" 85 | cidr_blocks = ["0.0.0.0/0"] 86 | } 87 | 88 | egress { 89 | protocol = -1 90 | from_port = 0 91 | to_port = 0 92 | cidr_blocks = ["0.0.0.0/0"] 93 | } 94 | } 95 | 96 | resource "aws_db_parameter_group" "default" { 97 | name = "${var.environment}-${var.project}-${var.application}" 98 | family = "${var.family}" 99 | description = "RDS cluster parameter group" 100 | 101 | parameter { 102 | name = "application_name" 103 | value = "${var.environment}-${var.project}-${var.application}" 104 | } 105 | } 106 | 107 | resource "aws_db_option_group" "default" { 108 | name = "${var.environment}-${var.project}-${var.application}" 109 | engine_name = "${var.engine}" 110 | major_engine_version = "10" 111 | } 112 | 113 | resource "aws_db_subnet_group" "default" { 114 | name = "${var.environment}-${var.project}-${var.application}" 115 | subnet_ids = ["${data.terraform_remote_state.vpc_state.private_subnets}"] 116 | 117 | tags = { 118 | Application = "${var.application}" 119 | Environment = "${var.environment}" 120 | Project = "${var.project}" 121 | } 122 | } 123 | 124 | # 125 | # https://github.com/terraform-aws-modules/terraform-aws-rds 126 | # 127 | module "rds" { 128 | source = "terraform-aws-modules/rds/aws" 129 | 130 | allocated_storage = "${var.allocated_storage}" 131 | allow_major_version_upgrade = 
"${var.allow_major_version_upgrade}" 132 | apply_immediately = "${var.apply_immediately}" 133 | auto_minor_version_upgrade = "${var.auto_minor_version_upgrade}" 134 | availability_zone = "${element(data.terraform_remote_state.vpc_state.azs, 0)}" 135 | backup_retention_period = "${var.backup_retention_period}" 136 | backup_window = "${var.backup_window}" 137 | create_db_instance = "${var.create_db_instance}" 138 | create_db_option_group = "${var.create_db_option_group}" 139 | create_db_parameter_group = "${var.create_db_parameter_group}" 140 | create_db_subnet_group = "${var.create_db_subnet_group}" 141 | create_monitoring_role = "${var.create_monitoring_role}" 142 | db_subnet_group_name = "${aws_db_subnet_group.default.name}" 143 | deletion_protection = "${var.deletion_protection}" 144 | enabled_cloudwatch_logs_exports = [] 145 | engine = "${var.engine}" 146 | engine_version = "${var.engine_version}" 147 | family = "${var.family}" 148 | final_snapshot_identifier = "${var.environment}-${var.project}-${var.application}-final" 149 | iam_database_authentication_enabled = "${var.iam_database_authentication_enabled}" 150 | identifier = "${var.project}-${var.environment}-${var.application}" 151 | instance_class = "${var.instance_class}" 152 | iops = "${var.iops}" 153 | maintenance_window = "${var.maintenance_window}" 154 | major_engine_version = "${var.major_engine_version}" 155 | monitoring_interval = "${var.monitoring_interval}" 156 | multi_az = "${var.multi_az}" 157 | name = "${var.environment}_${var.application}" 158 | option_group_name = "${aws_db_option_group.default.name}" 159 | parameter_group_name = "${aws_db_parameter_group.default.name}" 160 | password = "${data.aws_ssm_parameter.password.value}" 161 | port = "${var.port}" 162 | publicly_accessible = "${var.publicly_accessible}" 163 | skip_final_snapshot = "${var.skip_final_snapshot}" 164 | storage_encrypted = "${var.storage_encrypted}" 165 | storage_type = "${var.storage_type}" 166 | subnet_ids = "${data.terraform_remote_state.vpc_state.private_subnets}" 167 | 168 | tags = { 169 | Application = "${var.application}" 170 | Environment = "${var.environment}" 171 | Project = "${var.project}" 172 | } 173 | 174 | username = "${data.aws_ssm_parameter.username.value}" 175 | vpc_security_group_ids = ["${aws_security_group.rds.id}"] 176 | } 177 | 178 | output "db_instance_address" { 179 | value = "${module.rds.this_db_instance_address}" 180 | } 181 | 182 | output "db_instance_arn" { 183 | value = "${module.rds.this_db_instance_arn}" 184 | } 185 | 186 | output "tdb_instance_endpoint" { 187 | value = "${module.rds.this_db_instance_endpoint}" 188 | } 189 | 190 | output "db_instance_id" { 191 | value = "${module.rds.this_db_instance_id}" 192 | } 193 | 194 | output "db_instance_name" { 195 | value = "${module.rds.this_db_instance_name}" 196 | } 197 | 198 | output "db_instance_resource_id" { 199 | value = "${module.rds.this_db_instance_resource_id}" 200 | } 201 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/rds-superset/terraform.tfvars: -------------------------------------------------------------------------------- 1 | application = "superset" 2 | project = "insight" 3 | environment = "prod" 4 | region = "us-east-1" 5 | 6 | allocated_storage = "20" 7 | allow_major_version_upgrade = "false" 8 | apply_immediately = "true" 9 | auto_minor_version_upgrade = "true" 10 | backup_retention_period = "14" 11 | backup_window = "03:00-06:00" 12 | create_db_instance = "true" 13 | 
create_db_option_group = false 14 | create_db_parameter_group = false 15 | create_db_subnet_group = false 16 | create_monitoring_role = "true" 17 | deletion_protection = "false" 18 | engine = "postgres" 19 | engine_version = "10.6" 20 | family = "postgres10" 21 | iam_database_authentication_enabled = "false" 22 | instance_class = "db.t3.micro" 23 | iops = "0" 24 | maintenance_window = "Mon:00:00-Mon:03:00" 25 | major_engine_version = "10.6" 26 | monitoring_interval = "5" 27 | multi_az = "false" 28 | port = "5432" 29 | publicly_accessible = "false" 30 | skip_final_snapshot = "false" 31 | storage_encrypted = "false" 32 | storage_type = "gp2" 33 | -------------------------------------------------------------------------------- /terraform/environments/prod/data/redshift/redshift.tf: -------------------------------------------------------------------------------- 1 | variable "project" { } 2 | variable "environment" { } 3 | variable "region" { } 4 | variable "allow_version_upgrade" { } 5 | variable "automated_snapshot_retention_period" { } 6 | variable "cluster_node_type" { } 7 | variable "cluster_number_of_nodes" { } 8 | variable "cluster_parameter_group" { } 9 | variable "cluster_port" { } 10 | variable "cluster_version" { } 11 | variable "enable_logging" { } 12 | variable "encrypted" { } 13 | variable "enhanced_vpc_routing" { } 14 | variable "logging_bucket_name" { } 15 | variable "preferred_maintenance_window" { } 16 | variable "publicly_accessible" { } 17 | variable "skip_final_snapshot" { } 18 | variable "wlm_json_configuration" { } 19 | 20 | provider "aws" { 21 | region = "${var.region}" 22 | } 23 | 24 | terraform { 25 | required_version = "> 0.7.0" 26 | 27 | backend "s3" { 28 | bucket = "insight-prod-terraform" 29 | key = "data/redshift/redshift.tfstate" 30 | region = "us-east-1" 31 | } 32 | } 33 | 34 | data "terraform_remote_state" "vpc_state" { 35 | backend = "s3" 36 | 37 | config { 38 | bucket = "insight-prod-terraform" 39 | region = "${var.region}" 40 | key = "network/vpc/vpc.tfstate" 41 | } 42 | } 43 | 44 | data "terraform_remote_state" "airflow_state" { 45 | backend = "s3" 46 | 47 | config { 48 | bucket = "insight-prod-terraform" 49 | region = "${var.region}" 50 | key = "services/airflow/airflow.tfstate" 51 | } 52 | } 53 | 54 | data "aws_ssm_parameter" "cluster_master_username" { 55 | name = "/prod/redshift/CLUSTER_MASTER_USERNAME" 56 | } 57 | 58 | 59 | data "aws_ssm_parameter" "cluster_master_password" { 60 | name = "/prod/redshift/CLUSTER_MASTER_PASSWORD" 61 | } 62 | 63 | # 64 | # IAM role that allows our Redshift cluster to load data from S3. 65 | # 66 | resource "aws_iam_service_linked_role" "redshift_service_role" { 67 | aws_service_name = "redshift.amazonaws.com" 68 | } 69 | 70 | resource "aws_iam_policy" "s3_read_write_policy" { 71 | name = "${var.project}-${var.environment}-s3-policy-airflow-output" 72 | 73 | policy = <