├── .github └── x-data-infra.webp ├── .gitignore ├── README.md ├── part1 ├── QUICK_START.md ├── compose │ └── docker-compose.yaml ├── main.py ├── requirements.txt └── schema.sql ├── part2 ├── QUICK_START.md ├── compose │ ├── .env.template │ └── docker-compose.yaml ├── pipecraft │ ├── __init__.py │ ├── config │ │ └── __init__.py │ ├── dags │ │ ├── .airflowignore │ │ ├── __init__.py │ │ ├── infopy │ │ │ ├── __init__.py │ │ │ └── dag_infopy.py │ │ └── libs │ │ │ ├── __init__.py │ │ │ └── airtasks │ │ │ ├── __init__.py │ │ │ └── initial.py │ ├── plugins │ │ └── __init__.py │ └── scripts │ │ ├── entry_init.sh │ │ └── gen_fernet_key.py └── requirements.txt ├── part3 ├── QUICK_START.md ├── compose │ ├── .env.template │ └── docker-compose.yaml ├── pipecraft │ ├── __init__.py │ ├── config │ │ └── __init__.py │ ├── dags │ │ ├── .airflowignore │ │ ├── __init__.py │ │ ├── binance_market_data │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── funding.py │ │ │ │ ├── kline.py │ │ │ │ └── symbols.py │ │ │ ├── dag_binance_funding_rate.py │ │ │ ├── dag_binance_kline.py │ │ │ └── process │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── etl_funding_future.py │ │ │ │ └── etl_kline.py │ │ ├── infopy │ │ │ ├── __init__.py │ │ │ └── dag_infopy.py │ │ ├── libs │ │ │ ├── __init__.py │ │ │ ├── airtasks │ │ │ │ ├── __init__.py │ │ │ │ ├── initial.py │ │ │ │ └── timescale │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── conn.py │ │ │ │ │ └── ingester.py │ │ │ └── venues │ │ │ │ ├── __init__.py │ │ │ │ ├── base │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ │ └── binance │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── common.py │ │ │ │ ├── config.py │ │ │ │ └── types.py │ │ └── timescale_init │ │ │ ├── __init__.py │ │ │ ├── dag_timescale_roles.py │ │ │ ├── dag_timescale_tables.py │ │ │ └── process │ │ │ ├── __init__.py │ │ │ ├── create_hypertables.sql │ │ │ ├── create_roles.sql │ │ │ └── tsinit.py │ ├── plugins │ │ └── __init__.py │ └── scripts │ │ ├── entry_init.sh │ │ └── gen_fernet_key.py └── requirements.txt ├── part4 ├── QUICK_START.md ├── compose │ ├── .env.template │ └── docker-compose.yaml ├── grafana │ ├── dashboards │ │ └── MarketMonitor.json │ └── provisioning │ │ ├── dashboards │ │ └── dashboards.yaml │ │ └── datasources │ │ └── datasources.yaml ├── pipecraft │ ├── __init__.py │ ├── config │ │ └── __init__.py │ ├── dags │ │ ├── .airflowignore │ │ ├── __init__.py │ │ ├── binance_market_data │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── funding.py │ │ │ │ ├── kline.py │ │ │ │ └── symbols.py │ │ │ ├── dag_binance_funding_rate.py │ │ │ ├── dag_binance_kline.py │ │ │ └── process │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── etl_funding_future.py │ │ │ │ └── etl_kline.py │ │ ├── infopy │ │ │ ├── __init__.py │ │ │ └── dag_infopy.py │ │ ├── libs │ │ │ ├── __init__.py │ │ │ ├── airtasks │ │ │ │ ├── __init__.py │ │ │ │ ├── initial.py │ │ │ │ └── timescale │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── conn.py │ │ │ │ │ └── ingester.py │ │ │ └── venues │ │ │ │ ├── __init__.py │ │ │ │ ├── base │ │ │ │ ├── __init__.py │ │ │ │ └── base.py │ │ │ │ └── binance │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── common.py │ │ │ │ ├── config.py │ │ │ │ └── types.py │ │ └── timescale_init │ │ │ ├── __init__.py │ │ │ ├── dag_timescale_roles.py │ │ │ ├── dag_timescale_tables.py │ │ │ └── process │ │ │ ├── __init__.py │ │ │ ├── create_hypertables.sql │ │ │ ├── create_roles.sql │ │ │ └── tsinit.py │ ├── plugins │ │ └── __init__.py │ └── scripts │ │ ├── 
entry_init.sh │ │ └── gen_fernet_key.py └── requirements.txt └── part5 ├── QUICK_START.md ├── QUICK_START_PROD.md ├── compose ├── .env.prod.template ├── .env.template ├── compose.infra.core.yaml ├── compose.infra.dev.yaml ├── compose.infra.prod.yaml ├── compose.traefik.core.yaml ├── compose.traefik.dev.yaml └── compose.traefik.prod.yaml ├── grafana ├── dev │ ├── dashboards │ │ └── MarketMonitor.json │ └── provisioning │ │ ├── dashboards │ │ └── dashboards.yaml │ │ └── datasources │ │ └── datasources.yaml ├── grafana.Dockerfile ├── grafana_build_and_push.sh └── prod │ ├── dashboards │ └── MarketMonitor.json │ └── provisioning │ ├── dashboards │ └── dashboards.yaml │ └── datasources │ └── datasources.yaml ├── pipecraft ├── __init__.py ├── config │ └── __init__.py ├── dags │ ├── .airflowignore │ ├── __init__.py │ ├── binance_market_data │ │ ├── __init__.py │ │ ├── config │ │ │ ├── __init__.py │ │ │ ├── funding.py │ │ │ ├── kline.py │ │ │ └── symbols.py │ │ ├── dag_binance_funding_rate.py │ │ ├── dag_binance_kline.py │ │ └── process │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── etl_funding_future.py │ │ │ └── etl_kline.py │ ├── infopy │ │ ├── __init__.py │ │ └── dag_infopy.py │ ├── libs │ │ ├── __init__.py │ │ ├── airtasks │ │ │ ├── __init__.py │ │ │ ├── initial.py │ │ │ └── timescale │ │ │ │ ├── __init__.py │ │ │ │ ├── conn.py │ │ │ │ └── ingester.py │ │ └── venues │ │ │ ├── __init__.py │ │ │ ├── base │ │ │ ├── __init__.py │ │ │ └── base.py │ │ │ └── binance │ │ │ ├── __init__.py │ │ │ ├── client.py │ │ │ ├── common.py │ │ │ ├── config.py │ │ │ └── types.py │ └── timescale_init │ │ ├── __init__.py │ │ ├── dag_timescale_roles.py │ │ ├── dag_timescale_tables.py │ │ └── process │ │ ├── __init__.py │ │ ├── create_hypertables.sql │ │ ├── create_roles.sql │ │ └── tsinit.py ├── pipecraft.Dockerfile ├── pipecraft_build_and_push.sh ├── plugins │ └── __init__.py └── scripts │ ├── entry_init.sh │ └── gen_fernet_key.py └── requirements.txt /.github/x-data-infra.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/.github/x-data-infra.webp -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # docker 156 | .storage/ 157 | storage/ 158 | *.secret 159 | .env.prod 160 | # airflow 161 | part2/pipecraft/logs/ 162 | part3/pipecraft/logs/ 163 | part4/pipecraft/logs/ 164 | part5/pipecraft/logs/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Practical Guide to a Simple Data Stack 2 | 3 | This GitHub repository hosts the source code for my infrastructure 4 | series, "A Practical Guide to a Simple Data Stack" (SDS). It expands 5 | upon the [X articles](https://x.com/bylethquant/articles) and provides a step-by-step guide to building the data infrastructure depicted in the figure 6 | below. 7 | 8 | **Articles** 9 | * [SDS #0: A Practical Guide to a Simple Data Stack](https://x.com/bylethquant/status/1826891957249212691) 10 | * [SDS #1: Docker, Docker Compose and Setting Up Timescale](https://x.com/bylethquant/status/1828041355131859198) 11 | * [SDS #2: Setting up Apache Airflow and Creating Pipelines](https://x.com/bylethquant/status/1830558712899228012) 12 | * [SDS #3: Robust Crypto Data Pipelines with Apache Airflow](https://x.com/bylethquant/status/1831899712749699506) 13 | * [SDS #4: Crypto Market Data Dashboard with Grafana](https://x.com/bylethquant/status/1833141733305295348) 14 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627) 15 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387) 16 | 17 | ## Introduction 18 | 19 | The series is inspired by recent discussions in my crypto quant circle about leveraging modern applications like 20 | [Grafana](https://grafana.com) for a simple data infrastructure. But what is the easiest way to gain 21 | exposure to and 22 | utilize these tools? Today, [Docker](https://www.docker.com) stands out as an excellent tool for experimenting 23 | with these applications. For 24 | instance, [Grafana](https://grafana.com) can be run locally with just a few lines of code. This series aims to equip 25 | everyone with the 26 | knowledge necessary to deploy tools such as 27 | [Grafana](https://grafana.com), [Timescale](https://www.timescale.com), [Apache Airflow](https://airflow.apache.org), 28 | and [Traefik](https://traefik.io/traefik). 29 | 30 | ## Overview 31 | 32 | Let's start with the outcome: In the articles above, I walk step by step through building the data stack as 33 | illustrated in the figure below: 34 | 35 |
36 | ![data-infra](.github/x-data-infra.webp) 37 |
38 | 39 | *Orchestration* is managed using Apache Airflow, which facilitates the implementation and management of data pipelines. 40 | As a practical example, I will develop Extract-Transform-Load (ETL) pipelines to process 1-minute candlestick data, 41 | including spot and future price data as well as funding rates from Binance. *Data Storage* will be handled by 42 | Timescale. *Visualization* will be provided by Grafana, with Docker as the main hosting tool and Traefik serving as 43 | the reverse proxy. 44 | 45 | Following local development, I will showcase how to deploy this infrastructure on a cloud service 46 | provider. A domain will be registered through [Porkbun](https://porkbun.com/), with DNS records set up to enable access 47 | to Docker containers via 48 | subdomains such as airflow.mydomain.com and grafana.mydomain.com. 49 | 50 | Additionally, while Binance data ingestion pipelines serve as the primary example, the infrastructure is designed with 51 | flexibility in mind. It can accommodate pipelines for processing log files or any other data sources. 52 | 53 | In what follows, I aim to keep everything straightforward so that anyone can adopt elements of this data stack 54 | for their own data infrastructure. 55 | 56 | ## Who Should Read This Series 57 | 58 | This series is for those who are looking to take their first steps in developing their own data infrastructure. It is for individuals who want to: 59 | * Host these tools locally via Docker to experiment with them. 60 | * Deploy them to the cloud. 61 | 62 | 63 | ## Tools 64 | 65 | * Docker Desktop 4.27.2 66 | * Python 3.11.5 67 | * Airflow 2.8.1 68 | * Grafana 10.0.2 69 | * Timescale pg15 70 | * Traefik 3.0 71 | -------------------------------------------------------------------------------- /part1/QUICK_START.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | Follow these steps to set up the application using Docker Compose: 3 | 1. Open your terminal. 4 | 2. Change to the directory ``./part1/compose`` with the ``docker-compose.yaml`` file. 5 | 3. Start Timescale database in detached mode by executing ``docker compose up -d``. 6 | 4. Run ``main.py`` to ingest some mock data to Timescale. 7 | 8 | A detailed guide can be found here: 9 | [SDS #1: Docker, Docker Compose and Setting Up Timescale](https://x.com/bylethquant/status/1828041355131859198). 
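For reference, steps 2-4 above correspond roughly to the following commands. This is only a sketch: it assumes you start from the repository root and have Python 3.11 available, and it adds a `pip install` of the part1 requirements, which the steps above do not list explicitly.

```sh
# from the repository root
cd part1/compose
docker compose up -d              # start the Timescale container in the background
cd ..
pip install -r requirements.txt   # psycopg2-binary, needed by main.py
python main.py                    # creates the hypertable, inserts mock rows, prints them back
```

The mock rows printed at the end confirm that the container is reachable on ``localhost:5432`` with the credentials from ``docker-compose.yaml``.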
-------------------------------------------------------------------------------- /part1/compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-part1 2 | 3 | services: 4 | timescale: 5 | container_name: timescale 6 | image: timescale/timescaledb:latest-pg15 7 | environment: 8 | POSTGRES_DB: timescale 9 | POSTGRES_USER: user 10 | POSTGRES_PASSWORD: password 11 | ports: 12 | - "5432:5432" 13 | volumes: 14 | - ../.storage/timescale:/var/lib/postgresql/data 15 | healthcheck: 16 | test: [ "CMD", "pg_isready", "-q", "-d", "timescale", "-U", "user" ] 17 | interval: 5s 18 | timeout: 5s 19 | retries: 5 -------------------------------------------------------------------------------- /part1/main.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import random 3 | 4 | from psycopg2.extras import execute_values 5 | from dataclasses import dataclass, asdict, astuple, fields 6 | from datetime import datetime, timedelta 7 | from typing import List 8 | 9 | 10 | @dataclass 11 | class TimescaleConfig: 12 | database: str 13 | host: str 14 | user: str 15 | password: str 16 | port: int 17 | 18 | 19 | class Event: 20 | pass 21 | 22 | 23 | @dataclass 24 | class PriceUpdated(Event): 25 | time: datetime 26 | close: float 27 | 28 | 29 | def insert(events: List[PriceUpdated], timescale_config: TimescaleConfig, table_name: str) -> None: 30 | """Inserts a price update event to timescale database.""" 31 | data_tpl = [astuple(event) for event in events] 32 | col_name = ','.join([field.name for field in fields(PriceUpdated)]) 33 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, col_name) 34 | with psycopg2.connect(**asdict(timescale_config)) as conn: 35 | with conn.cursor() as cursor: 36 | execute_values(cursor, query, data_tpl) 37 | conn.commit() 38 | 39 | 40 | def create_hypertable(timescale_config: TimescaleConfig, sql_file_path: str = "schema.sql") -> None: 41 | """Creates timescale schema.""" 42 | with psycopg2.connect(**asdict(timescale_config)) as conn: 43 | with conn.cursor() as cursor: 44 | with open(sql_file_path, 'r') as sql_file: 45 | cursor.execute(sql_file.read()) 46 | conn.commit() 47 | 48 | 49 | def read(timescale_config: TimescaleConfig, table_name: str) -> List[tuple]: 50 | """Reads price update events from timescale database.""" 51 | with psycopg2.connect(**asdict(timescale_config)) as conn: 52 | with conn.cursor() as cursor: 53 | cursor.execute(f"SELECT * FROM {table_name}") 54 | data = cursor.fetchall() 55 | 56 | return data 57 | 58 | 59 | def get_mock_data(num: int) -> List[PriceUpdated]: 60 | """Gets some mock data.""" 61 | mock_events = [PriceUpdated(time=datetime.utcnow() - timedelta(minutes=i), 62 | close=random.randint(0, 1)) for i in range(num)] 63 | 64 | return mock_events 65 | 66 | 67 | def main(): 68 | mock_events = get_mock_data(5) 69 | ts_config = TimescaleConfig("timescale", "localhost", "user", "password", 5432) 70 | create_hypertable(ts_config) 71 | insert(events=mock_events, timescale_config=ts_config, table_name="price") 72 | print(read(ts_config, table_name="price")) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /part1/requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary~=2.9.7 -------------------------------------------------------------------------------- /part1/schema.sql: 
-------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS price CASCADE; 2 | CREATE TABLE price ( 3 | time TIMESTAMPTZ, 4 | close DOUBLE PRECISION 5 | ); 6 | SELECT create_hypertable('price', 'time'); -------------------------------------------------------------------------------- /part2/QUICK_START.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | Follow these steps to set up the application using Docker Compose: 3 | 1. Change directory to `./part2/compose/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. 4 | 2. Change directory to `./part2/compose/` and create a `.env` file (see `.env.template`): 5 | * Set the environment variable `AIRFLOW_FERNET_KEY` to the fernet key created in step 1. 6 | 3. Open your terminal. 7 | 4. Change to the directory ``./part2/compose`` with the ``docker-compose.yaml`` file. 8 | 5. Initialize Apache Airflow by executing ``docker compose up airflow-init``. 9 | 6. Start the data infrastructure in detached mode by executing ``docker compose up -d``. 10 | 7. Access Airflow web interface through a browser at ``localhost:8080``. Complete the one-time 11 | initialization of Timescale: 12 | - Create a connection to Timescale: Admin → Connections 13 | * Connection Id: timescale_conn_admin 14 | * Connection Type: Postgres 15 | * Host: host.docker.internal 16 | * Database: timescale 17 | * Login: admin 18 | * Password: password 19 | * Port: 5433 20 | 21 | A detailed guide can be found here: [SDS #2: Setting up Apache Airflow and Creating Pipelines](https://x.com/bylethquant/status/1830558712899228012). -------------------------------------------------------------------------------- /part2/compose/.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_FERNET_KEY= -------------------------------------------------------------------------------- /part2/compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-part2 2 | 3 | x-airflow-common: 4 | &airflow-common 5 | image: apache/airflow:2.8.1-python3.11 6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}" 9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}" 10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}" 11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}" 12 | _AIRFLOW_WWW_USER_ROLE: "Admin" 13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}" 14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}" 15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}" 16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}" 17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}" 18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}" 19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}" 20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part2}" 21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false" 23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false" 24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 
"true" 25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG" 26 | user: ${AIRFLOW_UID:-50000} 27 | depends_on: 28 | airflow-postgres: 29 | condition: service_healthy 30 | volumes: 31 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs 32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags 33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config 34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins 35 | 36 | services: 37 | 38 | airflow-webserver: 39 | <<: *airflow-common 40 | container_name: airflow-webserver 41 | command: webserver 42 | ports: 43 | - "${AIRFLOW_WWW_PORT:-8080}:8080" 44 | restart: always 45 | 46 | airflow-scheduler: 47 | <<: *airflow-common 48 | container_name: airflow-scheduler 49 | command: scheduler 50 | restart: always 51 | 52 | airflow-postgres: 53 | container_name: airflow-postgres 54 | image: postgres:13 55 | environment: 56 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}" 57 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}" 58 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}" 59 | ports: 60 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432" 61 | volumes: 62 | - ../.storage/postgres:/var/lib/postgresql/data 63 | healthcheck: 64 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ] 65 | interval: 5s 66 | retries: 2 67 | start_period: 3s 68 | restart: unless-stopped 69 | 70 | airflow-init: 71 | <<: *airflow-common 72 | container_name: airflow-init 73 | environment: 74 | <<: *airflow-common-env 75 | _AIRFLOW_DB_UPGRADE: true 76 | restart: no 77 | entrypoint: /opt/airflow/scripts/entry_init.sh 78 | volumes: 79 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts 80 | 81 | timescale: 82 | container_name: timescale 83 | image: timescale/timescaledb:latest-pg15 84 | environment: 85 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}" 86 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}" 87 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}" 88 | ports: 89 | - "${TIMESCALE_PORT:-5433}:5432" 90 | volumes: 91 | - ../.storage/timescale:/var/lib/postgresql/data 92 | healthcheck: 93 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ] 94 | interval: 5s 95 | retries: 2 96 | start_period: 3s 97 | restart: unless-stopped 98 | -------------------------------------------------------------------------------- /part2/pipecraft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/config/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | libs/ -------------------------------------------------------------------------------- /part2/pipecraft/dags/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/dags/infopy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/infopy/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/dags/infopy/dag_infopy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | 7 | from libs.airtasks.initial import start_task, end_task 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_infopy", 13 | description="Show all installed python packages.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - execute pip freeze 20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze') 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> pip_task >> end_dummy 25 | -------------------------------------------------------------------------------- /part2/pipecraft/dags/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/libs/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/dags/libs/airtasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .initial import start_task, end_task 2 | -------------------------------------------------------------------------------- /part2/pipecraft/dags/libs/airtasks/initial.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.empty import EmptyOperator 2 | from typing import Optional 3 | 4 | 5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 6 | tid = "start" if task_id is None else task_id 7 | return EmptyOperator(task_id=tid, **kwargs) 8 | 9 | 10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 11 | tid = "end" if task_id is None else task_id 12 | return EmptyOperator(task_id=tid, **kwargs) 13 | 14 | 15 | -------------------------------------------------------------------------------- /part2/pipecraft/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/plugins/__init__.py -------------------------------------------------------------------------------- /part2/pipecraft/scripts/entry_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | airflow db migrate 4 | 5 | airflow users create \ 6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \ 7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \ 8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \ 9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \ 10 | 
--email "${_AIRFLOW_WWW_USER_EMAIL}" \ 11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true 12 | 13 | echo "Airflow database initialization completed." 14 | -------------------------------------------------------------------------------- /part2/pipecraft/scripts/gen_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | 4 | def get_fernet_key(): 5 | """Generates a fernet key.""" 6 | return Fernet.generate_key().decode() 7 | 8 | 9 | def main(): 10 | print(get_fernet_key()) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /part2/requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography~=42.0.5 2 | apache-airflow~=2.8.1 -------------------------------------------------------------------------------- /part3/QUICK_START.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | Follow these steps to set up the application using Docker Compose: 3 | 1. Change directory to `./part3/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the generated key. 4 | 2. Change directory to `./part3/compose/` and create a `.env` file (see `.env.template`): 5 | * Set the environment variable `AIRFLOW_FERNET_KEY` to the fernet key created in step 1. 6 | * Set the environment variable `BINANCE_API_KEY` to your [Binance API key](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072). 7 | 3. Open your terminal. 8 | 4. Change to the directory ``./part3/compose`` with the ``docker-compose.yaml`` file. 9 | 5. Initialize Apache Airflow by executing ``docker compose up airflow-init``. 10 | 6. Start the data infrastructure in detached mode by executing ``docker compose up -d``. 11 | 7. Access the Airflow web interface in a browser at ``localhost:8080``. Complete the one-time 12 | initialization of Timescale: 13 | - Create a connection to Timescale: Admin → Connections 14 | * Connection Id: timescale_conn_admin 15 | * Connection Type: Postgres 16 | * Host: host.docker.internal 17 | * Database: timescale 18 | * Login: admin 19 | * Password: password 20 | * Port: 5433 21 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles. 22 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables. 23 | 8. Start the Binance data pipelines. 24 | 25 | A detailed guide can be found here: [SDS #3: Robust Crypto Data Pipelines with Apache Airflow](https://x.com/bylethquant/status/1831899712749699506).
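As an optional alternative to clicking through the web UI in step 7, the same Timescale connection can be created from the command line. This is a sketch, assuming you run it from ``./part3/compose`` while the containers from step 6 are up and using the service name from ``docker-compose.yaml``:

```sh
# create the Airflow connection for Timescale without using the web UI
docker compose exec airflow-webserver airflow connections add timescale_conn_admin \
    --conn-type postgres \
    --conn-host host.docker.internal \
    --conn-schema timescale \
    --conn-login admin \
    --conn-password password \
    --conn-port 5433
```

The two initialization DAGs from step 7 can likewise be unpaused and triggered from the same container with ``airflow dags unpause`` and ``airflow dags trigger`` if you prefer the CLI over the web UI.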
-------------------------------------------------------------------------------- /part3/compose/.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_FERNET_KEY= 2 | BINANCE_API_KEY= -------------------------------------------------------------------------------- /part3/compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-part3 2 | 3 | x-airflow-common: 4 | &airflow-common 5 | image: apache/airflow:2.8.1-python3.11 6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}" 9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}" 10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}" 11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}" 12 | _AIRFLOW_WWW_USER_ROLE: "Admin" 13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}" 14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}" 15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}" 16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}" 17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}" 18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}" 19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}" 20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part3}" 21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false" 23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false" 24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true" 25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG" 26 | AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}" 27 | user: ${AIRFLOW_UID:-50000} 28 | depends_on: 29 | airflow-postgres: 30 | condition: service_healthy 31 | volumes: 32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs 33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags 34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config 35 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins 36 | 37 | services: 38 | 39 | airflow-webserver: 40 | <<: *airflow-common 41 | container_name: airflow-webserver 42 | command: webserver 43 | ports: 44 | - "${AIRFLOW_WWW_PORT:-8080}:8080" 45 | restart: always 46 | 47 | airflow-scheduler: 48 | <<: *airflow-common 49 | container_name: airflow-scheduler 50 | command: scheduler 51 | restart: always 52 | 53 | airflow-postgres: 54 | container_name: airflow-postgres 55 | image: postgres:13 56 | environment: 57 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}" 58 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}" 59 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}" 60 | ports: 61 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432" 62 | volumes: 63 | - ../.storage/postgres:/var/lib/postgresql/data 64 | healthcheck: 65 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ] 66 | interval: 5s 67 | retries: 2 68 | start_period: 3s 69 | restart: unless-stopped 70 | 71 | airflow-init: 72 | <<: *airflow-common 73 | container_name: airflow-init 74 | environment: 75 | <<: *airflow-common-env 76 | _AIRFLOW_DB_UPGRADE: true 77 | 
restart: no 78 | entrypoint: /opt/airflow/scripts/entry_init.sh 79 | volumes: 80 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts 81 | 82 | timescale: 83 | container_name: timescale 84 | image: timescale/timescaledb:latest-pg15 85 | environment: 86 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}" 87 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}" 88 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}" 89 | ports: 90 | - "${TIMESCALE_PORT:-5433}:5432" 91 | volumes: 92 | - ../.storage/timescale:/var/lib/postgresql/data 93 | healthcheck: 94 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ] 95 | interval: 5s 96 | retries: 2 97 | start_period: 3s 98 | restart: unless-stopped 99 | -------------------------------------------------------------------------------- /part3/pipecraft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/config/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | libs/ -------------------------------------------------------------------------------- /part3/pipecraft/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/binance_market_data/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import SPOT, FUTURE 2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS 3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS 4 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/config/funding.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *" 4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future" 5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 6 | "retries": 2} 7 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/config/kline.py: 
-------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *" 4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot" 5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future" 6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 7 | "retries": 2} 8 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/config/symbols.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base import Instrument, Venue, ContractType 2 | from datetime import datetime, timezone 3 | 4 | SPOT = [ 5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)), 6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)), 7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)), 8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)), 10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)), 13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)), 14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)), 15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)), 16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)), 17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)), 18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)), 19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc)) 20 | ] 21 | 22 | FUTURE = [ 23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)), 24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)), 25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)), 26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)), 27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)), 28 | Instrument("ETHUSDT", Venue.binance, ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)), 29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)), 31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)), 32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)), 33 | Instrument("LTCUSDT", 
Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)), 34 | Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)), 35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)), 36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)), 37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc)) 38 | ] 39 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def generate_binance_funding_rate_dag(dag_id: str, 17 | instrument: Instrument, 18 | schedule_interval: str, 19 | catchup: bool = False, 20 | testnet: bool = False) -> DAG: 21 | """Generates a DAG for binance funding rate data pipeline.""" 22 | with DAG(dag_id=dag_id, 23 | description="Data ingestion pipeline for Binance funding rates.", 24 | start_date=instrument.first_date, 25 | catchup=catchup, 26 | schedule_interval=schedule_interval, 27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag: 28 | # task flow 29 | start_dummy = start_task() 30 | binance_keys = retrieve_binance_secrets() 31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet) 33 | transform = etl_funding_tasks.transform_data(extract) 34 | ingest = etl_funding_tasks.insert_data(transform) 35 | end_dummy = end_task() 36 | 37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 38 | 39 | return dag 40 | 41 | 42 | # create DAGs for funding rates 43 | for instr in dag_config.FUTURE: 44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}" 45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id, 46 | instrument=instr, 47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP) 48 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/dag_binance_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_kline as etl_kline_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | # create module logger 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def generate_binance_candlestick_dag(dag_id: str, 16 | instrument: Instrument, 17 | schedule_interval: str, 18 | catchup: bool = False, 19 | testnet: bool = False) -> DAG: 20 | """Generates a 
DAG for binance candlestick data pipeline.""" 21 | with DAG(dag_id=dag_id, 22 | description="Data ingestion pipeline for Binance candlestick data.", 23 | start_date=instrument.first_date, 24 | catchup=catchup, 25 | schedule_interval=schedule_interval, 26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag: 27 | # task flow 28 | # - create start task 29 | start_dummy = start_task() 30 | # - retrieve binance api keys 31 | binance_keys = retrieve_binance_secrets() 32 | # - test connectivity of binance api 33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 34 | # - fetch binance candlestick data 35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet) 36 | # - transform data 37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol) 38 | # - insert data to timescale database 39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform) 40 | # - create end task 41 | end_dummy = end_task() 42 | 43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 44 | 45 | return dag 46 | 47 | 48 | # create DAGs for kline 49 | for instr in dag_config.SPOT + dag_config.FUTURE: 50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}" 51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id, 52 | instrument=instr, 53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE) 54 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/process/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/binance_market_data/process/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/process/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from typing import Dict, Any 4 | from airflow.models import Variable 5 | from airflow.decorators import task 6 | 7 | from libs.venues import binance as binance_client 8 | from libs.venues.base import ContractType 9 | 10 | # module logger 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @task 15 | def retrieve_binance_secrets() -> Dict[str, Any]: 16 | """Retrieves Binance API keys.""" 17 | try: 18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY")) 19 | except Exception as exc: 20 | logger.exception(f"Retrieving Binance keys failed. 
Msg: {exc}.") 21 | raise 22 | else: 23 | logger.info(f"Retrieving Binance keys was successful.") 24 | return binance_keys.as_dict() 25 | 26 | 27 | @task 28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None: 29 | """Tests connectivity to the Rest API.""" 30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api, 31 | ContractType.future: binance_client.ping_future_api} 32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet) 33 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/process/etl_funding_future.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, Dict, Any, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME 11 | 12 | 13 | # module logger 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | @task 18 | def fetch_data(auth: dict, 19 | symbol: str, 20 | testnet: bool = False, 21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]: 22 | """Fetches funding rate data.""" 23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 24 | start_time = datetime(data_interval_start.year, 25 | data_interval_start.month, 26 | data_interval_start.day, 27 | data_interval_start.hour) 28 | end_time = start_time + timedelta(days=1) 29 | # fetch funding rate data 30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth), 31 | symbol=symbol, 32 | start_time=start_time, 33 | end_time=end_time, 34 | testnet=testnet) 35 | return response 36 | 37 | 38 | @task 39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame: 40 | """Transforms funding rate response from API. """ 41 | try: 42 | # process funding rate 43 | field_types = binance_client.FundingRate.get_field_types() 44 | df = pd.DataFrame(data=response) 45 | # re-name columns 46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict()) 47 | # remove ignore columns 48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 49 | # set type of each column that is kept 50 | for i_col in df.columns: 51 | df = df.astype({i_col: field_types[i_col]}) 52 | # timestamp 53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True) 54 | except Exception as exc: 55 | logger.exception(f"Transformation of data: failed. {exc}") 56 | raise 57 | else: 58 | logger.info("Transformation of data: successful.") 59 | return df 60 | 61 | 62 | @task 63 | def insert_data(df: pd.DataFrame) -> None: 64 | """Inserts funding rate data to timescale.""" 65 | try: 66 | conn_id = retrieve_conn_id() 67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df) 68 | except Exception as exc: 69 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 70 | raise 71 | else: 72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.") 73 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/binance_market_data/process/etl_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from libs.venues.base import ContractType, Instrument 11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME 12 | 13 | 14 | # module logger 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @task 19 | def fetch_data(auth: dict, 20 | instrument: Instrument, 21 | testnet: bool = False, 22 | data_interval_start: Optional[datetime] = None) -> List[list]: 23 | """Sends get request to fetch candlestick data for the previous hour.""" 24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline, 25 | ContractType.future: binance_client.fetch_future_kline} 26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 27 | start_time = datetime(data_interval_start.year, 28 | data_interval_start.month, 29 | data_interval_start.day, 30 | data_interval_start.hour) 31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1) 32 | # fetch candlestick data 33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth), 34 | symbol=instrument.symbol, 35 | start_time=start_time, 36 | end_time=end_time, 37 | testnet=testnet) 38 | return response 39 | 40 | 41 | @task 42 | def transform_data(response: list, symbol: str) -> pd.DataFrame: 43 | """Transforms the data and prepares to insert.""" 44 | try: 45 | # process klines 46 | field_types = binance_client.Kline.get_field_types() 47 | df = pd.DataFrame(data=response, columns=list(field_types.keys())) 48 | # remove ignore columns 49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 50 | # set type of each column that is kept 51 | for i_col in df.columns: 52 | df = df.astype({i_col: field_types[i_col]}) 53 | # set time 54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True) 55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True) 56 | # add symbol column 57 | df["symbol"] = symbol 58 | except Exception as exc: 59 | logger.exception(f"Transformation of data: failed. {exc}") 60 | raise 61 | else: 62 | logger.info("Transformation of data: successful.") 63 | return df 64 | 65 | 66 | @task 67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None: 68 | """Inserts data to timescale.""" 69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME, 70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME} 71 | table_name = timescale_schema_map[contract_type] 72 | try: 73 | conn_id = retrieve_conn_id() 74 | ingest_data(conn_id, table_name, df) 75 | except Exception as exc: 76 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 77 | raise 78 | else: 79 | logger.info(f"Insert data to timescale table {table_name}: successful.") 80 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/infopy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/infopy/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/infopy/dag_infopy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | 7 | from libs.airtasks.initial import start_task, end_task 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_infopy", 13 | description="Show all installed python packages.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - execute pip freeze 20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze') 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> pip_task >> end_dummy 25 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/__init__.py: -------------------------------------------------------------------------------- 1 | from . import venues 2 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/airtasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .initial import start_task, end_task 2 | from . 
import timescale 3 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/airtasks/initial.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.empty import EmptyOperator 2 | from typing import Optional 3 | 4 | 5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 6 | tid = "start" if task_id is None else task_id 7 | return EmptyOperator(task_id=tid, **kwargs) 8 | 9 | 10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 11 | tid = "end" if task_id is None else task_id 12 | return EmptyOperator(task_id=tid, **kwargs) 13 | 14 | 15 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/airtasks/timescale/__init__.py: -------------------------------------------------------------------------------- 1 | from .ingester import ingest_data 2 | from .conn import retrieve_conn_id 3 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/airtasks/timescale/conn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.models import Variable 4 | 5 | # create module logger 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def retrieve_conn_id(id_key: str = "admin") -> str: 10 | """Retrieves timescale connection id.""" 11 | try: 12 | if id_key == "admin": 13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN") 14 | elif id_key == "readonly": 15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY") 16 | else: 17 | raise ValueError("Unknown id_key. Select admin or readonly.") 18 | except Exception as exc: 19 | logger.exception(f"Retrieving admin timescale connection id: failed. {exc}.") 20 | raise 21 | else: 22 | logger.info(f"Retrieving admin timescale connection id: successful.") 23 | return conn_id 24 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/airtasks/timescale/ingester.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from psycopg2.extras import execute_values 5 | from psycopg2.extensions import connection 6 | from airflow.providers.postgres.hooks.postgres import PostgresHook 7 | 8 | # create module logger 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None: 13 | """Bulk insert to timescale.""" 14 | try: 15 | # create a list of tuples from dataframe 16 | data_tuples = [tuple(x) for x in df_data.to_numpy()] 17 | # comma-separated dataframe columns 18 | cols = ','.join(list(df_data.columns)) 19 | # SQL query to execute 20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols) 21 | with conn.cursor() as crs: 22 | execute_values(crs, query, data_tuples) 23 | conn.commit() 24 | except Exception as exc: 25 | logger.exception(f"Bulk insert: failed. 
{exc}.") 26 | raise 27 | else: 28 | logger.info("Bulk insert: successful.") 29 | 30 | 31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None: 32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 33 | _bulk_insert(conn, table_name, df_data) 34 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/__init__.py: -------------------------------------------------------------------------------- 1 | from . import binance 2 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure 2 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/base/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from dataclasses import dataclass, fields 3 | from datetime import datetime 4 | 5 | 6 | class Venue(Enum): 7 | """Crypto venues.""" 8 | binance = "binance" 9 | 10 | 11 | class VenueAuthentication: 12 | """Base class to authenticate at a venue.""" 13 | pass 14 | 15 | 16 | class VenueNet(Enum): 17 | """Production vs test environment.""" 18 | mainnet = "mainnet" 19 | testnet = "testnet" 20 | 21 | 22 | class ContractType(Enum): 23 | """The contract type of traded instrument.""" 24 | spot = "spot" 25 | future = "future" 26 | 27 | 28 | @dataclass 29 | class Instrument: 30 | """The traded instrument.""" 31 | symbol: str 32 | venue: Venue 33 | contract_type: ContractType 34 | first_date: datetime 35 | 36 | 37 | @dataclass 38 | class MarketDataStructure: 39 | """Base class for market data API responses.""" 40 | 41 | @classmethod 42 | def get_field_types(cls) -> dict: 43 | return {field.name: field.type for field in fields(cls)} 44 | 45 | 46 | @dataclass 47 | class RequestResultLimit: 48 | """Default and maximum limit on result of an API market data request.""" 49 | default: int 50 | max: int 51 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/binance/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import BinanceAuth 2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api 3 | from .config import * 4 | from .types import Kline, FundingRate 5 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/binance/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from datetime import datetime 4 | from requests import Response, HTTPError 5 | from typing import Optional, Dict, Any, List 6 | from tenacity import retry, stop_after_attempt, wait_exponential 7 | from time import sleep 8 | 9 | from libs.venues.base.base import ContractType, VenueNet 10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers 11 | import libs.venues.binance.config as binance_config 12 | 13 | # create module logger 14 | logger = logging.getLogger(__name__) 15 | # log messages from requests above level warning 16 | logging.getLogger('urllib3').setLevel(logging.WARNING) 17 | 18 | 
# module constants 19 | _KLINE_INTERVAL: str = "1m" 20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60 21 | 22 | 23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str: 24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL, 25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL}, 26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL, 27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}} 28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet] 29 | 30 | 31 | def _get_kline_endpoint(contract_type: ContractType) -> str: 32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE, 33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE} 34 | return kline_ep_map[contract_type] 35 | 36 | 37 | def _get_ping_endpoint(contract_type: ContractType) -> str: 38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING, 39 | ContractType.future: binance_config.FUT_ENDPOINT_PING} 40 | return ping_ep_map[contract_type] 41 | 42 | 43 | def _raise_for_status(response: Response) -> None: 44 | try: 45 | response.raise_for_status() 46 | except HTTPError as http_err: 47 | if response.status_code == 429: 48 | logger.exception(f"Binance rate limit was reached. " 49 | f"I need to sleep immediately for a while to avoid any IP ban!") 50 | sleep(5*60) 51 | logger.exception(http_err) 52 | raise 53 | 54 | 55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10)) 56 | def _fetch_api_data(auth: BinanceAuth, 57 | base_url: str, 58 | endpoint: str, 59 | symbol: Optional[str] = None, 60 | start_time: Optional[datetime] = None, 61 | end_time: Optional[datetime] = None, 62 | kline_interval: Optional[str] = None, 63 | request_result_limit: int = None, 64 | request_timeout_in_secs: int = 10) -> Any: 65 | """Market data fetcher for Binance API.""" 66 | request_url: str = f"{base_url}{endpoint}" 67 | headers: dict = prepare_binance_request_headers(auth) 68 | 69 | # build request url, if necessary 70 | if symbol is not None: 71 | request_url += f"?symbol={symbol}" 72 | if start_time is not None: 73 | request_url += f"&startTime={to_ms_int(start_time)}" 74 | if end_time is not None: 75 | request_url += f"&endTime={to_ms_int(end_time)}" 76 | if kline_interval is not None: 77 | request_url += f"&interval={kline_interval}" 78 | if request_result_limit is not None: 79 | request_url += f"&limit={request_result_limit}" 80 | # send get request 81 | response = requests.get(request_url, 82 | headers=headers, 83 | timeout=request_timeout_in_secs) 84 | _raise_for_status(response) 85 | return response.json() 86 | 87 | 88 | def fetch_spot_kline(auth: BinanceAuth, 89 | symbol: str, 90 | start_time: datetime, 91 | end_time: datetime, 92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default, 93 | testnet: bool = False) -> List[list]: 94 | """Fetches spot kline market data from Binance API.""" 95 | return _fetch_api_data(auth=auth, 96 | base_url=_get_base_url(ContractType.spot, testnet), 97 | endpoint=_get_kline_endpoint(ContractType.spot), 98 | symbol=symbol, 99 | start_time=start_time, 100 | end_time=end_time, 101 | request_result_limit=request_result_limit, 102 | kline_interval=_KLINE_INTERVAL) 103 | 104 | 105 | def fetch_future_kline(auth: BinanceAuth, 106 | symbol: str, 107 | start_time: Optional[datetime] = None, 108 | end_time: Optional[datetime] = None, 109 | request_result_limit: int = 
binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default, 110 | testnet: bool = False) -> List[list]: 111 | """Fetches future kline market data from Binance API.""" 112 | return _fetch_api_data(auth=auth, 113 | base_url=_get_base_url(ContractType.future, testnet), 114 | endpoint=_get_kline_endpoint(ContractType.future), 115 | symbol=symbol, 116 | start_time=start_time, 117 | end_time=end_time, 118 | request_result_limit=request_result_limit, 119 | kline_interval=_KLINE_INTERVAL) 120 | 121 | 122 | def fetch_funding_rate(auth: BinanceAuth, 123 | symbol: str, 124 | start_time: Optional[datetime] = None, 125 | end_time: Optional[datetime] = None, 126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default, 127 | testnet: bool = False) -> List[Dict[str, Any]]: 128 | """Fetches funding rate market data from Binance API.""" 129 | return _fetch_api_data(auth=auth, 130 | base_url=_get_base_url(ContractType.future, testnet), 131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING, 132 | symbol=symbol, 133 | start_time=start_time, 134 | end_time=end_time, 135 | request_result_limit=request_result_limit) 136 | 137 | 138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict: 139 | """Tests connectivity to spot Binance API.""" 140 | return _fetch_api_data(auth=auth, 141 | base_url=_get_base_url(ContractType.spot, testnet), 142 | endpoint=binance_config.SPOT_ENDPOINT_PING) 143 | 144 | 145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict: 146 | """Tests connectivity to future Binance API.""" 147 | return _fetch_api_data(auth=auth, 148 | base_url=_get_base_url(ContractType.future, testnet), 149 | endpoint=binance_config.FUT_ENDPOINT_PING) 150 | 151 | 152 | def fetch_spot_exchange_info() -> Dict[str, Any]: 153 | raise NotImplementedError 154 | 155 | 156 | def fetch_fut_exchange_info() -> Dict[str, Any]: 157 | raise NotImplementedError 158 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/binance/common.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | from datetime import datetime, timezone 3 | from typing import Dict, Any 4 | 5 | from libs.venues.base.base import VenueAuthentication 6 | 7 | 8 | @dataclass 9 | class BinanceAuth(VenueAuthentication): 10 | BINANCE_API_KEY: str 11 | 12 | @classmethod 13 | def from_dict(cls, auth_dict: Dict[str, str]): 14 | return cls(auth_dict["BINANCE_API_KEY"]) 15 | 16 | def as_dict(self) -> Dict[str, str]: 17 | return asdict(self) 18 | 19 | 20 | def to_ms_int(dt: datetime) -> int: 21 | """Converts datetime timestamp to integer in ms.""" 22 | return int(round(dt.timestamp() * 1000)) 23 | 24 | 25 | def to_dt(ms_int: int) -> datetime: 26 | """Converts timestamp in ms (integer) to datetime.""" 27 | return datetime.utcfromtimestamp(ms_int / 1000).replace(tzinfo=timezone.utc) 28 | 29 | 30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]: 31 | """Creates headers for Binance REST API.""" 32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY} 33 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/binance/config.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base.base import RequestResultLimit 2 | 3 | 4 | # spot base 5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info 6 | SPOT_MAINNET_URL: str = 
"https://api.binance.com" 7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision" 8 | SPOT_REQUEST_RATE_LIMIT: int = 6000 9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1 10 | 11 | # spot ping 12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity 13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping" 14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 15 | 16 | # spot exchange info 17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information 18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo" 19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20 20 | 21 | # spot kline 22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data 23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines" 24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2 25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000) 26 | 27 | # futures base 28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info 29 | FUT_MAINNET_URL: str = "https://fapi.binance.com" 30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com" 31 | FUT_REQUEST_RATE_LIMIT: int = 2400 32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1 33 | 34 | # future ping 35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity 36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping" 37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 38 | 39 | # future exchangeInfo 40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information 41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo" 42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1 43 | 44 | # future funding rate 45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history 46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate" 47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500 48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5 49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000) 50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption 51 | 52 | # future kline 53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data 54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines" 55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500) 56 | 57 | 58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int: 59 | """Returns the weight conditional on the request result limit.""" 60 | if (request_result_limit >= 1) & (request_result_limit < 100): 61 | weight = 1 62 | elif (request_result_limit >= 100) & (request_result_limit < 500): 63 | weight = 2 64 | elif (request_result_limit >= 500) & (request_result_limit < 1000): 65 | weight = 5 66 | else: 67 | weight = 10 68 | return weight 69 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/libs/venues/binance/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from libs.venues.base.base import MarketDataStructure 5 | 6 | 7 | @dataclass 8 | class Kline(MarketDataStructure): 9 | open_time: int 10 | open: float 11 | high: float 12 | low: float 13 | close: float 14 | volume: float 15 | close_time: int 16 | quote_asset_volume: float 17 | number_of_trades: int 18 | taker_buy_base_asset_volume: float 19 | taker_buy_quote_asset_volume: float 20 | ignored: Any 21 | 22 | 23 | @dataclass 24 | class FundingRate(MarketDataStructure): 25 | symbol: str 26 | time: int 27 | funding_rate: float 28 | ignored: Any 29 | 30 | 
@staticmethod 31 | def get_rename_dict() -> dict: 32 | return {"symbol": "symbol", 33 | "fundingTime": "time", 34 | "fundingRate": "funding_rate", 35 | "markPrice": "ignored"} 36 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/timescale_init/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/dag_timescale_roles.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_roles 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_roles", 13 | description="Timescale initialization pipeline for creating user roles.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create read only user role 20 | roles = create_roles("dags/timescale_init/process/create_roles.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> roles >> end_dummy 25 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/dag_timescale_tables.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_tables 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_tables", 13 | description="Timescale initialization pipeline for creating hypertables.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create hypertables 20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> tables >> end_dummy 25 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .tsinit import create_roles, create_tables 2 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/process/create_hypertables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS binance_kline_spot ( 2 | open_time TIMESTAMPTZ, 3 | symbol TEXT NOT NULL, 4 | open DOUBLE PRECISION, 5 | high DOUBLE PRECISION, 6 | low DOUBLE PRECISION, 7 | close DOUBLE PRECISION, 8 | volume DOUBLE PRECISION, 9 | close_time TIMESTAMPTZ, 10 | quote_asset_volume DOUBLE PRECISION, 11 | number_of_trades BIGINT, 12 | taker_buy_base_asset_volume DOUBLE PRECISION, 13 | taker_buy_quote_asset_volume DOUBLE 
PRECISION 14 | ); 15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE); 16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC); 17 | 18 | CREATE TABLE IF NOT EXISTS binance_kline_future ( 19 | open_time TIMESTAMPTZ, 20 | symbol TEXT NOT NULL, 21 | open DOUBLE PRECISION, 22 | high DOUBLE PRECISION, 23 | low DOUBLE PRECISION, 24 | close DOUBLE PRECISION, 25 | volume DOUBLE PRECISION, 26 | close_time TIMESTAMPTZ, 27 | quote_asset_volume DOUBLE PRECISION, 28 | number_of_trades BIGINT, 29 | taker_buy_base_asset_volume DOUBLE PRECISION, 30 | taker_buy_quote_asset_volume DOUBLE PRECISION 31 | ); 32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE); 33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC); 34 | 35 | 36 | CREATE TABLE IF NOT EXISTS binance_funding_future ( 37 | time TIMESTAMPTZ, 38 | symbol TEXT NOT NULL, 39 | funding_rate DOUBLE PRECISION 40 | ); 41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE); 42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC); 43 | -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/process/create_roles.sql: -------------------------------------------------------------------------------- 1 | CREATE ROLE readaccess; 2 | GRANT USAGE ON SCHEMA public TO readaccess; 3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess; 4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess; 5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD}; 6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME}; -------------------------------------------------------------------------------- /part3/pipecraft/dags/timescale_init/process/tsinit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from psycopg2 import sql 5 | from psycopg2.sql import Composable 6 | from airflow.models import Variable 7 | from airflow.decorators import task 8 | from typing import Union 9 | 10 | from libs.airtasks.timescale import retrieve_conn_id 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _read_sql(path: str) -> str: 17 | """Reads an sql script.""" 18 | try: 19 | with open(path, "r") as sql_script: 20 | sql_cmd_str = sql_script.read() 21 | except Exception as exc: 22 | logger.exception(f"Could not read sql file. {exc}") 23 | raise 24 | else: 25 | logger.info(f"Read sql file successfully.") 26 | return sql_cmd_str 27 | 28 | 29 | def _get_roles_sql(path_str: str) -> Composable: 30 | """Constructs the sql script for creating roles.""" 31 | # read file 32 | sql_cmd_str = _read_sql(path_str) 33 | try: 34 | # replace dummy variables with environmental variables 35 | sql_cmd = sql.SQL(sql_cmd_str).format( 36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")), 37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD")) 38 | ) 39 | logger.info(Variable.get("TIMESCALE_READONLY_PASSWORD")) 40 | logger.info(type(Variable.get("TIMESCALE_READONLY_PASSWORD"))) 41 | except Exception as exc: 42 | logger.exception(f"Get create roles sql statement: failed. 
{exc}") 43 | raise 44 | else: 45 | logger.info("Get create roles sql statement: successful.") 46 | return sql_cmd 47 | 48 | 49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None: 50 | try: 51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}") 53 | with conn.cursor() as crs: 54 | # execute sql 55 | crs.execute(sql_cmd) 56 | # commit 57 | conn.commit() 58 | except Exception as exc: 59 | logger.exception(f"Executing query: failed. {exc}") 60 | raise 61 | else: 62 | logger.info(f"Executing query: successful.") 63 | 64 | 65 | @task 66 | def create_roles(path_str: str) -> None: 67 | """Creates roles.""" 68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str)) 69 | 70 | 71 | @task 72 | def create_tables(path_str: str) -> None: 73 | """Creates hypertables.""" 74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str)) 75 | 76 | -------------------------------------------------------------------------------- /part3/pipecraft/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/plugins/__init__.py -------------------------------------------------------------------------------- /part3/pipecraft/scripts/entry_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | airflow db migrate 4 | 5 | airflow users create \ 6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \ 7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \ 8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \ 9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \ 10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \ 11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true 12 | 13 | echo "Airflow database initialization completed." 14 | -------------------------------------------------------------------------------- /part3/pipecraft/scripts/gen_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | 4 | def get_fernet_key(): 5 | """Generates a fernet key.""" 6 | return Fernet.generate_key().decode() 7 | 8 | 9 | def main(): 10 | print(get_fernet_key()) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /part3/requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography~=42.0.5 2 | apache-airflow~=2.8.1 3 | apache-airflow-providers-postgres~=5.10.0 4 | numpy~=1.24.4 5 | pandas~=2.0.3 6 | psycopg2-binary~=2.9.7 7 | requests~=2.31.0 8 | tenacity~=8.2.3 -------------------------------------------------------------------------------- /part4/QUICK_START.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Follow these steps to set up the application using Docker Compose: 4 | 5 | 1. Change directory to `./part4/compose/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy key. 6 | 2. Change directory to `./part4/compose/` and create a `.env` file (see `.env.template`): 7 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1. 
8 | * Set the environment variable `BINANCE_API_KEY` with 9 | your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072). 10 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and 11 | `TIMESCALE_READONLY_PASSWORD`. 12 | 3. Open your terminal. 13 | 4. Initialize Apache Airflow by executing ``docker compose up airflow-init``. 14 | 5. Start the data infrastructure in detached mode by executing ``docker compose up -d``. 15 | 6. Access Airflow web interface through a browser at ``localhost:8080``. Complete the one-time 16 | initialization of Timescale: 17 | - Create a connection to Timescale: Admin → Connections 18 | * Connection Id: timescale_conn_admin 19 | * Connection Type: Postgres 20 | * Host: host.docker.internal 21 | * Database: timescale 22 | * Login: admin 23 | * Password: password 24 | * Port: 5433 25 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles. 26 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables. 27 | 7. Start the Binance data pipelines. 28 | 8. Access Grafana web interface through a browser at ``localhost:3000``. 29 | 30 | A detailed guide can be found here: [SDS #4: Crypto Market Data Dashboard with Grafana](https://x.com/bylethquant/status/1833141733305295348). -------------------------------------------------------------------------------- /part4/compose/.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_FERNET_KEY= 2 | BINANCE_API_KEY= 3 | # needed for setting grafana datasources.yaml correctly 4 | TIMESCALE_PORT=5433 5 | TIMESCALE_DATABASE_NAME=timescale 6 | TIMESCALE_READONLY_USERNAME=user 7 | TIMESCALE_READONLY_PASSWORD=password -------------------------------------------------------------------------------- /part4/compose/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-part4 2 | 3 | x-airflow-common: 4 | &airflow-common 5 | image: apache/airflow:2.8.1-python3.11 6 | environment: 7 | &airflow-common-env 8 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}" 9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}" 10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}" 11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}" 12 | _AIRFLOW_WWW_USER_ROLE: "Admin" 13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}" 14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}" 15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}" 16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}" 17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}" 18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}" 19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}" 20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part4}" 21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false" 23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false" 24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true" 25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG" 26 | 
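# environment variables prefixed with AIRFLOW_VAR_ are exposed to DAGs as Airflow Variables (read via Variable.get)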
AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}" 27 | user: ${AIRFLOW_UID:-50000} 28 | depends_on: 29 | airflow-postgres: 30 | condition: service_healthy 31 | volumes: 32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs 33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags 34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config 35 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins 36 | 37 | services: 38 | 39 | airflow-webserver: 40 | <<: *airflow-common 41 | container_name: airflow-webserver 42 | command: webserver 43 | ports: 44 | - "${AIRFLOW_WWW_PORT:-8080}:8080" 45 | restart: always 46 | 47 | airflow-scheduler: 48 | <<: *airflow-common 49 | container_name: airflow-scheduler 50 | command: scheduler 51 | restart: always 52 | 53 | airflow-postgres: 54 | container_name: airflow-postgres 55 | image: postgres:13 56 | environment: 57 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}" 58 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}" 59 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}" 60 | ports: 61 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432" 62 | volumes: 63 | - ../.storage/postgres:/var/lib/postgresql/data 64 | healthcheck: 65 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ] 66 | interval: 5s 67 | retries: 2 68 | start_period: 3s 69 | restart: unless-stopped 70 | 71 | airflow-init: 72 | <<: *airflow-common 73 | container_name: airflow-init 74 | environment: 75 | <<: *airflow-common-env 76 | _AIRFLOW_DB_UPGRADE: true 77 | restart: no 78 | entrypoint: /opt/airflow/scripts/entry_init.sh 79 | volumes: 80 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts 81 | 82 | timescale: 83 | container_name: timescale 84 | image: timescale/timescaledb:latest-pg15 85 | environment: 86 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}" 87 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}" 88 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}" 89 | ports: 90 | - "${TIMESCALE_PORT:-5433}:5432" 91 | volumes: 92 | - ../.storage/timescale:/var/lib/postgresql/data 93 | healthcheck: 94 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ] 95 | interval: 5s 96 | retries: 2 97 | start_period: 3s 98 | restart: unless-stopped 99 | 100 | grafana: 101 | container_name: grafana 102 | image: grafana/grafana:10.0.2 103 | environment: 104 | GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" 105 | GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-password}" 106 | GF_DATABASE_SSL_MODE: disable 107 | GF_ENABLE_GZIP: true 108 | env_file: 109 | - .env 110 | ports: 111 | - "${GRAFANA_PORT:-3000}:3000" 112 | depends_on: 113 | timescale: 114 | condition: service_healthy 115 | volumes: 116 | - ../grafana/provisioning:/etc/grafana/provisioning 117 | - ../grafana/dashboards:/var/lib/grafana/dashboards 118 | restart: unless-stopped -------------------------------------------------------------------------------- /part4/grafana/provisioning/dashboards/dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'dashboards' 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: true 10 | editable: true 11 | updateIntervalSeconds: 10 12 | allowUiUpdates: false 13 | options: 14 | path: /var/lib/grafana/dashboards 
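15 | # the file provider above re-scans /var/lib/grafana/dashboards every 10 seconds; UI edits are not persisted (allowUiUpdates: false) and provisioned dashboards cannot be deleted from the UI (disableDeletion: true)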
-------------------------------------------------------------------------------- /part4/grafana/provisioning/datasources/datasources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: timescale 5 | type: postgres 6 | url: "host.docker.internal:${TIMESCALE_PORT}" 7 | database: "${TIMESCALE_DATABASE_NAME}" 8 | user: "${TIMESCALE_READONLY_USERNAME}" 9 | secureJsonData: 10 | password: "${TIMESCALE_READONLY_PASSWORD}" 11 | jsonData: 12 | postgresVersion: 1500 13 | sslmode: "disable" 14 | timescaledb: true 15 | tlsAuth: false 16 | tlsAuthWithCACert: false 17 | tlsConfigurationMethod: "file-path" 18 | tlsSkipVerify: true -------------------------------------------------------------------------------- /part4/pipecraft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/config/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | libs/ -------------------------------------------------------------------------------- /part4/pipecraft/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/binance_market_data/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import SPOT, FUTURE 2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS 3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS 4 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/config/funding.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *" 4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future" 5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 6 | "retries": 2} 7 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/config/kline.py: -------------------------------------------------------------------------------- 1 | from datetime 
import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *" 4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot" 5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future" 6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 7 | "retries": 2} 8 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/config/symbols.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base import Instrument, Venue, ContractType 2 | from datetime import datetime, timezone 3 | 4 | SPOT = [ 5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)), 6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)), 7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)), 8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)), 10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)), 13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)), 14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)), 15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)), 16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)), 17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)), 18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)), 19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc)) 20 | ] 21 | 22 | FUTURE = [ 23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)), 24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)), 25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)), 26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)), 27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)), 28 | Instrument("ETHUSDT", Venue.binance, ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)), 29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)), 31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)), 32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)), 33 | Instrument("LTCUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)), 34 | 
Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)), 35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)), 36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)), 37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc)) 38 | ] 39 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def generate_binance_funding_rate_dag(dag_id: str, 17 | instrument: Instrument, 18 | schedule_interval: str, 19 | catchup: bool = False, 20 | testnet: bool = False) -> DAG: 21 | """Generates a DAG for binance funding rate data pipeline.""" 22 | with DAG(dag_id=dag_id, 23 | description="Data ingestion pipeline for Binance funding rates.", 24 | start_date=instrument.first_date, 25 | catchup=catchup, 26 | schedule_interval=schedule_interval, 27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag: 28 | # task flow 29 | start_dummy = start_task() 30 | binance_keys = retrieve_binance_secrets() 31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet) 33 | transform = etl_funding_tasks.transform_data(extract) 34 | ingest = etl_funding_tasks.insert_data(transform) 35 | end_dummy = end_task() 36 | 37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 38 | 39 | return dag 40 | 41 | 42 | # create DAGs for funding rates 43 | for instr in dag_config.FUTURE: 44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}" 45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id, 46 | instrument=instr, 47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP) 48 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/dag_binance_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_kline as etl_kline_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | # create module logger 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def generate_binance_candlestick_dag(dag_id: str, 16 | instrument: Instrument, 17 | schedule_interval: str, 18 | catchup: bool = False, 19 | testnet: bool = False) -> DAG: 20 | """Generates a DAG for binance candlestick data pipeline.""" 21 | with DAG(dag_id=dag_id, 22 | 
description="Data ingestion pipeline for Binance candlestick data.", 23 | start_date=instrument.first_date, 24 | catchup=catchup, 25 | schedule_interval=schedule_interval, 26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag: 27 | # task flow 28 | # - create start task 29 | start_dummy = start_task() 30 | # - retrieve binance api keys 31 | binance_keys = retrieve_binance_secrets() 32 | # - test connectivity of binance api 33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 34 | # - fetch binance candlestick data 35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet) 36 | # - transform data 37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol) 38 | # - insert data to timescale database 39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform) 40 | # - create end task 41 | end_dummy = end_task() 42 | 43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 44 | 45 | return dag 46 | 47 | 48 | # create DAGs for kline 49 | for instr in dag_config.SPOT + dag_config.FUTURE: 50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}" 51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id, 52 | instrument=instr, 53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE) 54 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/process/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/binance_market_data/process/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/process/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from typing import Dict, Any 4 | from airflow.models import Variable 5 | from airflow.decorators import task 6 | 7 | from libs.venues import binance as binance_client 8 | from libs.venues.base import ContractType 9 | 10 | # module logger 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @task 15 | def retrieve_binance_secrets() -> Dict[str, Any]: 16 | """Retrieves Binance API keys.""" 17 | try: 18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY")) 19 | except Exception as exc: 20 | logger.exception(f"Retrieving Binance keys failed. 
Msg: {exc}.") 21 | raise 22 | else: 23 | logger.info(f"Retrieving Binance keys was successful.") 24 | return binance_keys.as_dict() 25 | 26 | 27 | @task 28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None: 29 | """Tests connectivity to the Rest API.""" 30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api, 31 | ContractType.future: binance_client.ping_future_api} 32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet) 33 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/process/etl_funding_future.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, Dict, Any, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME 11 | 12 | 13 | # module logger 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | @task 18 | def fetch_data(auth: dict, 19 | symbol: str, 20 | testnet: bool = False, 21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]: 22 | """Fetches funding rate data.""" 23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 24 | start_time = datetime(data_interval_start.year, 25 | data_interval_start.month, 26 | data_interval_start.day, 27 | data_interval_start.hour) 28 | end_time = start_time + timedelta(days=1) 29 | # fetch funding rate data 30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth), 31 | symbol=symbol, 32 | start_time=start_time, 33 | end_time=end_time, 34 | testnet=testnet) 35 | return response 36 | 37 | 38 | @task 39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame: 40 | """Transforms funding rate response from API. """ 41 | try: 42 | # process funding rate 43 | field_types = binance_client.FundingRate.get_field_types() 44 | df = pd.DataFrame(data=response) 45 | # re-name columns 46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict()) 47 | # remove ignore columns 48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 49 | # set type of each column that is kept 50 | for i_col in df.columns: 51 | df = df.astype({i_col: field_types[i_col]}) 52 | # timestamp 53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True) 54 | except Exception as exc: 55 | logger.exception(f"Transformation of data: failed. {exc}") 56 | raise 57 | else: 58 | logger.info("Transformation of data: successful.") 59 | return df 60 | 61 | 62 | @task 63 | def insert_data(df: pd.DataFrame) -> None: 64 | """Inserts funding rate data to timescale.""" 65 | try: 66 | conn_id = retrieve_conn_id() 67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df) 68 | except Exception as exc: 69 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 70 | raise 71 | else: 72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.") 73 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/binance_market_data/process/etl_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from libs.venues.base import ContractType, Instrument 11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME 12 | 13 | 14 | # module logger 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @task 19 | def fetch_data(auth: dict, 20 | instrument: Instrument, 21 | testnet: bool = False, 22 | data_interval_start: Optional[datetime] = None) -> List[list]: 23 | """Sends get request to fetch candlestick data for the previous hour.""" 24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline, 25 | ContractType.future: binance_client.fetch_future_kline} 26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 27 | start_time = datetime(data_interval_start.year, 28 | data_interval_start.month, 29 | data_interval_start.day, 30 | data_interval_start.hour) 31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1) 32 | # fetch candlestick data 33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth), 34 | symbol=instrument.symbol, 35 | start_time=start_time, 36 | end_time=end_time, 37 | testnet=testnet) 38 | return response 39 | 40 | 41 | @task 42 | def transform_data(response: list, symbol: str) -> pd.DataFrame: 43 | """Transforms the data and prepares to insert.""" 44 | try: 45 | # process klines 46 | field_types = binance_client.Kline.get_field_types() 47 | df = pd.DataFrame(data=response, columns=list(field_types.keys())) 48 | # remove ignore columns 49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 50 | # set type of each column that is kept 51 | for i_col in df.columns: 52 | df = df.astype({i_col: field_types[i_col]}) 53 | # set time 54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True) 55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True) 56 | # add symbol column 57 | df["symbol"] = symbol 58 | except Exception as exc: 59 | logger.exception(f"Transformation of data: failed. {exc}") 60 | raise 61 | else: 62 | logger.info("Transformation of data: successful.") 63 | return df 64 | 65 | 66 | @task 67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None: 68 | """Inserts data to timescale.""" 69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME, 70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME} 71 | table_name = timescale_schema_map[contract_type] 72 | try: 73 | conn_id = retrieve_conn_id() 74 | ingest_data(conn_id, table_name, df) 75 | except Exception as exc: 76 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 77 | raise 78 | else: 79 | logger.info(f"Insert data to timescale table {table_name}: successful.") 80 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/infopy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/infopy/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/infopy/dag_infopy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | 7 | from libs.airtasks.initial import start_task, end_task 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_infopy", 13 | description="Show all installed python packages.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - execute pip freeze 20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze') 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> pip_task >> end_dummy 25 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/__init__.py: -------------------------------------------------------------------------------- 1 | from . import venues 2 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/airtasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .initial import start_task, end_task 2 | from . 
import timescale 3 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/airtasks/initial.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.empty import EmptyOperator 2 | from typing import Optional 3 | 4 | 5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 6 | tid = "start" if task_id is None else task_id 7 | return EmptyOperator(task_id=tid, **kwargs) 8 | 9 | 10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 11 | tid = "end" if task_id is None else task_id 12 | return EmptyOperator(task_id=tid, **kwargs) 13 | 14 | 15 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/airtasks/timescale/__init__.py: -------------------------------------------------------------------------------- 1 | from .ingester import ingest_data 2 | from .conn import retrieve_conn_id 3 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/airtasks/timescale/conn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.models import Variable 4 | 5 | # create module logger 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def retrieve_conn_id(id_key: str = "admin") -> str: 10 | """Retrieves timescale connection id.""" 11 | try: 12 | if id_key == "admin": 13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN") 14 | elif id_key == "readonly": 15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY") 16 | else: 17 | raise ValueError("Unknown id_key. Select admin or readonly.") 18 | except Exception as exc: 19 | logger.exception(f"Retrieving admin timescale connection id: failed. {exc}.") 20 | raise 21 | else: 22 | logger.info(f"Retrieving admin timescale connection id: successful.") 23 | return conn_id 24 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/airtasks/timescale/ingester.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from psycopg2.extras import execute_values 5 | from psycopg2.extensions import connection 6 | from airflow.providers.postgres.hooks.postgres import PostgresHook 7 | 8 | # create module logger 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None: 13 | """Bulk insert to timescale.""" 14 | try: 15 | # create a list of tuples from dataframe 16 | data_tuples = [tuple(x) for x in df_data.to_numpy()] 17 | # comma-separated dataframe columns 18 | cols = ','.join(list(df_data.columns)) 19 | # SQL query to execute 20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols) 21 | with conn.cursor() as crs: 22 | execute_values(crs, query, data_tuples) 23 | conn.commit() 24 | except Exception as exc: 25 | logger.exception(f"Bulk insert: failed. 
{exc}.") 26 | raise 27 | else: 28 | logger.info("Bulk insert: successful.") 29 | 30 | 31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None: 32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 33 | _bulk_insert(conn, table_name, df_data) 34 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/__init__.py: -------------------------------------------------------------------------------- 1 | from . import binance 2 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure 2 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/base/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from dataclasses import dataclass, fields 3 | from datetime import datetime 4 | 5 | 6 | class Venue(Enum): 7 | """Crypto venues.""" 8 | binance = "binance" 9 | 10 | 11 | class VenueAuthentication: 12 | """Base class to authenticate at a venue.""" 13 | pass 14 | 15 | 16 | class VenueNet(Enum): 17 | """Production vs test environment.""" 18 | mainnet = "mainnet" 19 | testnet = "testnet" 20 | 21 | 22 | class ContractType(Enum): 23 | """The contract type of traded instrument.""" 24 | spot = "spot" 25 | future = "future" 26 | 27 | 28 | @dataclass 29 | class Instrument: 30 | """The traded instrument.""" 31 | symbol: str 32 | venue: Venue 33 | contract_type: ContractType 34 | first_date: datetime 35 | 36 | 37 | @dataclass 38 | class MarketDataStructure: 39 | """Base class for market data API responses.""" 40 | 41 | @classmethod 42 | def get_field_types(cls) -> dict: 43 | return {field.name: field.type for field in fields(cls)} 44 | 45 | 46 | @dataclass 47 | class RequestResultLimit: 48 | """Default and maximum limit on result of an API market data request.""" 49 | default: int 50 | max: int 51 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/binance/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import BinanceAuth 2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api 3 | from .config import * 4 | from .types import Kline, FundingRate 5 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/binance/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from datetime import datetime 4 | from requests import Response, HTTPError 5 | from typing import Optional, Dict, Any, List 6 | from tenacity import retry, stop_after_attempt, wait_exponential 7 | from time import sleep 8 | 9 | from libs.venues.base.base import ContractType, VenueNet 10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers 11 | import libs.venues.binance.config as binance_config 12 | 13 | # create module logger 14 | logger = logging.getLogger(__name__) 15 | # log messages from requests above level warning 16 | logging.getLogger('urllib3').setLevel(logging.WARNING) 17 | 18 | 
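# Request flow for the fetchers below: _fetch_api_data assembles the query string from
# symbol/start/end/interval/limit, tenacity retries it up to 5 times with exponential
# backoff (capped at 10s), and a 429 (rate limit) response pauses for five minutes
# before re-raising so the retries do not hammer the API and risk an IP ban.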
# module constants 19 | _KLINE_INTERVAL: str = "1m" 20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60 21 | 22 | 23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str: 24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL, 25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL}, 26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL, 27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}} 28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet] 29 | 30 | 31 | def _get_kline_endpoint(contract_type: ContractType) -> str: 32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE, 33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE} 34 | return kline_ep_map[contract_type] 35 | 36 | 37 | def _get_ping_endpoint(contract_type: ContractType) -> str: 38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING, 39 | ContractType.future: binance_config.FUT_ENDPOINT_PING} 40 | return ping_ep_map[contract_type] 41 | 42 | 43 | def _raise_for_status(response: Response) -> None: 44 | try: 45 | response.raise_for_status() 46 | except HTTPError as http_err: 47 | if response.status_code == 429: 48 | logger.exception(f"Binance rate limit was reached. " 49 | f"I need to sleep immediately for a while to avoid any IP ban!") 50 | sleep(5*60) 51 | logger.exception(http_err) 52 | raise 53 | 54 | 55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10)) 56 | def _fetch_api_data(auth: BinanceAuth, 57 | base_url: str, 58 | endpoint: str, 59 | symbol: Optional[str] = None, 60 | start_time: Optional[datetime] = None, 61 | end_time: Optional[datetime] = None, 62 | kline_interval: Optional[str] = None, 63 | request_result_limit: int = None, 64 | request_timeout_in_secs: int = 10) -> Any: 65 | """Market data fetcher for Binance API.""" 66 | request_url: str = f"{base_url}{endpoint}" 67 | headers: dict = prepare_binance_request_headers(auth) 68 | 69 | # build request url, if necessary 70 | if symbol is not None: 71 | request_url += f"?symbol={symbol}" 72 | if start_time is not None: 73 | request_url += f"&startTime={to_ms_int(start_time)}" 74 | if end_time is not None: 75 | request_url += f"&endTime={to_ms_int(end_time)}" 76 | if kline_interval is not None: 77 | request_url += f"&interval={kline_interval}" 78 | if request_result_limit is not None: 79 | request_url += f"&limit={request_result_limit}" 80 | # send get request 81 | response = requests.get(request_url, 82 | headers=headers, 83 | timeout=request_timeout_in_secs) 84 | _raise_for_status(response) 85 | return response.json() 86 | 87 | 88 | def fetch_spot_kline(auth: BinanceAuth, 89 | symbol: str, 90 | start_time: datetime, 91 | end_time: datetime, 92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default, 93 | testnet: bool = False) -> List[list]: 94 | """Fetches spot kline market data from Binance API.""" 95 | return _fetch_api_data(auth=auth, 96 | base_url=_get_base_url(ContractType.spot, testnet), 97 | endpoint=_get_kline_endpoint(ContractType.spot), 98 | symbol=symbol, 99 | start_time=start_time, 100 | end_time=end_time, 101 | request_result_limit=request_result_limit, 102 | kline_interval=_KLINE_INTERVAL) 103 | 104 | 105 | def fetch_future_kline(auth: BinanceAuth, 106 | symbol: str, 107 | start_time: Optional[datetime] = None, 108 | end_time: Optional[datetime] = None, 109 | request_result_limit: int = 
binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default, 110 | testnet: bool = False) -> List[list]: 111 | """Fetches future kline market data from Binance API.""" 112 | return _fetch_api_data(auth=auth, 113 | base_url=_get_base_url(ContractType.future, testnet), 114 | endpoint=_get_kline_endpoint(ContractType.future), 115 | symbol=symbol, 116 | start_time=start_time, 117 | end_time=end_time, 118 | request_result_limit=request_result_limit, 119 | kline_interval=_KLINE_INTERVAL) 120 | 121 | 122 | def fetch_funding_rate(auth: BinanceAuth, 123 | symbol: str, 124 | start_time: Optional[datetime] = None, 125 | end_time: Optional[datetime] = None, 126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default, 127 | testnet: bool = False) -> List[Dict[str, Any]]: 128 | """Fetches funding rate market data from Binance API.""" 129 | return _fetch_api_data(auth=auth, 130 | base_url=_get_base_url(ContractType.future, testnet), 131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING, 132 | symbol=symbol, 133 | start_time=start_time, 134 | end_time=end_time, 135 | request_result_limit=request_result_limit) 136 | 137 | 138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict: 139 | """Tests connectivity to spot Binance API.""" 140 | return _fetch_api_data(auth=auth, 141 | base_url=_get_base_url(ContractType.spot, testnet), 142 | endpoint=binance_config.SPOT_ENDPOINT_PING) 143 | 144 | 145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict: 146 | """Tests connectivity to future Binance API.""" 147 | return _fetch_api_data(auth=auth, 148 | base_url=_get_base_url(ContractType.future, testnet), 149 | endpoint=binance_config.FUT_ENDPOINT_PING) 150 | 151 | 152 | def fetch_spot_exchange_info() -> Dict[str, Any]: 153 | raise NotImplementedError 154 | 155 | 156 | def fetch_fut_exchange_info() -> Dict[str, Any]: 157 | raise NotImplementedError 158 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/binance/common.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | from datetime import datetime, timezone 3 | from typing import Dict, Any 4 | 5 | from libs.venues.base.base import VenueAuthentication 6 | 7 | 8 | @dataclass 9 | class BinanceAuth(VenueAuthentication): 10 | BINANCE_API_KEY: str 11 | 12 | @classmethod 13 | def from_dict(cls, auth_dict: Dict[str, str]): 14 | return cls(auth_dict["BINANCE_API_KEY"]) 15 | 16 | def as_dict(self) -> Dict[str, str]: 17 | return asdict(self) 18 | 19 | 20 | def to_ms_int(dt: datetime) -> int: 21 | """Converts datetime timestamp to integer in ms.""" 22 | return int(round(dt.timestamp() * 1000)) 23 | 24 | 25 | def to_dt(ms_int: int) -> datetime: 26 | """Converts timestamp in ms (integer) to datetime.""" 27 | return datetime.utcfromtimestamp(ms_int / 1000).replace(tzinfo=timezone.utc) 28 | 29 | 30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]: 31 | """Creates headers for Binance REST API.""" 32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY} 33 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/binance/config.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base.base import RequestResultLimit 2 | 3 | 4 | # spot base 5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info 6 | SPOT_MAINNET_URL: str = 
"https://api.binance.com" 7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision" 8 | SPOT_REQUEST_RATE_LIMIT: int = 6000 9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1 10 | 11 | # spot ping 12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity 13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping" 14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 15 | 16 | # spot exchange info 17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information 18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo" 19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20 20 | 21 | # spot kline 22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data 23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines" 24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2 25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000) 26 | 27 | # futures base 28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info 29 | FUT_MAINNET_URL: str = "https://fapi.binance.com" 30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com" 31 | FUT_REQUEST_RATE_LIMIT: int = 2400 32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1 33 | 34 | # future ping 35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity 36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping" 37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 38 | 39 | # future exchangeInfo 40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information 41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo" 42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1 43 | 44 | # future funding rate 45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history 46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate" 47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500 48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5 49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000) 50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption 51 | 52 | # future kline 53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data 54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines" 55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500) 56 | 57 | 58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int: 59 | """Returns the weight conditional on the request result limit.""" 60 | if (request_result_limit >= 1) & (request_result_limit < 100): 61 | weight = 1 62 | elif (request_result_limit >= 100) & (request_result_limit < 500): 63 | weight = 2 64 | elif (request_result_limit >= 500) & (request_result_limit < 1000): 65 | weight = 5 66 | else: 67 | weight = 10 68 | return weight 69 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/libs/venues/binance/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from libs.venues.base.base import MarketDataStructure 5 | 6 | 7 | @dataclass 8 | class Kline(MarketDataStructure): 9 | open_time: int 10 | open: float 11 | high: float 12 | low: float 13 | close: float 14 | volume: float 15 | close_time: int 16 | quote_asset_volume: float 17 | number_of_trades: int 18 | taker_buy_base_asset_volume: float 19 | taker_buy_quote_asset_volume: float 20 | ignored: Any 21 | 22 | 23 | @dataclass 24 | class FundingRate(MarketDataStructure): 25 | symbol: str 26 | time: int 27 | funding_rate: float 28 | ignored: Any 29 | 30 | 
@staticmethod 31 | def get_rename_dict() -> dict: 32 | return {"symbol": "symbol", 33 | "fundingTime": "time", 34 | "fundingRate": "funding_rate", 35 | "markPrice": "ignored"} 36 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/timescale_init/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/dag_timescale_roles.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_roles 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_roles", 13 | description="Timescale initialization pipeline for creating user roles.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create read only user role 20 | roles = create_roles("dags/timescale_init/process/create_roles.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> roles >> end_dummy 25 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/dag_timescale_tables.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_tables 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_tables", 13 | description="Timescale initialization pipeline for creating hypertables.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create hypertables 20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> tables >> end_dummy 25 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .tsinit import create_roles, create_tables 2 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/process/create_hypertables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS binance_kline_spot ( 2 | open_time TIMESTAMPTZ, 3 | symbol TEXT NOT NULL, 4 | open DOUBLE PRECISION, 5 | high DOUBLE PRECISION, 6 | low DOUBLE PRECISION, 7 | close DOUBLE PRECISION, 8 | volume DOUBLE PRECISION, 9 | close_time TIMESTAMPTZ, 10 | quote_asset_volume DOUBLE PRECISION, 11 | number_of_trades BIGINT, 12 | taker_buy_base_asset_volume DOUBLE PRECISION, 13 | taker_buy_quote_asset_volume DOUBLE 
PRECISION 14 | ); 15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE); 16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC); 17 | 18 | CREATE TABLE IF NOT EXISTS binance_kline_future ( 19 | open_time TIMESTAMPTZ, 20 | symbol TEXT NOT NULL, 21 | open DOUBLE PRECISION, 22 | high DOUBLE PRECISION, 23 | low DOUBLE PRECISION, 24 | close DOUBLE PRECISION, 25 | volume DOUBLE PRECISION, 26 | close_time TIMESTAMPTZ, 27 | quote_asset_volume DOUBLE PRECISION, 28 | number_of_trades BIGINT, 29 | taker_buy_base_asset_volume DOUBLE PRECISION, 30 | taker_buy_quote_asset_volume DOUBLE PRECISION 31 | ); 32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE); 33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC); 34 | 35 | 36 | CREATE TABLE IF NOT EXISTS binance_funding_future ( 37 | time TIMESTAMPTZ, 38 | symbol TEXT NOT NULL, 39 | funding_rate DOUBLE PRECISION 40 | ); 41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE); 42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC); 43 | -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/process/create_roles.sql: -------------------------------------------------------------------------------- 1 | CREATE ROLE readaccess; 2 | GRANT USAGE ON SCHEMA public TO readaccess; 3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess; 4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess; 5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD}; 6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME}; -------------------------------------------------------------------------------- /part4/pipecraft/dags/timescale_init/process/tsinit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from psycopg2 import sql 5 | from psycopg2.sql import Composable 6 | from airflow.models import Variable 7 | from airflow.decorators import task 8 | from typing import Union 9 | 10 | from libs.airtasks.timescale import retrieve_conn_id 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _read_sql(path: str) -> str: 17 | """Reads an sql script.""" 18 | try: 19 | with open(path, "r") as sql_script: 20 | sql_cmd_str = sql_script.read() 21 | except Exception as exc: 22 | logger.exception(f"Could not read sql file. {exc}") 23 | raise 24 | else: 25 | logger.info(f"Read sql file successfully.") 26 | return sql_cmd_str 27 | 28 | 29 | def _get_roles_sql(path_str: str) -> Composable: 30 | """Constructs the sql script for creating roles.""" 31 | # read file 32 | sql_cmd_str = _read_sql(path_str) 33 | try: 34 | # replace the placeholders with the Airflow Variables 35 | sql_cmd = sql.SQL(sql_cmd_str).format( 36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")), 37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD")) 38 | ) 39 | # do not log the readonly credentials here: they would appear in plain text in the task logs 40 | logger.debug("Formatted create-roles sql statement with readonly credentials.") 41 | except Exception as exc: 42 | logger.exception(f"Get create roles sql statement: failed. 
{exc}") 43 | raise 44 | else: 45 | logger.info("Get create roles sql statement: successful.") 46 | return sql_cmd 47 | 48 | 49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None: 50 | try: 51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}") 53 | with conn.cursor() as crs: 54 | # execute sql 55 | crs.execute(sql_cmd) 56 | # commit 57 | conn.commit() 58 | except Exception as exc: 59 | logger.exception(f"Executing query: failed. {exc}") 60 | raise 61 | else: 62 | logger.info(f"Executing query: successful.") 63 | 64 | 65 | @task 66 | def create_roles(path_str: str) -> None: 67 | """Creates roles.""" 68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str)) 69 | 70 | 71 | @task 72 | def create_tables(path_str: str) -> None: 73 | """Creates hypertables.""" 74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str)) 75 | 76 | -------------------------------------------------------------------------------- /part4/pipecraft/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/plugins/__init__.py -------------------------------------------------------------------------------- /part4/pipecraft/scripts/entry_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | airflow db migrate 4 | 5 | airflow users create \ 6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \ 7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \ 8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \ 9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \ 10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \ 11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true 12 | 13 | echo "Airflow database initialization completed." 14 | -------------------------------------------------------------------------------- /part4/pipecraft/scripts/gen_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | 4 | def get_fernet_key(): 5 | """Generates a fernet key.""" 6 | return Fernet.generate_key().decode() 7 | 8 | 9 | def main(): 10 | print(get_fernet_key()) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /part4/requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography~=42.0.5 2 | apache-airflow~=2.8.1 3 | apache-airflow-providers-postgres~=5.10.0 4 | numpy~=1.24.4 5 | pandas~=2.0.3 6 | psycopg2-binary~=2.9.7 7 | requests~=2.31.0 8 | tenacity~=8.2.3 -------------------------------------------------------------------------------- /part5/QUICK_START.md: -------------------------------------------------------------------------------- 1 | # Quick Start 2 | 3 | Follow these steps to set up the application using Docker Compose: 4 | 5 | 1. Change directory to `./part5/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key. 6 | 2. Change directory to `./part5/compose/` and create an `.env` file (see template `.env.template`): 7 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1. 
8 | * Set the environment variable `BINANCE_API_KEY` with 9 | your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072). 10 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and 11 | `TIMESCALE_READONLY_PASSWORD`. 12 | 3. Open your terminal. 13 | 4. Create a common network for Traefik and the data-infra services 14 | ``` 15 | docker network create traefik-net 16 | ``` 17 | 5. Start the Traefik service 18 | 19 | ``` 20 | docker compose -f compose.traefik.core.yaml -f compose.traefik.dev.yaml --env-file ./.env up -d 21 | ``` 22 | 6. Initialize Apache Airflow 23 | 24 | ``` 25 | docker compose -f compose.infra.core.yaml -f compose.infra.dev.yaml --env-file ./.env up airflow-init 26 | ``` 27 | 28 | 7. Start the data infrastructure 29 | 30 | ``` 31 | docker compose -f compose.infra.core.yaml -f compose.infra.dev.yaml --env-file ./.env up -d 32 | ``` 33 | 34 | 8. Access the Airflow web interface through a browser at ``airflow.localhost``. Complete the one-time 35 | initialization of Timescale: 36 | - Create a connection to Timescale: Admin → Connections 37 | * Connection Id: timescale_conn_admin 38 | * Connection Type: Postgres 39 | * Host: host.docker.internal 40 | * Database: timescale 41 | * Login: admin 42 | * Password: password 43 | * Port: 5433 44 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles. 45 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables. 46 | 9. Start the Binance data pipelines. 47 | 10. Access the Grafana web interface through a browser at ``grafana.localhost``. 48 | 49 | A detailed guide can be found 50 | here: 51 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627) 52 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387) 53 | -------------------------------------------------------------------------------- /part5/QUICK_START_PROD.md: -------------------------------------------------------------------------------- 1 | # Quick Start PROD 2 | 3 | Preliminary steps: 4 | 5 | * Create a [Hetzner Cloud](https://www.hetzner.com/cloud/) server with the Docker Compose application installed 6 | * Register a domain and link the domain to the server 7 | 8 | Follow these steps to set up the application using Docker Compose: 9 | 10 | 1. Change directory to `./part5/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key. 11 | 2. Change directory to `./part5/compose/` and create a `.env.prod` file (see template `.env.prod.template`): 12 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1. 13 | * Set the environment variable `BINANCE_API_KEY` with 14 | your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072). 15 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and 16 | `TIMESCALE_READONLY_PASSWORD`. 17 | * Set the environment variable `DOMAIN_NAME` to the domain you registered. For example: `DOMAIN_NAME=mydomain.com` 18 | * We use [Let's Encrypt](https://letsencrypt.org/) to get an SSL/TLS certificate. The certificate is used to enable 19 | HTTPS (SSL/TLS) to secure browser-to-server communications. [Let's Encrypt](https://letsencrypt.org/) issues the 20 | certificate automatically. 
We need to provide a valid email using `ACME_EMAIL`, which is used for important 21 | communications related to the certificate that we generated. For example, this email would be used to alert you of 22 | impending certificate expirations. 23 | 3. Create custom docker images for Apache Airflow and Grafana that we push 24 | to [GitHub Container Registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) ( 25 | GHCR): 26 | 1. Create a GitHub access token to push docker images to GHCR. Store it in a file `.ghcr.secret` 27 | at `./part5/compose/`. 28 | 2. Open your terminal, navigate to `./part5/compose/` and log into GitHub Container Registry with your generated 29 | access token 30 | ``` 31 | cat .ghcr.secret | docker login --username <your-github-username> --password-stdin ghcr.io 32 | ``` 33 | 3. Navigate to `./part5/pipecraft/` and execute `pipecraft_build_and_push.sh`. We build the custom docker image 34 | using `pipecraft.Dockerfile` and push it to GHCR. 35 | ``` 36 | ./pipecraft_build_and_push.sh 37 | ``` 38 | 4. Navigate to `./part5/grafana/` and execute `grafana_build_and_push.sh`. We build the custom docker image 39 | using `grafana.Dockerfile` and push it to GHCR. 40 | ``` 41 | ./grafana_build_and_push.sh 42 | ``` 43 | 4. Connect as root to the Hetzner Cloud server and do the following initial steps 44 | 1. Create folders and an `acme.json` file (the file is prepared to store sensitive data securely, such as SSL/TLS 45 | certificates from Let's Encrypt) 46 | ``` 47 | mkdir -p /docker/part5/storage/traefik 48 | mkdir -p /docker/part5/pipecraft/logs 49 | mkdir -p /docker/part5/compose 50 | touch /docker/part5/storage/traefik/acme.json 51 | chmod 600 /docker/part5/storage/traefik/acme.json 52 | ``` 53 | 2. Navigate to `/docker/part5/` and set ownership to the airflow user (important: use here the `AIRFLOW_UID` 54 | from the `.env.prod` 55 | file, default=50000) 56 | ``` 57 | chown -R 50000:50000 pipecraft 58 | ``` 59 | 3. Set read and write access for the pipecraft folder (used by Airflow to write logs) 60 | ``` 61 | chmod -R u+rwX pipecraft 62 | ``` 63 | 4. Copy the local docker compose, `.env.prod`, and `.ghcr.secret` files to the server at `/docker/part5/compose`. 64 | 5. Log in to GitHub Container Registry using your access token 65 | ``` 66 | cat .ghcr.secret | docker login --username <your-github-username> --password-stdin ghcr.io 67 | ``` 68 | 6. Navigate to `/docker/part5/compose` and create a common network 69 | ``` 70 | docker network create traefik-net 71 | ``` 72 | 5. Start the Traefik service 73 | ``` 74 | docker compose -f compose.traefik.core.yaml -f compose.traefik.prod.yaml --env-file ./.env.prod up -d 75 | ``` 76 | 6. Initialize Apache Airflow 77 | 78 | ``` 79 | docker compose -f compose.infra.core.yaml -f compose.infra.prod.yaml --env-file ./.env.prod up airflow-init 80 | ``` 81 | 7. Start the data infrastructure 82 | ``` 83 | docker compose -f compose.infra.core.yaml -f compose.infra.prod.yaml --env-file ./.env.prod up -d 84 | ``` 85 | 8. Access the Airflow web interface through a browser at ``airflow.mydomain.com``. Complete the one-time 86 | initialization of Timescale: 87 | - Create a connection to Timescale: Admin → Connections 88 | * Connection Id: timescale_conn_admin 89 | * Connection Type: Postgres 90 | * Host: mydomain.com 91 | * Database: timescale 92 | * Login: admin 93 | * Password: password 94 | * Port: 5433 95 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles. 
96 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables. 97 | 9. Start the Binance data pipelines. 98 | 10. Access Grafana web interface through a browser at ``grafana.mydomain.com``. 99 | 100 | A detailed guide can be found 101 | here: 102 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627) 103 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387) 104 | -------------------------------------------------------------------------------- /part5/compose/.env.prod.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_FERNET_KEY= 2 | BINANCE_API_KEY= 3 | # needed for setting grafana datasources.yaml correctly 4 | TIMESCALE_PORT=5433 5 | TIMESCALE_DATABASE_NAME=timescale 6 | TIMESCALE_READONLY_USERNAME=user 7 | TIMESCALE_READONLY_PASSWORD=password 8 | # set domain 9 | DOMAIN_NAME= 10 | TRAEFIK_EMAIL= -------------------------------------------------------------------------------- /part5/compose/.env.template: -------------------------------------------------------------------------------- 1 | AIRFLOW_FERNET_KEY= 2 | BINANCE_API_KEY= 3 | # needed for setting grafana datasources.yaml correctly 4 | TIMESCALE_PORT=5433 5 | TIMESCALE_DATABASE_NAME=timescale 6 | TIMESCALE_READONLY_USERNAME=user 7 | TIMESCALE_READONLY_PASSWORD=password -------------------------------------------------------------------------------- /part5/compose/compose.infra.core.yaml: -------------------------------------------------------------------------------- 1 | x-airflow-common: 2 | &airflow-common 3 | image: ghcr.io/bylethquant/sds-pipecraft:latest 4 | environment: 5 | &airflow-common-env 6 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}" 7 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}" 8 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}" 9 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}" 10 | _AIRFLOW_WWW_USER_ROLE: "Admin" 11 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}" 12 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}" 13 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}" 14 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}" 15 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}" 16 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}" 17 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}" 18 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part5}" 19 | AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}" 20 | user: ${AIRFLOW_UID:-50000} 21 | depends_on: 22 | airflow-postgres: 23 | condition: service_healthy 24 | volumes: 25 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs 26 | 27 | 28 | services: 29 | 30 | airflow-webserver: 31 | <<: *airflow-common 32 | container_name: airflow-webserver 33 | command: webserver 34 | 35 | airflow-scheduler: 36 | <<: *airflow-common 37 | container_name: airflow-scheduler 38 | command: scheduler 39 | 40 | airflow-postgres: 41 | container_name: airflow-postgres 42 | image: postgres:13 43 | environment: 44 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}" 45 | 
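      # metadata-database credentials; the defaults mirror the AIRFLOW__DATABASE__SQL_ALCHEMY_CONN entry above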
POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}" 46 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}" 47 | 48 | airflow-init: 49 | <<: *airflow-common 50 | container_name: airflow-init 51 | environment: 52 | <<: *airflow-common-env 53 | _AIRFLOW_DB_UPGRADE: true 54 | restart: no 55 | 56 | timescale: 57 | container_name: timescale 58 | image: timescale/timescaledb:latest-pg15 59 | environment: 60 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}" 61 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}" 62 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}" 63 | 64 | grafana: 65 | container_name: grafana 66 | image: ghcr.io/bylethquant/sds-grafana:latest 67 | environment: 68 | GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}" 69 | GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-password}" 70 | depends_on: 71 | timescale: 72 | condition: service_healthy 73 | 74 | networks: 75 | default: 76 | name: traefik-net 77 | external: true -------------------------------------------------------------------------------- /part5/compose/compose.infra.dev.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-dev 2 | 3 | x-airflow-common-dev: 4 | &airflow-common-dev 5 | image: apache/airflow:2.8.1-python3.11 6 | environment: 7 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 8 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false" 9 | AIRFLOW__CORE__LOAD_EXAMPLES: "false" 10 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true" 11 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG" 12 | volumes: 13 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags 14 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config 15 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins 16 | 17 | services: 18 | 19 | airflow-webserver: 20 | <<: *airflow-common-dev 21 | healthcheck: 22 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ] 23 | interval: 30s 24 | timeout: 10s 25 | retries: 5 26 | start_period: 30s 27 | restart: unless-stopped 28 | labels: 29 | - "traefik.enable=true" 30 | - "traefik.http.routers.airflow-webserver.rule=Host(`airflow.${DOMAIN_NAME:-localhost}`)" 31 | 32 | airflow-scheduler: 33 | <<: *airflow-common-dev 34 | restart: unless-stopped 35 | 36 | airflow-postgres: 37 | ports: 38 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432" 39 | volumes: 40 | - ../.storage/postgres:/var/lib/postgresql/data 41 | healthcheck: 42 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ] 43 | interval: 5s 44 | retries: 2 45 | start_period: 3s 46 | restart: unless-stopped 47 | 48 | airflow-init: 49 | <<: *airflow-common-dev 50 | entrypoint: /opt/airflow/scripts/entry_init.sh 51 | volumes: 52 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts 53 | 54 | timescale: 55 | ports: 56 | - "${TIMESCALE_PORT:-5433}:5432" 57 | volumes: 58 | - ../.storage/timescale:/var/lib/postgresql/data 59 | healthcheck: 60 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ] 61 | interval: 5s 62 | retries: 2 63 | start_period: 3s 64 | restart: unless-stopped 65 | 66 | grafana: 67 | image: grafana/grafana:10.0.2 68 | environment: 69 | GF_DATABASE_SSL_MODE: disable 70 | GF_ENABLE_GZIP: true 71 | env_file: 72 | - .env 73 | volumes: 74 | - ../grafana/dev/provisioning:/etc/grafana/provisioning 75 | - ../grafana/dev/dashboards:/var/lib/grafana/dashboards 76 | restart: unless-stopped 77 | 
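    # Traefik reads the labels below and routes grafana.localhost (grafana.<DOMAIN_NAME> if set) to this container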
labels: 78 | - "traefik.enable=true" 79 | - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN_NAME:-localhost}`)" -------------------------------------------------------------------------------- /part5/compose/compose.infra.prod.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-prod 2 | 3 | services: 4 | 5 | airflow-webserver: 6 | healthcheck: 7 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ] 8 | interval: 60s 9 | timeout: 10s 10 | retries: 5 11 | start_period: 30s 12 | restart: unless-stopped 13 | labels: 14 | - "traefik.enable=true" 15 | - "traefik.http.routers.airflow-webserver.rule=Host(`airflow.${DOMAIN_NAME}`)" 16 | - "traefik.http.routers.airflow-webserver.entrypoints=websecure" 17 | - "traefik.http.routers.airflow-webserver.tls.certresolver=leresolver" 18 | 19 | airflow-scheduler: 20 | restart: unless-stopped 21 | 22 | airflow-postgres: 23 | ports: 24 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432" 25 | volumes: 26 | - ../storage/postgres:/var/lib/postgresql/data 27 | healthcheck: 28 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ] 29 | interval: 60s 30 | retries: 10 31 | start_period: 10s 32 | restart: unless-stopped 33 | 34 | airflow-init: 35 | entrypoint: /opt/airflow/scripts/entry_init.sh 36 | 37 | timescale: 38 | ports: 39 | - "${TIMESCALE_PORT:-5433}:5432" 40 | volumes: 41 | - ../storage/timescale:/var/lib/postgresql/data 42 | healthcheck: 43 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ] 44 | interval: 60s 45 | retries: 10 46 | start_period: 10s 47 | restart: unless-stopped 48 | 49 | grafana: 50 | env_file: 51 | - .env.prod 52 | restart: unless-stopped 53 | labels: 54 | - "traefik.enable=true" 55 | - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN_NAME}`)" 56 | - "traefik.http.routers.grafana.entrypoints=websecure" 57 | - "traefik.http.routers.grafana.tls.certresolver=leresolver" -------------------------------------------------------------------------------- /part5/compose/compose.traefik.core.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | 3 | traefik: 4 | image: traefik:v3.0 5 | container_name: traefik 6 | 7 | networks: 8 | default: 9 | name: traefik-net 10 | external: true -------------------------------------------------------------------------------- /part5/compose/compose.traefik.dev.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-traefik-dev 2 | 3 | services: 4 | 5 | traefik: 6 | command: 7 | - "--api.insecure=true" 8 | - "--providers.docker=true" 9 | - "--log.level=DEBUG" 10 | ports: 11 | - "80:80" 12 | - "${TRAEFIK_PORT:-8080}:8080" # traefik dashboard 13 | volumes: 14 | - /var/run/docker.sock:/var/run/docker.sock:ro -------------------------------------------------------------------------------- /part5/compose/compose.traefik.prod.yaml: -------------------------------------------------------------------------------- 1 | name: data-infra-traefik-prod 2 | 3 | services: 4 | 5 | traefik: 6 | command: 7 | # configure entrypoint 8 | - "--entrypoints.web.address=:80" 9 | - "--entrypoints.websecure.address=:443" 10 | # configure docker 11 | - "--providers.docker" 12 | - "--providers.docker.exposedbydefault=false" 13 | - "--providers.docker.network=traefik-net" 14 | # configure logs 15 | - "--log.level=ERROR" 16 | 
# configure SSL 17 | - "--certificatesresolvers.leresolver.acme.httpchallenge=true" 18 | - "--certificatesresolvers.leresolver.acme.httpchallenge.entrypoint=web" 19 | - "--certificatesresolvers.leresolver.acme.email=${ACME_EMAIL}" 20 | - "--certificatesresolvers.leresolver.acme.storage=/le/acme.json" 21 | # global HTTP -> HTTPS 22 | - "--entrypoints.web.http.redirections.entryPoint.to=websecure" 23 | - "--entrypoints.web.http.redirections.entryPoint.scheme=https" 24 | ports: 25 | - "80:80" 26 | - "443:443" 27 | volumes: 28 | - /var/run/docker.sock:/var/run/docker.sock:ro 29 | - ../storage/traefik/acme.json:/le/acme.json -------------------------------------------------------------------------------- /part5/grafana/dev/provisioning/dashboards/dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'dashboards' 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: true 10 | editable: true 11 | updateIntervalSeconds: 10 12 | allowUiUpdates: false 13 | options: 14 | path: /var/lib/grafana/dashboards -------------------------------------------------------------------------------- /part5/grafana/dev/provisioning/datasources/datasources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: timescale 5 | type: postgres 6 | url: "host.docker.internal:${TIMESCALE_PORT}" 7 | database: "${TIMESCALE_DATABASE_NAME}" 8 | user: "${TIMESCALE_READONLY_USERNAME}" 9 | secureJsonData: 10 | password: "${TIMESCALE_READONLY_PASSWORD}" 11 | jsonData: 12 | postgresVersion: 1500 13 | sslmode: "disable" 14 | timescaledb: true 15 | tlsAuth: false 16 | tlsAuthWithCACert: false 17 | tlsConfigurationMethod: "file-path" 18 | tlsSkipVerify: true -------------------------------------------------------------------------------- /part5/grafana/grafana.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM grafana/grafana:10.0.2 2 | 3 | # set environment variables 4 | ENV GF_DATABASE_SSL_MODE=disable 5 | ENV GF_ENABLE_GZIP=true 6 | 7 | # copy provisioning and dashboards configurations 8 | COPY prod/dashboards /var/lib/grafana/dashboards 9 | COPY prod/provisioning /etc/grafana/provisioning 10 | 11 | # expose port 3000 for Grafana UI 12 | EXPOSE 3000 13 | 14 | # connect docker image to your repo (not required) 15 | # LABEL org.opencontainers.image.source https://github.com/bylethquant/substack-data-infra 16 | 17 | # start grafana 18 | CMD ["grafana-server", "--config", "/etc/grafana/grafana.ini"] 19 | -------------------------------------------------------------------------------- /part5/grafana/grafana_build_and_push.sh: -------------------------------------------------------------------------------- 1 | # define the image name, tag, and dockerfile name 2 | CONTAINER_REGISTRY="ghcr.io/bylethquant/" 3 | IMAGE_NAME="sds-grafana" 4 | TAG="latest" 5 | DOCKERFILE_NAME="grafana.Dockerfile" 6 | 7 | # build the docker image 8 | docker build -t $CONTAINER_REGISTRY$IMAGE_NAME:$TAG -f $DOCKERFILE_NAME . 
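# note: the push below assumes you are already logged in to ghcr.io (see QUICK_START_PROD.md, step 3)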
9 | 10 | # push the docker image to the repository 11 | docker push $CONTAINER_REGISTRY$IMAGE_NAME:$TAG 12 | -------------------------------------------------------------------------------- /part5/grafana/prod/provisioning/dashboards/dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'dashboards' 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: true 10 | editable: true 11 | updateIntervalSeconds: 10 12 | allowUiUpdates: false 13 | options: 14 | path: /var/lib/grafana/dashboards -------------------------------------------------------------------------------- /part5/grafana/prod/provisioning/datasources/datasources.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: timescale 5 | type: postgres 6 | url: "${DOMAIN_NAME}:${TIMESCALE_PORT}" 7 | database: "${TIMESCALE_DATABASE_NAME}" 8 | user: "${TIMESCALE_READONLY_USERNAME}" 9 | secureJsonData: 10 | password: "${TIMESCALE_READONLY_PASSWORD}" 11 | jsonData: 12 | postgresVersion: 1500 13 | sslmode: "disable" 14 | timescaledb: true 15 | tlsAuth: false 16 | tlsAuthWithCACert: false 17 | tlsConfigurationMethod: "file-path" 18 | tlsSkipVerify: true -------------------------------------------------------------------------------- /part5/pipecraft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/config/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/.airflowignore: -------------------------------------------------------------------------------- 1 | libs/ -------------------------------------------------------------------------------- /part5/pipecraft/dags/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/binance_market_data/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .symbols import SPOT, FUTURE 2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS 3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS 4 | -------------------------------------------------------------------------------- 
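The `config` package above bundles everything the DAG factories consume: the cron schedules, the Timescale table names, the retry defaults, and the `SPOT`/`FUTURE` instrument lists defined in `symbols.py` below. Adding a market is therefore a one-line change to the relevant list; a minimal sketch is shown here, where the symbol, the listing date, and the `NEW_SPOT_INSTRUMENT` name are placeholders rather than verified values:

```python
from datetime import datetime, timezone

from libs.venues.base import ContractType, Instrument, Venue

# hypothetical new entry for the SPOT list in symbols.py; the first_date is a placeholder
# and should be the real first trading day on Binance, because it becomes the start_date
# of the generated DAG
NEW_SPOT_INSTRUMENT = Instrument("DOTUSDT", Venue.binance, ContractType.spot,
                                 datetime(2020, 8, 19, 0, tzinfo=timezone.utc))
```

Because `dag_binance_kline.py` and `dag_binance_funding_rate.py` iterate over these lists at parse time, an entry appended to `SPOT` or `FUTURE` automatically gets its own kline (and, for futures, funding-rate) DAG without any further change.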
/part5/pipecraft/dags/binance_market_data/config/funding.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *" 4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future" 5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 6 | "retries": 2} 7 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/config/kline.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *" 4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot" 5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future" 6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1), 7 | "retries": 2} 8 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/config/symbols.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base import Instrument, Venue, ContractType 2 | from datetime import datetime, timezone 3 | 4 | SPOT = [ 5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)), 6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)), 7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)), 8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)), 10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)), 11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)), 13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)), 14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)), 15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)), 16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)), 17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)), 18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)), 19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc)) 20 | ] 21 | 22 | FUTURE = [ 23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)), 24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)), 25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)), 26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)), 27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)), 28 | Instrument("ETHUSDT", Venue.binance, 
ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)), 29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)), 30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)), 31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)), 32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)), 33 | Instrument("LTCUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)), 34 | Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)), 35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)), 36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)), 37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc)) 38 | ] 39 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def generate_binance_funding_rate_dag(dag_id: str, 17 | instrument: Instrument, 18 | schedule_interval: str, 19 | catchup: bool = False, 20 | testnet: bool = False) -> DAG: 21 | """Generates a DAG for binance funding rate data pipeline.""" 22 | with DAG(dag_id=dag_id, 23 | description="Data ingestion pipeline for Binance funding rates.", 24 | start_date=instrument.first_date, 25 | catchup=catchup, 26 | schedule_interval=schedule_interval, 27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag: 28 | # task flow 29 | start_dummy = start_task() 30 | binance_keys = retrieve_binance_secrets() 31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet) 33 | transform = etl_funding_tasks.transform_data(extract) 34 | ingest = etl_funding_tasks.insert_data(transform) 35 | end_dummy = end_task() 36 | 37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 38 | 39 | return dag 40 | 41 | 42 | # create DAGs for funding rates 43 | for instr in dag_config.FUTURE: 44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}" 45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id, 46 | instrument=instr, 47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP) 48 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/dag_binance_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow import DAG 4 | 5 | import binance_market_data.process.etl_kline as 
etl_kline_tasks 6 | import binance_market_data.config as dag_config 7 | from libs.airtasks.initial import start_task, end_task 8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity 9 | from libs.venues.base import Instrument 10 | 11 | # create module logger 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def generate_binance_candlestick_dag(dag_id: str, 16 | instrument: Instrument, 17 | schedule_interval: str, 18 | catchup: bool = False, 19 | testnet: bool = False) -> DAG: 20 | """Generates a DAG for binance candlestick data pipeline.""" 21 | with DAG(dag_id=dag_id, 22 | description="Data ingestion pipeline for Binance candlestick data.", 23 | start_date=instrument.first_date, 24 | catchup=catchup, 25 | schedule_interval=schedule_interval, 26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag: 27 | # task flow 28 | # - create start task 29 | start_dummy = start_task() 30 | # - retrieve binance api keys 31 | binance_keys = retrieve_binance_secrets() 32 | # - test connectivity of binance api 33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type) 34 | # - fetch binance candlestick data 35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet) 36 | # - transform data 37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol) 38 | # - insert data to timescale database 39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform) 40 | # - create end task 41 | end_dummy = end_task() 42 | 43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy 44 | 45 | return dag 46 | 47 | 48 | # create DAGs for kline 49 | for instr in dag_config.SPOT + dag_config.FUTURE: 50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}" 51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id, 52 | instrument=instr, 53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE) 54 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/process/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/binance_market_data/process/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/process/common.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from typing import Dict, Any 4 | from airflow.models import Variable 5 | from airflow.decorators import task 6 | 7 | from libs.venues import binance as binance_client 8 | from libs.venues.base import ContractType 9 | 10 | # module logger 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | @task 15 | def retrieve_binance_secrets() -> Dict[str, Any]: 16 | """Retrieves Binance API keys.""" 17 | try: 18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY")) 19 | except Exception as exc: 20 | logger.exception(f"Retrieving Binance keys failed. 
Msg: {exc}.") 21 | raise 22 | else: 23 | logger.info(f"Retrieving Binance keys was successful.") 24 | return binance_keys.as_dict() 25 | 26 | 27 | @task 28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None: 29 | """Tests connectivity to the Rest API.""" 30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api, 31 | ContractType.future: binance_client.ping_future_api} 32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet) 33 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/process/etl_funding_future.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, Dict, Any, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME 11 | 12 | 13 | # module logger 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | @task 18 | def fetch_data(auth: dict, 19 | symbol: str, 20 | testnet: bool = False, 21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]: 22 | """Fetches funding rate data.""" 23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 24 | start_time = datetime(data_interval_start.year, 25 | data_interval_start.month, 26 | data_interval_start.day, 27 | data_interval_start.hour) 28 | end_time = start_time + timedelta(days=1) 29 | # fetch funding rate data 30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth), 31 | symbol=symbol, 32 | start_time=start_time, 33 | end_time=end_time, 34 | testnet=testnet) 35 | return response 36 | 37 | 38 | @task 39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame: 40 | """Transforms funding rate response from API. """ 41 | try: 42 | # process funding rate 43 | field_types = binance_client.FundingRate.get_field_types() 44 | df = pd.DataFrame(data=response) 45 | # re-name columns 46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict()) 47 | # remove ignore columns 48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 49 | # set type of each column that is kept 50 | for i_col in df.columns: 51 | df = df.astype({i_col: field_types[i_col]}) 52 | # timestamp 53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True) 54 | except Exception as exc: 55 | logger.exception(f"Transformation of data: failed. {exc}") 56 | raise 57 | else: 58 | logger.info("Transformation of data: successful.") 59 | return df 60 | 61 | 62 | @task 63 | def insert_data(df: pd.DataFrame) -> None: 64 | """Inserts funding rate data to timescale.""" 65 | try: 66 | conn_id = retrieve_conn_id() 67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df) 68 | except Exception as exc: 69 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 70 | raise 71 | else: 72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.") 73 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/binance_market_data/process/etl_kline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from airflow.decorators import task 5 | from datetime import datetime, timedelta 6 | from typing import Optional, List 7 | 8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id 9 | from libs.venues import binance as binance_client 10 | from libs.venues.base import ContractType, Instrument 11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME 12 | 13 | 14 | # module logger 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | @task 19 | def fetch_data(auth: dict, 20 | instrument: Instrument, 21 | testnet: bool = False, 22 | data_interval_start: Optional[datetime] = None) -> List[list]: 23 | """Sends get request to fetch candlestick data for the previous hour.""" 24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline, 25 | ContractType.future: binance_client.fetch_future_kline} 26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time! 27 | start_time = datetime(data_interval_start.year, 28 | data_interval_start.month, 29 | data_interval_start.day, 30 | data_interval_start.hour) 31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1) 32 | # fetch candlestick data 33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth), 34 | symbol=instrument.symbol, 35 | start_time=start_time, 36 | end_time=end_time, 37 | testnet=testnet) 38 | return response 39 | 40 | 41 | @task 42 | def transform_data(response: list, symbol: str) -> pd.DataFrame: 43 | """Transforms the data and prepares to insert.""" 44 | try: 45 | # process klines 46 | field_types = binance_client.Kline.get_field_types() 47 | df = pd.DataFrame(data=response, columns=list(field_types.keys())) 48 | # remove ignore columns 49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1) 50 | # set type of each column that is kept 51 | for i_col in df.columns: 52 | df = df.astype({i_col: field_types[i_col]}) 53 | # set time 54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True) 55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True) 56 | # add symbol column 57 | df["symbol"] = symbol 58 | except Exception as exc: 59 | logger.exception(f"Transformation of data: failed. {exc}") 60 | raise 61 | else: 62 | logger.info("Transformation of data: successful.") 63 | return df 64 | 65 | 66 | @task 67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None: 68 | """Inserts data to timescale.""" 69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME, 70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME} 71 | table_name = timescale_schema_map[contract_type] 72 | try: 73 | conn_id = retrieve_conn_id() 74 | ingest_data(conn_id, table_name, df) 75 | except Exception as exc: 76 | logger.exception(f"Insert data to timescale: failed. 
{exc}") 77 | raise 78 | else: 79 | logger.info(f"Insert data to timescale table {table_name}: successful.") 80 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/infopy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/infopy/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/infopy/dag_infopy.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | from airflow.operators.bash import BashOperator 6 | 7 | from libs.airtasks.initial import start_task, end_task 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_infopy", 13 | description="Show all installed python packages.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - execute pip freeze 20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze') 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> pip_task >> end_dummy 25 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/__init__.py: -------------------------------------------------------------------------------- 1 | from . import venues 2 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/airtasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .initial import start_task, end_task 2 | from . 
import timescale 3 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/airtasks/initial.py: -------------------------------------------------------------------------------- 1 | from airflow.operators.empty import EmptyOperator 2 | from typing import Optional 3 | 4 | 5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 6 | tid = "start" if task_id is None else task_id 7 | return EmptyOperator(task_id=tid, **kwargs) 8 | 9 | 10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator: 11 | tid = "end" if task_id is None else task_id 12 | return EmptyOperator(task_id=tid, **kwargs) 13 | 14 | 15 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/airtasks/timescale/__init__.py: -------------------------------------------------------------------------------- 1 | from .ingester import ingest_data 2 | from .conn import retrieve_conn_id 3 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/airtasks/timescale/conn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.models import Variable 4 | 5 | # create module logger 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def retrieve_conn_id(id_key: str = "admin") -> str: 10 | """Retrieves timescale connection id.""" 11 | try: 12 | if id_key == "admin": 13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN") 14 | elif id_key == "readonly": 15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY") 16 | else: 17 | raise ValueError("Unknown id_key. Select admin or readonly.") 18 | except Exception as exc: 19 | logger.exception(f"Retrieving timescale connection id ({id_key}): failed. {exc}.") 20 | raise 21 | else: 22 | logger.info(f"Retrieving timescale connection id ({id_key}): successful.") 23 | return conn_id 24 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/airtasks/timescale/ingester.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | 4 | from psycopg2.extras import execute_values 5 | from psycopg2.extensions import connection 6 | from airflow.providers.postgres.hooks.postgres import PostgresHook 7 | 8 | # create module logger 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None: 13 | """Bulk insert to timescale.""" 14 | try: 15 | # create a list of tuples from dataframe 16 | data_tuples = [tuple(x) for x in df_data.to_numpy()] 17 | # comma-separated dataframe columns 18 | cols = ','.join(list(df_data.columns)) 19 | # SQL query to execute 20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols) 21 | with conn.cursor() as crs: 22 | execute_values(crs, query, data_tuples) 23 | conn.commit() 24 | except Exception as exc: 25 | logger.exception(f"Bulk insert: failed. 
{exc}.") 26 | raise 27 | else: 28 | logger.info("Bulk insert: successful.") 29 | 30 | 31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None: 32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 33 | _bulk_insert(conn, table_name, df_data) 34 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/__init__.py: -------------------------------------------------------------------------------- 1 | from . import binance 2 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure 2 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/base/base.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from dataclasses import dataclass, fields 3 | from datetime import datetime 4 | 5 | 6 | class Venue(Enum): 7 | """Crypto venues.""" 8 | binance = "binance" 9 | 10 | 11 | class VenueAuthentication: 12 | """Base class to authenticate at a venue.""" 13 | pass 14 | 15 | 16 | class VenueNet(Enum): 17 | """Production vs test environment.""" 18 | mainnet = "mainnet" 19 | testnet = "testnet" 20 | 21 | 22 | class ContractType(Enum): 23 | """The contract type of traded instrument.""" 24 | spot = "spot" 25 | future = "future" 26 | 27 | 28 | @dataclass 29 | class Instrument: 30 | """The traded instrument.""" 31 | symbol: str 32 | venue: Venue 33 | contract_type: ContractType 34 | first_date: datetime 35 | 36 | 37 | @dataclass 38 | class MarketDataStructure: 39 | """Base class for market data API responses.""" 40 | 41 | @classmethod 42 | def get_field_types(cls) -> dict: 43 | return {field.name: field.type for field in fields(cls)} 44 | 45 | 46 | @dataclass 47 | class RequestResultLimit: 48 | """Default and maximum limit on result of an API market data request.""" 49 | default: int 50 | max: int 51 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/binance/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import BinanceAuth 2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api 3 | from .config import * 4 | from .types import Kline, FundingRate 5 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/binance/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import logging 3 | from datetime import datetime 4 | from requests import Response, HTTPError 5 | from typing import Optional, Dict, Any, List 6 | from tenacity import retry, stop_after_attempt, wait_exponential 7 | from time import sleep 8 | 9 | from libs.venues.base.base import ContractType, VenueNet 10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers 11 | import libs.venues.binance.config as binance_config 12 | 13 | # create module logger 14 | logger = logging.getLogger(__name__) 15 | # log messages from requests above level warning 16 | logging.getLogger('urllib3').setLevel(logging.WARNING) 17 | 18 | 
# module constants 19 | _KLINE_INTERVAL: str = "1m" 20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60 21 | 22 | 23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str: 24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL, 25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL}, 26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL, 27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}} 28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet] 29 | 30 | 31 | def _get_kline_endpoint(contract_type: ContractType) -> str: 32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE, 33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE} 34 | return kline_ep_map[contract_type] 35 | 36 | 37 | def _get_ping_endpoint(contract_type: ContractType) -> str: 38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING, 39 | ContractType.future: binance_config.FUT_ENDPOINT_PING} 40 | return ping_ep_map[contract_type] 41 | 42 | 43 | def _raise_for_status(response: Response) -> None: 44 | try: 45 | response.raise_for_status() 46 | except HTTPError as http_err: 47 | if response.status_code == 429: 48 | logger.exception("Binance rate limit reached (HTTP 429). " 49 | "Sleeping before the next retry to avoid an IP ban.") 50 | sleep(_RATE_LIMIT_SLEEPER_IN_SECS) 51 | logger.exception(http_err) 52 | raise 53 | 54 | 55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10)) 56 | def _fetch_api_data(auth: BinanceAuth, 57 | base_url: str, 58 | endpoint: str, 59 | symbol: Optional[str] = None, 60 | start_time: Optional[datetime] = None, 61 | end_time: Optional[datetime] = None, 62 | kline_interval: Optional[str] = None, 63 | request_result_limit: Optional[int] = None, 64 | request_timeout_in_secs: int = 10) -> Any: 65 | """Market data fetcher for Binance API.""" 66 | request_url: str = f"{base_url}{endpoint}" 67 | headers: dict = prepare_binance_request_headers(auth) 68 | 69 | # build request url, if necessary 70 | if symbol is not None: 71 | request_url += f"?symbol={symbol}" 72 | if start_time is not None: 73 | request_url += f"&startTime={to_ms_int(start_time)}" 74 | if end_time is not None: 75 | request_url += f"&endTime={to_ms_int(end_time)}" 76 | if kline_interval is not None: 77 | request_url += f"&interval={kline_interval}" 78 | if request_result_limit is not None: 79 | request_url += f"&limit={request_result_limit}" 80 | # send get request 81 | response = requests.get(request_url, 82 | headers=headers, 83 | timeout=request_timeout_in_secs) 84 | _raise_for_status(response) 85 | return response.json() 86 | 87 | 88 | def fetch_spot_kline(auth: BinanceAuth, 89 | symbol: str, 90 | start_time: datetime, 91 | end_time: datetime, 92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default, 93 | testnet: bool = False) -> List[list]: 94 | """Fetches spot kline market data from Binance API.""" 95 | return _fetch_api_data(auth=auth, 96 | base_url=_get_base_url(ContractType.spot, testnet), 97 | endpoint=_get_kline_endpoint(ContractType.spot), 98 | symbol=symbol, 99 | start_time=start_time, 100 | end_time=end_time, 101 | request_result_limit=request_result_limit, 102 | kline_interval=_KLINE_INTERVAL) 103 | 104 | 105 | def fetch_future_kline(auth: BinanceAuth, 106 | symbol: str, 107 | start_time: Optional[datetime] = None, 108 | end_time: Optional[datetime] = None, 109 | request_result_limit: int = 
binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default, 110 | testnet: bool = False) -> List[list]: 111 | """Fetches future kline market data from Binance API.""" 112 | return _fetch_api_data(auth=auth, 113 | base_url=_get_base_url(ContractType.future, testnet), 114 | endpoint=_get_kline_endpoint(ContractType.future), 115 | symbol=symbol, 116 | start_time=start_time, 117 | end_time=end_time, 118 | request_result_limit=request_result_limit, 119 | kline_interval=_KLINE_INTERVAL) 120 | 121 | 122 | def fetch_funding_rate(auth: BinanceAuth, 123 | symbol: str, 124 | start_time: Optional[datetime] = None, 125 | end_time: Optional[datetime] = None, 126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default, 127 | testnet: bool = False) -> List[Dict[str, Any]]: 128 | """Fetches funding rate market data from Binance API.""" 129 | return _fetch_api_data(auth=auth, 130 | base_url=_get_base_url(ContractType.future, testnet), 131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING, 132 | symbol=symbol, 133 | start_time=start_time, 134 | end_time=end_time, 135 | request_result_limit=request_result_limit) 136 | 137 | 138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict: 139 | """Tests connectivity to spot Binance API.""" 140 | return _fetch_api_data(auth=auth, 141 | base_url=_get_base_url(ContractType.spot, testnet), 142 | endpoint=binance_config.SPOT_ENDPOINT_PING) 143 | 144 | 145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict: 146 | """Tests connectivity to future Binance API.""" 147 | return _fetch_api_data(auth=auth, 148 | base_url=_get_base_url(ContractType.future, testnet), 149 | endpoint=binance_config.FUT_ENDPOINT_PING) 150 | 151 | 152 | def fetch_spot_exchange_info() -> Dict[str, Any]: 153 | raise NotImplementedError 154 | 155 | 156 | def fetch_fut_exchange_info() -> Dict[str, Any]: 157 | raise NotImplementedError 158 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/binance/common.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | from datetime import datetime, timezone 3 | from typing import Dict, Any 4 | 5 | from libs.venues.base.base import VenueAuthentication 6 | 7 | 8 | @dataclass 9 | class BinanceAuth(VenueAuthentication): 10 | BINANCE_API_KEY: str 11 | 12 | @classmethod 13 | def from_dict(cls, auth_dict: Dict[str, str]): 14 | return cls(auth_dict["BINANCE_API_KEY"]) 15 | 16 | def as_dict(self) -> Dict[str, str]: 17 | return asdict(self) 18 | 19 | 20 | def to_ms_int(dt: datetime) -> int: 21 | """Converts datetime timestamp to integer in ms.""" 22 | return int(round(dt.timestamp() * 1000)) 23 | 24 | 25 | def to_dt(ms_int: int) -> datetime: 26 | """Converts timestamp in ms (integer) to datetime.""" 27 | return datetime.utcfromtimestamp(ms_int / 1000).replace(tzinfo=timezone.utc) 28 | 29 | 30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]: 31 | """Creates headers for Binance REST API.""" 32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY} 33 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/binance/config.py: -------------------------------------------------------------------------------- 1 | from libs.venues.base.base import RequestResultLimit 2 | 3 | 4 | # spot base 5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info 6 | SPOT_MAINNET_URL: str = 
"https://api.binance.com" 7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision" 8 | SPOT_REQUEST_RATE_LIMIT: int = 6000 9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1 10 | 11 | # spot ping 12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity 13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping" 14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 15 | 16 | # spot exchange info 17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information 18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo" 19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20 20 | 21 | # spot kline 22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data 23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines" 24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2 25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000) 26 | 27 | # futures base 28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info 29 | FUT_MAINNET_URL: str = "https://fapi.binance.com" 30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com" 31 | FUT_REQUEST_RATE_LIMIT: int = 2400 32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1 33 | 34 | # future ping 35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity 36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping" 37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1 38 | 39 | # future exchangeInfo 40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information 41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo" 42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1 43 | 44 | # future funding rate 45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history 46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate" 47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500 48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5 49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000) 50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption 51 | 52 | # future kline 53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data 54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines" 55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500) 56 | 57 | 58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int: 59 | """Returns the weight conditional on the request result limit.""" 60 | if (request_result_limit >= 1) & (request_result_limit < 100): 61 | weight = 1 62 | elif (request_result_limit >= 100) & (request_result_limit < 500): 63 | weight = 2 64 | elif (request_result_limit >= 500) & (request_result_limit < 1000): 65 | weight = 5 66 | else: 67 | weight = 10 68 | return weight 69 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/libs/venues/binance/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from libs.venues.base.base import MarketDataStructure 5 | 6 | 7 | @dataclass 8 | class Kline(MarketDataStructure): 9 | open_time: int 10 | open: float 11 | high: float 12 | low: float 13 | close: float 14 | volume: float 15 | close_time: int 16 | quote_asset_volume: float 17 | number_of_trades: int 18 | taker_buy_base_asset_volume: float 19 | taker_buy_quote_asset_volume: float 20 | ignored: Any 21 | 22 | 23 | @dataclass 24 | class FundingRate(MarketDataStructure): 25 | symbol: str 26 | time: int 27 | funding_rate: float 28 | ignored: Any 29 | 30 | 
@staticmethod 31 | def get_rename_dict() -> dict: 32 | return {"symbol": "symbol", 33 | "fundingTime": "time", 34 | "fundingRate": "funding_rate", 35 | "markPrice": "ignored"} 36 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/timescale_init/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/dag_timescale_roles.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_roles 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_roles", 13 | description="Timescale initialization pipeline for creating user roles.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create read only user role 20 | roles = create_roles("dags/timescale_init/process/create_roles.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> roles >> end_dummy 25 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/dag_timescale_tables.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timezone 4 | from airflow import DAG 5 | 6 | from libs.airtasks.initial import start_task, end_task 7 | from timescale_init.process import create_tables 8 | 9 | # create module logger 10 | logger = logging.getLogger(__name__) 11 | 12 | with DAG(dag_id=f"0_timescale_create_tables", 13 | description="Timescale initialization pipeline for creating hypertables.", 14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc), 15 | catchup=False, 16 | schedule_interval=None) as dag: 17 | # - create start task 18 | start_dummy = start_task() 19 | # - create hypertables 20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql") 21 | # - create end task 22 | end_dummy = end_task() 23 | 24 | start_dummy >> tables >> end_dummy 25 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .tsinit import create_roles, create_tables 2 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/process/create_hypertables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS binance_kline_spot ( 2 | open_time TIMESTAMPTZ, 3 | symbol TEXT NOT NULL, 4 | open DOUBLE PRECISION, 5 | high DOUBLE PRECISION, 6 | low DOUBLE PRECISION, 7 | close DOUBLE PRECISION, 8 | volume DOUBLE PRECISION, 9 | close_time TIMESTAMPTZ, 10 | quote_asset_volume DOUBLE PRECISION, 11 | number_of_trades BIGINT, 12 | taker_buy_base_asset_volume DOUBLE PRECISION, 13 | taker_buy_quote_asset_volume DOUBLE 
PRECISION 14 | ); 15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE); 16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC); 17 | 18 | CREATE TABLE IF NOT EXISTS binance_kline_future ( 19 | open_time TIMESTAMPTZ, 20 | symbol TEXT NOT NULL, 21 | open DOUBLE PRECISION, 22 | high DOUBLE PRECISION, 23 | low DOUBLE PRECISION, 24 | close DOUBLE PRECISION, 25 | volume DOUBLE PRECISION, 26 | close_time TIMESTAMPTZ, 27 | quote_asset_volume DOUBLE PRECISION, 28 | number_of_trades BIGINT, 29 | taker_buy_base_asset_volume DOUBLE PRECISION, 30 | taker_buy_quote_asset_volume DOUBLE PRECISION 31 | ); 32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE); 33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC); 34 | 35 | 36 | CREATE TABLE IF NOT EXISTS binance_funding_future ( 37 | time TIMESTAMPTZ, 38 | symbol TEXT NOT NULL, 39 | funding_rate DOUBLE PRECISION 40 | ); 41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE); 42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC); 43 | -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/process/create_roles.sql: -------------------------------------------------------------------------------- 1 | CREATE ROLE readaccess; 2 | GRANT USAGE ON SCHEMA public TO readaccess; 3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess; 4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess; 5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD}; 6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME}; -------------------------------------------------------------------------------- /part5/pipecraft/dags/timescale_init/process/tsinit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from airflow.providers.postgres.hooks.postgres import PostgresHook 4 | from psycopg2 import sql 5 | from psycopg2.sql import Composable 6 | from airflow.models import Variable 7 | from airflow.decorators import task 8 | from typing import Union 9 | 10 | from libs.airtasks.timescale import retrieve_conn_id 11 | 12 | # create module logger 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def _read_sql(path: str) -> str: 17 | """Reads an sql script.""" 18 | try: 19 | with open(path, "r") as sql_script: 20 | sql_cmd_str = sql_script.read() 21 | except Exception as exc: 22 | logger.exception(f"Could not read sql file. {exc}") 23 | raise 24 | else: 25 | logger.info(f"Read sql file successfully.") 26 | return sql_cmd_str 27 | 28 | 29 | def _get_roles_sql(path_str: str) -> Composable: 30 | """Constructs the sql script for creating roles.""" 31 | # read file 32 | sql_cmd_str = _read_sql(path_str) 33 | try: 34 | # replace dummy variables with environmental variables 35 | sql_cmd = sql.SQL(sql_cmd_str).format( 36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")), 37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD")) 38 | ) 39 | # note: never log TIMESCALE_READONLY_PASSWORD here, as it would write the secret in plaintext to the task logs 40 | 41 | except Exception as exc: 42 | logger.exception(f"Get create roles sql statement: failed. 
{exc}") 43 | raise 44 | else: 45 | logger.info("Get create roles sql statement: successful.") 46 | return sql_cmd 47 | 48 | 49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None: 50 | try: 51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn: 52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}") 53 | with conn.cursor() as crs: 54 | # execute sql 55 | crs.execute(sql_cmd) 56 | # commit 57 | conn.commit() 58 | except Exception as exc: 59 | logger.exception(f"Executing query: failed. {exc}") 60 | raise 61 | else: 62 | logger.info(f"Executing query: successful.") 63 | 64 | 65 | @task 66 | def create_roles(path_str: str) -> None: 67 | """Creates roles.""" 68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str)) 69 | 70 | 71 | @task 72 | def create_tables(path_str: str) -> None: 73 | """Creates hypertables.""" 74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str)) 75 | 76 | -------------------------------------------------------------------------------- /part5/pipecraft/pipecraft.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.8.1-python3.11 2 | 3 | # install additional requirements (if needed) 4 | # COPY requirements.txt / 5 | # RUN pip install --no-cache-dir -r /requirements.txt 6 | 7 | # set environment variables for Airflow 8 | ENV AIRFLOW__CORE__EXECUTOR=LocalExecutor 9 | ENV AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS=false 10 | ENV AIRFLOW__CORE__LOAD_EXAMPLES=false 11 | ENV AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true 12 | ENV AIRFLOW__LOGGING__LOGGING_LEVEL=INFO 13 | 14 | # copy DAGs and other configurations into the image 15 | COPY ./dags /opt/airflow/dags 16 | COPY ./config /opt/airflow/config 17 | COPY ./plugins /opt/airflow/plugins 18 | COPY ./scripts /opt/airflow/scripts 19 | 20 | # connect docker image to your repo (not required) 21 | # LABEL org.opencontainers.image.source https://github.com/bylethquant/substack-data-infra 22 | 23 | # expose port 8080 for the Airflow UI 24 | EXPOSE 8080 25 | -------------------------------------------------------------------------------- /part5/pipecraft/pipecraft_build_and_push.sh: -------------------------------------------------------------------------------- 1 | # define the image name, tag, and dockerfile name 2 | CONTAINER_REGISTRY="ghcr.io/bylethquant/" 3 | IMAGE_NAME="sds-pipecraft" 4 | TAG="latest" 5 | DOCKERFILE_NAME="pipecraft.Dockerfile" 6 | 7 | # build the docker image 8 | docker build -t $CONTAINER_REGISTRY$IMAGE_NAME:$TAG -f $DOCKERFILE_NAME . 
9 | 10 | # push the docker image to the repository 11 | docker push $CONTAINER_REGISTRY$IMAGE_NAME:$TAG 12 | -------------------------------------------------------------------------------- /part5/pipecraft/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/plugins/__init__.py -------------------------------------------------------------------------------- /part5/pipecraft/scripts/entry_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | airflow db migrate 4 | 5 | airflow users create \ 6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \ 7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \ 8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \ 9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \ 10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \ 11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true 12 | 13 | echo "Airflow database initialization completed." 14 | -------------------------------------------------------------------------------- /part5/pipecraft/scripts/gen_fernet_key.py: -------------------------------------------------------------------------------- 1 | from cryptography.fernet import Fernet 2 | 3 | 4 | def get_fernet_key(): 5 | """Generates a fernet key.""" 6 | return Fernet.generate_key().decode() 7 | 8 | 9 | def main(): 10 | print(get_fernet_key()) 11 | 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /part5/requirements.txt: -------------------------------------------------------------------------------- 1 | cryptography~=42.0.5 2 | apache-airflow~=2.8.1 3 | apache-airflow-providers-postgres~=5.10.0 4 | numpy~=1.24.4 5 | pandas~=2.0.3 6 | psycopg2-binary~=2.9.7 7 | requests~=2.31.0 8 | tenacity~=8.2.3 --------------------------------------------------------------------------------