├── .github
│   └── x-data-infra.webp
├── .gitignore
├── README.md
├── part1
│   ├── QUICK_START.md
│   ├── compose
│   │   └── docker-compose.yaml
│   ├── main.py
│   ├── requirements.txt
│   └── schema.sql
├── part2
│   ├── QUICK_START.md
│   ├── compose
│   │   ├── .env.template
│   │   └── docker-compose.yaml
│   ├── pipecraft
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── __init__.py
│   │   ├── dags
│   │   │   ├── .airflowignore
│   │   │   ├── __init__.py
│   │   │   ├── infopy
│   │   │   │   ├── __init__.py
│   │   │   │   └── dag_infopy.py
│   │   │   └── libs
│   │   │       ├── __init__.py
│   │   │       └── airtasks
│   │   │           ├── __init__.py
│   │   │           └── initial.py
│   │   ├── plugins
│   │   │   └── __init__.py
│   │   └── scripts
│   │       ├── entry_init.sh
│   │       └── gen_fernet_key.py
│   └── requirements.txt
├── part3
│   ├── QUICK_START.md
│   ├── compose
│   │   ├── .env.template
│   │   └── docker-compose.yaml
│   ├── pipecraft
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── __init__.py
│   │   ├── dags
│   │   │   ├── .airflowignore
│   │   │   ├── __init__.py
│   │   │   ├── binance_market_data
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── funding.py
│   │   │   │   │   ├── kline.py
│   │   │   │   │   └── symbols.py
│   │   │   │   ├── dag_binance_funding_rate.py
│   │   │   │   ├── dag_binance_kline.py
│   │   │   │   └── process
│   │   │   │       ├── __init__.py
│   │   │   │       ├── common.py
│   │   │   │       ├── etl_funding_future.py
│   │   │   │       └── etl_kline.py
│   │   │   ├── infopy
│   │   │   │   ├── __init__.py
│   │   │   │   └── dag_infopy.py
│   │   │   ├── libs
│   │   │   │   ├── __init__.py
│   │   │   │   ├── airtasks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── initial.py
│   │   │   │   │   └── timescale
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── conn.py
│   │   │   │   │       └── ingester.py
│   │   │   │   └── venues
│   │   │   │       ├── __init__.py
│   │   │   │       ├── base
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── base.py
│   │   │   │       └── binance
│   │   │   │           ├── __init__.py
│   │   │   │           ├── client.py
│   │   │   │           ├── common.py
│   │   │   │           ├── config.py
│   │   │   │           └── types.py
│   │   │   └── timescale_init
│   │   │       ├── __init__.py
│   │   │       ├── dag_timescale_roles.py
│   │   │       ├── dag_timescale_tables.py
│   │   │       └── process
│   │   │           ├── __init__.py
│   │   │           ├── create_hypertables.sql
│   │   │           ├── create_roles.sql
│   │   │           └── tsinit.py
│   │   ├── plugins
│   │   │   └── __init__.py
│   │   └── scripts
│   │       ├── entry_init.sh
│   │       └── gen_fernet_key.py
│   └── requirements.txt
├── part4
│   ├── QUICK_START.md
│   ├── compose
│   │   ├── .env.template
│   │   └── docker-compose.yaml
│   ├── grafana
│   │   ├── dashboards
│   │   │   └── MarketMonitor.json
│   │   └── provisioning
│   │       ├── dashboards
│   │       │   └── dashboards.yaml
│   │       └── datasources
│   │           └── datasources.yaml
│   ├── pipecraft
│   │   ├── __init__.py
│   │   ├── config
│   │   │   └── __init__.py
│   │   ├── dags
│   │   │   ├── .airflowignore
│   │   │   ├── __init__.py
│   │   │   ├── binance_market_data
│   │   │   │   ├── __init__.py
│   │   │   │   ├── config
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── funding.py
│   │   │   │   │   ├── kline.py
│   │   │   │   │   └── symbols.py
│   │   │   │   ├── dag_binance_funding_rate.py
│   │   │   │   ├── dag_binance_kline.py
│   │   │   │   └── process
│   │   │   │       ├── __init__.py
│   │   │   │       ├── common.py
│   │   │   │       ├── etl_funding_future.py
│   │   │   │       └── etl_kline.py
│   │   │   ├── infopy
│   │   │   │   ├── __init__.py
│   │   │   │   └── dag_infopy.py
│   │   │   ├── libs
│   │   │   │   ├── __init__.py
│   │   │   │   ├── airtasks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── initial.py
│   │   │   │   │   └── timescale
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── conn.py
│   │   │   │   │       └── ingester.py
│   │   │   │   └── venues
│   │   │   │       ├── __init__.py
│   │   │   │       ├── base
│   │   │   │       │   ├── __init__.py
│   │   │   │       │   └── base.py
│   │   │   │       └── binance
│   │   │   │           ├── __init__.py
│   │   │   │           ├── client.py
│   │   │   │           ├── common.py
│   │   │   │           ├── config.py
│   │   │   │           └── types.py
│   │   │   └── timescale_init
│   │   │       ├── __init__.py
│   │   │       ├── dag_timescale_roles.py
│   │   │       ├── dag_timescale_tables.py
│   │   │       └── process
│   │   │           ├── __init__.py
│   │   │           ├── create_hypertables.sql
│   │   │           ├── create_roles.sql
│   │   │           └── tsinit.py
│   │   ├── plugins
│   │   │   └── __init__.py
│   │   └── scripts
│   │       ├── entry_init.sh
│   │       └── gen_fernet_key.py
│   └── requirements.txt
└── part5
    ├── QUICK_START.md
    ├── QUICK_START_PROD.md
    ├── compose
    │   ├── .env.prod.template
    │   ├── .env.template
    │   ├── compose.infra.core.yaml
    │   ├── compose.infra.dev.yaml
    │   ├── compose.infra.prod.yaml
    │   ├── compose.traefik.core.yaml
    │   ├── compose.traefik.dev.yaml
    │   └── compose.traefik.prod.yaml
    ├── grafana
    │   ├── dev
    │   │   ├── dashboards
    │   │   │   └── MarketMonitor.json
    │   │   └── provisioning
    │   │       ├── dashboards
    │   │       │   └── dashboards.yaml
    │   │       └── datasources
    │   │           └── datasources.yaml
    │   ├── grafana.Dockerfile
    │   ├── grafana_build_and_push.sh
    │   └── prod
    │       ├── dashboards
    │       │   └── MarketMonitor.json
    │       └── provisioning
    │           ├── dashboards
    │           │   └── dashboards.yaml
    │           └── datasources
    │               └── datasources.yaml
    ├── pipecraft
    │   ├── __init__.py
    │   ├── config
    │   │   └── __init__.py
    │   ├── dags
    │   │   ├── .airflowignore
    │   │   ├── __init__.py
    │   │   ├── binance_market_data
    │   │   │   ├── __init__.py
    │   │   │   ├── config
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── funding.py
    │   │   │   │   ├── kline.py
    │   │   │   │   └── symbols.py
    │   │   │   ├── dag_binance_funding_rate.py
    │   │   │   ├── dag_binance_kline.py
    │   │   │   └── process
    │   │   │       ├── __init__.py
    │   │   │       ├── common.py
    │   │   │       ├── etl_funding_future.py
    │   │   │       └── etl_kline.py
    │   │   ├── infopy
    │   │   │   ├── __init__.py
    │   │   │   └── dag_infopy.py
    │   │   ├── libs
    │   │   │   ├── __init__.py
    │   │   │   ├── airtasks
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── initial.py
    │   │   │   │   └── timescale
    │   │   │   │       ├── __init__.py
    │   │   │   │       ├── conn.py
    │   │   │   │       └── ingester.py
    │   │   │   └── venues
    │   │   │       ├── __init__.py
    │   │   │       ├── base
    │   │   │       │   ├── __init__.py
    │   │   │       │   └── base.py
    │   │   │       └── binance
    │   │   │           ├── __init__.py
    │   │   │           ├── client.py
    │   │   │           ├── common.py
    │   │   │           ├── config.py
    │   │   │           └── types.py
    │   │   └── timescale_init
    │   │       ├── __init__.py
    │   │       ├── dag_timescale_roles.py
    │   │       ├── dag_timescale_tables.py
    │   │       └── process
    │   │           ├── __init__.py
    │   │           ├── create_hypertables.sql
    │   │           ├── create_roles.sql
    │   │           └── tsinit.py
    │   ├── pipecraft.Dockerfile
    │   ├── pipecraft_build_and_push.sh
    │   ├── plugins
    │   │   └── __init__.py
    │   └── scripts
    │       ├── entry_init.sh
    │       └── gen_fernet_key.py
    └── requirements.txt
/.github/x-data-infra.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/.github/x-data-infra.webp
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # docker
156 | .storage/
157 | storage/
158 | *.secret
159 | .env.prod
160 | # airflow
161 | part2/pipecraft/logs/
162 | part3/pipecraft/logs/
163 | part4/pipecraft/logs/
164 | part5/pipecraft/logs/
165 |
166 | # PyCharm
167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169 | # and can be added to the global gitignore or merged into this file. For a more nuclear
170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171 | #.idea/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Practical Guide to a Simple Data Stack
2 |
3 | This GitHub repository hosts the source code for my infrastructure
4 | series, "A Practical Guide to a Simple Data Stack" (SDS). It expands
5 | upon the [X articles](https://x.com/bylethquant/articles) and provides a step-by-step guide to building the data infrastructure depicted in the figure
6 | below.
7 |
8 | **Articles**
9 | * [SDS #0: A Practical Guide to a Simple Data Stack](https://x.com/bylethquant/status/1826891957249212691)
10 | * [SDS #1: Docker, Docker Compose and Setting Up Timescale](https://x.com/bylethquant/status/1828041355131859198)
11 | * [SDS #2: Setting up Apache Airflow and Creating Pipelines](https://x.com/bylethquant/status/1830558712899228012)
12 | * [SDS #3: Robust Crypto Data Pipelines with Apache Airflow](https://x.com/bylethquant/status/1831899712749699506)
13 | * [SDS #4: Crypto Market Data Dashboard with Grafana](https://x.com/bylethquant/status/1833141733305295348)
14 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627)
15 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387)
16 |
17 | ## Introduction
18 |
19 | The series is inspired by recent discussions in my crypto quant circle about leveraging modern applications like
20 | [Grafana](https://grafana.com) for a simple data infrastructure. But what is the easiest way to gain
21 | exposure and
22 | utilize these tools? Today, [Docker](https://www.docker.com) stands out as an excellent tool for experimenting
23 | with these applications. For
24 | instance, [Grafana](https://grafana.com) can be run locally with just a few lines of code. This series aims to equip
25 | everyone with the
26 | knowledge necessary to deploy tools such as
27 | [Grafana](https://grafana.com), [Timescale](https://www.timescale.com), [Apache Airflow](https://airflow.apache.org),
28 | and [Traefik](https://traefik.io/traefik).
29 |
30 | ## Overview
31 |
32 | Let's start with the outcome: in the articles above, I provide a step-by-step guide to building the data stack
33 | illustrated in the figure below:
34 |
35 | 
36 | ![Simple Data Stack overview](.github/x-data-infra.webp)
37 | 
38 | 
39 | *Orchestration* is managed using Apache Airflow, which facilitates the implementation and management of data pipelines.
40 | As a practical example, I will develop Extract-Transform-Load (ETL) pipelines to process 1-minute candlestick data,
41 | including spot and future price data as well as funding rates from Binance. *Data Storage* will be handled by
42 | Timescale. *Visualization* will be provided by Grafana, with Docker as the main hosting tool and Traefik serving as
43 | the reverse proxy.
44 |
45 | Following local development, I will showcase how to deploy this infrastructure on a cloud service
46 | provider. A domain will be registered through [Porkbun](https://porkbun.com/), with DNS records set up to enable access
47 | to Docker containers via
48 | subdomains such as airflow.mydomain.com and grafana.mydomain.com.
49 |
50 | Additionally, while Binance data ingestion pipelines serve as the primary example, the infrastructure is designed with
51 | flexibility in mind. It can accommodate pipelines for processing log files or any other data sources.
52 |
53 | In what follows, I aim to keep everything straightforward so that anyone can adopt elements of this data stack
54 | for their own data infrastructure.
55 |
56 | ## Who Should Read This Series
57 |
58 | This series is for those who are looking to take their first steps in developing their own data infrastructure. It is for individuals who want to:
59 | * Host these tools locally via Docker to experiment with them.
60 | * Deploy them to the cloud.
61 |
62 |
63 | ## Tools
64 |
65 | * Docker Desktop 4.27.2
66 | * Python 3.11.5
67 | * Airflow 2.8.1
68 | * Grafana 10.0.2
69 | * Timescale pg15
70 | * Traefik 3.0
71 |
--------------------------------------------------------------------------------
/part1/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 | Follow these steps to set up the application using Docker Compose:
3 | 1. Open your terminal.
4 | 2. Change to the directory ``./part1/compose`` with the ``docker-compose.yaml`` file.
5 | 3. Start the Timescale database in detached mode by executing ``docker compose up -d``.
6 | 4. Run ``main.py`` to ingest some mock data into Timescale.
7 |
8 | A detailed guide can be found here:
9 | [SDS #1: Docker, Docker Compose and Setting Up Timescale](https://x.com/bylethquant/status/1828041355131859198).
--------------------------------------------------------------------------------
/part1/compose/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-part1
2 |
3 | services:
4 | timescale:
5 | container_name: timescale
6 | image: timescale/timescaledb:latest-pg15
7 | environment:
8 | POSTGRES_DB: timescale
9 | POSTGRES_USER: user
10 | POSTGRES_PASSWORD: password
11 | ports:
12 | - "5432:5432"
13 | volumes:
14 | - ../.storage/timescale:/var/lib/postgresql/data
15 | healthcheck:
16 | test: [ "CMD", "pg_isready", "-q", "-d", "timescale", "-U", "user" ]
17 | interval: 5s
18 | timeout: 5s
19 | retries: 5
--------------------------------------------------------------------------------
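A quick way to confirm the container above is up and that the TimescaleDB extension is loaded, before moving on to `main.py` below. This is a minimal sketch, assuming the stack runs locally with the default credentials and port mapping from this compose file:

```python
import psycopg2

# credentials and port match the defaults in part1/compose/docker-compose.yaml
with psycopg2.connect(dbname="timescale", user="user", password="password",
                      host="localhost", port=5432) as conn:
    with conn.cursor() as cur:
        # the timescale/timescaledb image ships with the extension preinstalled
        cur.execute("SELECT extversion FROM pg_extension WHERE extname = 'timescaledb';")
        print(cur.fetchone())
```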
/part1/main.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 | import random
3 |
4 | from psycopg2.extras import execute_values
5 | from dataclasses import dataclass, asdict, astuple, fields
6 | from datetime import datetime, timedelta
7 | from typing import List
8 |
9 |
10 | @dataclass
11 | class TimescaleConfig:
12 | database: str
13 | host: str
14 | user: str
15 | password: str
16 | port: int
17 |
18 |
19 | class Event:
20 | pass
21 |
22 |
23 | @dataclass
24 | class PriceUpdated(Event):
25 | time: datetime
26 | close: float
27 |
28 |
29 | def insert(events: List[PriceUpdated], timescale_config: TimescaleConfig, table_name: str) -> None:
30 | """Inserts a price update event to timescale database."""
31 | data_tpl = [astuple(event) for event in events]
32 | col_name = ','.join([field.name for field in fields(PriceUpdated)])
33 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, col_name)
34 | with psycopg2.connect(**asdict(timescale_config)) as conn:
35 | with conn.cursor() as cursor:
36 | execute_values(cursor, query, data_tpl)
37 | conn.commit()
38 |
39 |
40 | def create_hypertable(timescale_config: TimescaleConfig, sql_file_path: str = "schema.sql") -> None:
41 | """Creates timescale schema."""
42 | with psycopg2.connect(**asdict(timescale_config)) as conn:
43 | with conn.cursor() as cursor:
44 | with open(sql_file_path, 'r') as sql_file:
45 | cursor.execute(sql_file.read())
46 | conn.commit()
47 |
48 |
49 | def read(timescale_config: TimescaleConfig, table_name: str) -> List[tuple]:
50 | """Reads price update events from timescale database."""
51 | with psycopg2.connect(**asdict(timescale_config)) as conn:
52 | with conn.cursor() as cursor:
53 | cursor.execute(f"SELECT * FROM {table_name}")
54 | data = cursor.fetchall()
55 |
56 | return data
57 |
58 |
59 | def get_mock_data(num: int) -> List[PriceUpdated]:
60 | """Gets some mock data."""
61 | mock_events = [PriceUpdated(time=datetime.utcnow() - timedelta(minutes=i),
62 | close=random.randint(0, 1)) for i in range(num)]
63 |
64 | return mock_events
65 |
66 |
67 | def main():
68 | mock_events = get_mock_data(5)
69 | ts_config = TimescaleConfig("timescale", "localhost", "user", "password", 5432)
70 | create_hypertable(ts_config)
71 | insert(events=mock_events, timescale_config=ts_config, table_name="price")
72 | print(read(ts_config, table_name="price"))
73 |
74 |
75 | if __name__ == "__main__":
76 | main()
77 |
--------------------------------------------------------------------------------
/part1/requirements.txt:
--------------------------------------------------------------------------------
1 | psycopg2-binary~=2.9.7
--------------------------------------------------------------------------------
/part1/schema.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS price CASCADE;
2 | CREATE TABLE price (
3 | time TIMESTAMPTZ,
4 | close DOUBLE PRECISION
5 | );
6 | SELECT create_hypertable('price', 'time');
--------------------------------------------------------------------------------
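Because `price` is created as a hypertable, the mock rows inserted by `main.py` can be aggregated with TimescaleDB's `time_bucket()` function. A hedged read-back sketch, assuming the same local credentials as above:

```python
import psycopg2

# hourly averages over the mock close prices in the price hypertable
QUERY = """
    SELECT time_bucket('1 hour', time) AS bucket, avg(close) AS avg_close
    FROM price
    GROUP BY bucket
    ORDER BY bucket;
"""

with psycopg2.connect(dbname="timescale", user="user", password="password",
                      host="localhost", port=5432) as conn:
    with conn.cursor() as cur:
        cur.execute(QUERY)
        for bucket, avg_close in cur.fetchall():
            print(bucket, avg_close)
```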
/part2/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 | Follow these steps to set up the application using Docker Compose:
3 | 1. Change directory to `./part2/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`.
4 | 2. Change directory to `./part2/compose/` and create a `.env` file (see `.env.template`):
5 | * Set the environment variable `AIRFLOW_FERNET_KEY` to the fernet key created in step 1.
6 | 3. Open your terminal.
7 | 4. Change to the directory ``./part2/compose`` with the ``docker-compose.yaml`` file.
8 | 5. Initialize Apache Airflow by executing ``docker compose up airflow-init``.
9 | 6. Start the data infrastructure in detached mode by executing ``docker compose up -d``.
10 | 7. Access the Airflow web interface in a browser at ``localhost:8080``. Complete the one-time
11 | initialization of Timescale:
12 | - Create a connection to Timescale: Admin → Connections
13 | * Connection Id: timescale_conn_admin
14 | * Connection Type: Postgres
15 | * Host: host.docker.internal
16 | * Database: timescale
17 | * Login: admin
18 | * Password: password
19 | * Port: 5433
20 |
21 | A detailed guide can be found here: [SDS #2: Setting up Apache Airflow and Creating Pipelines](https://x.com/bylethquant/status/1830558712899228012).
--------------------------------------------------------------------------------
/part2/compose/.env.template:
--------------------------------------------------------------------------------
1 | AIRFLOW_FERNET_KEY=
--------------------------------------------------------------------------------
/part2/compose/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-part2
2 |
3 | x-airflow-common:
4 | &airflow-common
5 | image: apache/airflow:2.8.1-python3.11
6 | environment:
7 | &airflow-common-env
8 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}"
9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}"
10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}"
11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}"
12 | _AIRFLOW_WWW_USER_ROLE: "Admin"
13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}"
14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}"
15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}"
16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}"
17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}"
18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}"
19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}"
20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part2}"
21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor
22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false"
23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false"
24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG"
26 | user: ${AIRFLOW_UID:-50000}
27 | depends_on:
28 | airflow-postgres:
29 | condition: service_healthy
30 | volumes:
31 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs
32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags
33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config
34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins
35 |
36 | services:
37 |
38 | airflow-webserver:
39 | <<: *airflow-common
40 | container_name: airflow-webserver
41 | command: webserver
42 | ports:
43 | - "${AIRFLOW_WWW_PORT:-8080}:8080"
44 | restart: always
45 |
46 | airflow-scheduler:
47 | <<: *airflow-common
48 | container_name: airflow-scheduler
49 | command: scheduler
50 | restart: always
51 |
52 | airflow-postgres:
53 | container_name: airflow-postgres
54 | image: postgres:13
55 | environment:
56 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}"
57 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}"
58 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}"
59 | ports:
60 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432"
61 | volumes:
62 | - ../.storage/postgres:/var/lib/postgresql/data
63 | healthcheck:
64 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ]
65 | interval: 5s
66 | retries: 2
67 | start_period: 3s
68 | restart: unless-stopped
69 |
70 | airflow-init:
71 | <<: *airflow-common
72 | container_name: airflow-init
73 | environment:
74 | <<: *airflow-common-env
75 |       _AIRFLOW_DB_UPGRADE: "true"
76 |     restart: "no"
77 | entrypoint: /opt/airflow/scripts/entry_init.sh
78 | volumes:
79 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts
80 |
81 | timescale:
82 | container_name: timescale
83 | image: timescale/timescaledb:latest-pg15
84 | environment:
85 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}"
86 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}"
87 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}"
88 | ports:
89 | - "${TIMESCALE_PORT:-5433}:5432"
90 | volumes:
91 | - ../.storage/timescale:/var/lib/postgresql/data
92 | healthcheck:
93 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ]
94 | interval: 5s
95 | retries: 2
96 | start_period: 3s
97 | restart: unless-stopped
98 |
--------------------------------------------------------------------------------
/part2/pipecraft/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/config/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/dags/.airflowignore:
--------------------------------------------------------------------------------
1 | libs/
--------------------------------------------------------------------------------
/part2/pipecraft/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/dags/infopy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/infopy/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/dags/infopy/dag_infopy.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | from libs.airtasks.initial import start_task, end_task
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_infopy",
13 | description="Show all installed python packages.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - execute pip freeze
20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze')
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> pip_task >> end_dummy
25 |
--------------------------------------------------------------------------------
/part2/pipecraft/dags/libs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/dags/libs/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/dags/libs/airtasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .initial import start_task, end_task
2 |
--------------------------------------------------------------------------------
/part2/pipecraft/dags/libs/airtasks/initial.py:
--------------------------------------------------------------------------------
1 | from airflow.operators.empty import EmptyOperator
2 | from typing import Optional
3 |
4 |
5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
6 | tid = "start" if task_id is None else task_id
7 | return EmptyOperator(task_id=tid, **kwargs)
8 |
9 |
10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
11 | tid = "end" if task_id is None else task_id
12 | return EmptyOperator(task_id=tid, **kwargs)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/part2/pipecraft/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part2/pipecraft/plugins/__init__.py
--------------------------------------------------------------------------------
/part2/pipecraft/scripts/entry_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | airflow db migrate
4 |
5 | airflow users create \
6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \
7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \
8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \
9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \
10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \
11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true
12 |
13 | echo "Airflow database initialization completed."
14 |
--------------------------------------------------------------------------------
/part2/pipecraft/scripts/gen_fernet_key.py:
--------------------------------------------------------------------------------
1 | from cryptography.fernet import Fernet
2 |
3 |
4 | def get_fernet_key():
5 | """Generates a fernet key."""
6 | return Fernet.generate_key().decode()
7 |
8 |
9 | def main():
10 | print(get_fernet_key())
11 |
12 |
13 | if __name__ == "__main__":
14 | main()
15 |
--------------------------------------------------------------------------------
/part2/requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography~=42.0.5
2 | apache-airflow~=2.8.1
--------------------------------------------------------------------------------
/part3/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 | Follow these steps to set up the application using Docker Compose:
3 | 1. Change directory to `./part3/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key.
4 | 2. Change directory to `./part3/compose/` and create a `.env` file (see `.env.template`):
5 | * Set the environment variable `AIRFLOW_FERNET_KEY` to the fernet key created in step 1.
6 | * Set the environment variable `BINANCE_API_KEY` with your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072).
7 | 3. Open your terminal.
8 | 4. Change to the directory ``./part3/compose`` with the ``docker-compose.yaml`` file.
9 | 5. Initialize Apache Airflow by executing ``docker compose up airflow-init``.
10 | 6. Start the data infrastructure in detached mode by executing ``docker compose up -d``.
11 | 7. Access the Airflow web interface in a browser at ``localhost:8080``. Complete the one-time
12 | initialization of Timescale:
13 | - Create a connection to Timescale: Admin → Connections
14 | * Connection Id: timescale_conn_admin
15 | * Connection Type: Postgres
16 | * Host: host.docker.internal
17 | * Database: timescale
18 | * Login: admin
19 | * Password: password
20 | * Port: 5433
21 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles.
22 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables.
23 | 8. Start the Binance data pipelines.
24 |
25 | A detailed guide can be found here: [SDS #3: Robust Crypto Data Pipelines with Apache Airflow](https://x.com/bylethquant/status/1831899712749699506).
--------------------------------------------------------------------------------
/part3/compose/.env.template:
--------------------------------------------------------------------------------
1 | AIRFLOW_FERNET_KEY=
2 | BINANCE_API_KEY=
--------------------------------------------------------------------------------
/part3/compose/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-part3
2 |
3 | x-airflow-common:
4 | &airflow-common
5 | image: apache/airflow:2.8.1-python3.11
6 | environment:
7 | &airflow-common-env
8 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}"
9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}"
10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}"
11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}"
12 | _AIRFLOW_WWW_USER_ROLE: "Admin"
13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}"
14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}"
15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}"
16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}"
17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}"
18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}"
19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}"
20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part3}"
21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor
22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false"
23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false"
24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG"
26 | AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}"
27 | user: ${AIRFLOW_UID:-50000}
28 | depends_on:
29 | airflow-postgres:
30 | condition: service_healthy
31 | volumes:
32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs
33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags
34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config
35 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins
36 |
37 | services:
38 |
39 | airflow-webserver:
40 | <<: *airflow-common
41 | container_name: airflow-webserver
42 | command: webserver
43 | ports:
44 | - "${AIRFLOW_WWW_PORT:-8080}:8080"
45 | restart: always
46 |
47 | airflow-scheduler:
48 | <<: *airflow-common
49 | container_name: airflow-scheduler
50 | command: scheduler
51 | restart: always
52 |
53 | airflow-postgres:
54 | container_name: airflow-postgres
55 | image: postgres:13
56 | environment:
57 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}"
58 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}"
59 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}"
60 | ports:
61 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432"
62 | volumes:
63 | - ../.storage/postgres:/var/lib/postgresql/data
64 | healthcheck:
65 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ]
66 | interval: 5s
67 | retries: 2
68 | start_period: 3s
69 | restart: unless-stopped
70 |
71 | airflow-init:
72 | <<: *airflow-common
73 | container_name: airflow-init
74 | environment:
75 | <<: *airflow-common-env
76 |       _AIRFLOW_DB_UPGRADE: "true"
77 |     restart: "no"
78 | entrypoint: /opt/airflow/scripts/entry_init.sh
79 | volumes:
80 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts
81 |
82 | timescale:
83 | container_name: timescale
84 | image: timescale/timescaledb:latest-pg15
85 | environment:
86 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}"
87 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}"
88 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}"
89 | ports:
90 | - "${TIMESCALE_PORT:-5433}:5432"
91 | volumes:
92 | - ../.storage/timescale:/var/lib/postgresql/data
93 | healthcheck:
94 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ]
95 | interval: 5s
96 | retries: 2
97 | start_period: 3s
98 | restart: unless-stopped
99 |
--------------------------------------------------------------------------------
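The `AIRFLOW_VAR_*` entries above are how the DAG code receives its settings: Airflow resolves `Variable.get("NAME")` against an environment variable named `AIRFLOW_VAR_NAME` inside the container, so none of these Variables have to be created in the UI. A short illustration of the lookups the part3 DAGs rely on:

```python
from airflow.models import Variable

# resolved from AIRFLOW_VAR_BINANCE_API_KEY (populated via BINANCE_API_KEY in .env)
api_key = Variable.get("BINANCE_API_KEY")

# resolved from AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN ("timescale_conn_admin" by default)
conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN")
```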
/part3/pipecraft/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/config/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/.airflowignore:
--------------------------------------------------------------------------------
1 | libs/
--------------------------------------------------------------------------------
/part3/pipecraft/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/binance_market_data/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import SPOT, FUTURE
2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS
3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS
4 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/config/funding.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *"  # runs daily at 00:05 UTC
4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future"
5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
6 | "retries": 2}
7 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/config/kline.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *"  # runs hourly at minute 5
4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot"
5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future"
6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
7 | "retries": 2}
8 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/config/symbols.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base import Instrument, Venue, ContractType
2 | from datetime import datetime, timezone
3 |
4 | SPOT = [
5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)),
6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)),
7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)),
8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)),
10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)),
13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)),
14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)),
15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)),
16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)),
17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)),
18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)),
19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc))
20 | ]
21 |
22 | FUTURE = [
23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)),
24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)),
25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)),
26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)),
27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)),
28 | Instrument("ETHUSDT", Venue.binance, ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)),
29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)),
31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)),
32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)),
33 | Instrument("LTCUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)),
34 | Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)),
35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)),
36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)),
37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc))
38 | ]
39 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def generate_binance_funding_rate_dag(dag_id: str,
17 | instrument: Instrument,
18 | schedule_interval: str,
19 | catchup: bool = False,
20 | testnet: bool = False) -> DAG:
21 | """Generates a DAG for binance funding rate data pipeline."""
22 | with DAG(dag_id=dag_id,
23 | description="Data ingestion pipeline for Binance funding rates.",
24 | start_date=instrument.first_date,
25 | catchup=catchup,
26 | schedule_interval=schedule_interval,
27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag:
28 | # task flow
29 | start_dummy = start_task()
30 | binance_keys = retrieve_binance_secrets()
31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet)
33 | transform = etl_funding_tasks.transform_data(extract)
34 | ingest = etl_funding_tasks.insert_data(transform)
35 | end_dummy = end_task()
36 |
37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
38 |
39 | return dag
40 |
41 |
42 | # create DAGs for funding rates
43 | for instr in dag_config.FUTURE:
44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}"
45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id,
46 | instrument=instr,
47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP)
48 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/dag_binance_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_kline as etl_kline_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 | # create module logger
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def generate_binance_candlestick_dag(dag_id: str,
16 | instrument: Instrument,
17 | schedule_interval: str,
18 | catchup: bool = False,
19 | testnet: bool = False) -> DAG:
20 | """Generates a DAG for binance candlestick data pipeline."""
21 | with DAG(dag_id=dag_id,
22 | description="Data ingestion pipeline for Binance candlestick data.",
23 | start_date=instrument.first_date,
24 | catchup=catchup,
25 | schedule_interval=schedule_interval,
26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag:
27 | # task flow
28 | # - create start task
29 | start_dummy = start_task()
30 | # - retrieve binance api keys
31 | binance_keys = retrieve_binance_secrets()
32 | # - test connectivity of binance api
33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
34 | # - fetch binance candlestick data
35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet)
36 | # - transform data
37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol)
38 | # - insert data to timescale database
39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform)
40 | # - create end task
41 | end_dummy = end_task()
42 |
43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
44 |
45 | return dag
46 |
47 |
48 | # create DAGs for kline
49 | for instr in dag_config.SPOT + dag_config.FUTURE:
50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}"
51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id,
52 | instrument=instr,
53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE)
54 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/process/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/binance_market_data/process/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/process/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from typing import Dict, Any
4 | from airflow.models import Variable
5 | from airflow.decorators import task
6 |
7 | from libs.venues import binance as binance_client
8 | from libs.venues.base import ContractType
9 |
10 | # module logger
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | @task
15 | def retrieve_binance_secrets() -> Dict[str, Any]:
16 | """Retrieves Binance API keys."""
17 | try:
18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY"))
19 | except Exception as exc:
20 | logger.exception(f"Retrieving Binance keys failed. Msg: {exc}.")
21 | raise
22 | else:
23 |         logger.info("Retrieving Binance keys was successful.")
24 | return binance_keys.as_dict()
25 |
26 |
27 | @task
28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None:
29 | """Tests connectivity to the Rest API."""
30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api,
31 | ContractType.future: binance_client.ping_future_api}
32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet)
33 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/binance_market_data/process/etl_funding_future.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, Dict, Any, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME
11 |
12 |
13 | # module logger
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | @task
18 | def fetch_data(auth: dict,
19 | symbol: str,
20 | testnet: bool = False,
21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]:
22 | """Fetches funding rate data."""
23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
24 | start_time = datetime(data_interval_start.year,
25 | data_interval_start.month,
26 | data_interval_start.day,
27 | data_interval_start.hour)
28 | end_time = start_time + timedelta(days=1)
29 | # fetch funding rate data
30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth),
31 | symbol=symbol,
32 | start_time=start_time,
33 | end_time=end_time,
34 | testnet=testnet)
35 | return response
36 |
37 |
38 | @task
39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame:
40 | """Transforms funding rate response from API. """
41 | try:
42 | # process funding rate
43 | field_types = binance_client.FundingRate.get_field_types()
44 | df = pd.DataFrame(data=response)
45 | # re-name columns
46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict())
47 | # remove ignore columns
48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
49 | # set type of each column that is kept
50 | for i_col in df.columns:
51 | df = df.astype({i_col: field_types[i_col]})
52 | # timestamp
53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True)
54 | except Exception as exc:
55 | logger.exception(f"Transformation of data: failed. {exc}")
56 | raise
57 | else:
58 | logger.info("Transformation of data: successful.")
59 | return df
60 |
61 |
62 | @task
63 | def insert_data(df: pd.DataFrame) -> None:
64 | """Inserts funding rate data to timescale."""
65 | try:
66 | conn_id = retrieve_conn_id()
67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df)
68 | except Exception as exc:
69 | logger.exception(f"Insert data to timescale: failed. {exc}")
70 | raise
71 | else:
72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.")
73 |
--------------------------------------------------------------------------------
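For orientation, a worked example of the daily window that `fetch_data` derives from the `data_interval_start` Airflow injects into the task; the value below is purely illustrative, chosen to match the "5 0 * * *" schedule:

```python
from datetime import datetime, timedelta, timezone

# example value Airflow might pass for a run of the daily schedule
data_interval_start = datetime(2024, 1, 1, 0, 5, tzinfo=timezone.utc)

# same truncation as in fetch_data: floor to the hour, then span one day
start_time = datetime(data_interval_start.year, data_interval_start.month,
                      data_interval_start.day, data_interval_start.hour)
end_time = start_time + timedelta(days=1)
print(start_time, end_time)  # 2024-01-01 00:00:00 2024-01-02 00:00:00
```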
/part3/pipecraft/dags/binance_market_data/process/etl_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from libs.venues.base import ContractType, Instrument
11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME
12 |
13 |
14 | # module logger
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @task
19 | def fetch_data(auth: dict,
20 | instrument: Instrument,
21 | testnet: bool = False,
22 | data_interval_start: Optional[datetime] = None) -> List[list]:
23 | """Sends get request to fetch candlestick data for the previous hour."""
24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline,
25 | ContractType.future: binance_client.fetch_future_kline}
26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
27 | start_time = datetime(data_interval_start.year,
28 | data_interval_start.month,
29 | data_interval_start.day,
30 | data_interval_start.hour)
31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1)
32 | # fetch candlestick data
33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth),
34 | symbol=instrument.symbol,
35 | start_time=start_time,
36 | end_time=end_time,
37 | testnet=testnet)
38 | return response
39 |
40 |
41 | @task
42 | def transform_data(response: list, symbol: str) -> pd.DataFrame:
43 | """Transforms the data and prepares to insert."""
44 | try:
45 | # process klines
46 | field_types = binance_client.Kline.get_field_types()
47 | df = pd.DataFrame(data=response, columns=list(field_types.keys()))
48 | # remove ignore columns
49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
50 | # set type of each column that is kept
51 | for i_col in df.columns:
52 | df = df.astype({i_col: field_types[i_col]})
53 | # set time
54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True)
55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True)
56 | # add symbol column
57 | df["symbol"] = symbol
58 | except Exception as exc:
59 | logger.exception(f"Transformation of data: failed. {exc}")
60 | raise
61 | else:
62 | logger.info("Transformation of data: successful.")
63 | return df
64 |
65 |
66 | @task
67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None:
68 | """Inserts data to timescale."""
69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME,
70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME}
71 | table_name = timescale_schema_map[contract_type]
72 | try:
73 | conn_id = retrieve_conn_id()
74 | ingest_data(conn_id, table_name, df)
75 | except Exception as exc:
76 | logger.exception(f"Insert data to timescale: failed. {exc}")
77 | raise
78 | else:
79 | logger.info(f"Insert data to timescale table {table_name}: successful.")
80 |
--------------------------------------------------------------------------------
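The hourly kline window differs from the funding window in one detail: the end is pulled back by one minute, presumably so that the inclusive request covers exactly the 60 one-minute candles of the previous hour. A worked example mirroring `fetch_data` above, with an illustrative `data_interval_start` for the "5 * * * *" schedule:

```python
from datetime import datetime, timedelta, timezone

# example value Airflow might pass for a run of the hourly schedule
data_interval_start = datetime(2024, 1, 1, 12, 5, tzinfo=timezone.utc)

start_time = datetime(data_interval_start.year, data_interval_start.month,
                      data_interval_start.day, data_interval_start.hour)
end_time = start_time + timedelta(hours=1) - timedelta(minutes=1)
print(start_time, end_time)  # 2024-01-01 12:00:00 2024-01-01 12:59:00
```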
/part3/pipecraft/dags/infopy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/infopy/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/infopy/dag_infopy.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | from libs.airtasks.initial import start_task, end_task
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_infopy",
13 | description="Show all installed python packages.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - execute pip freeze
20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze')
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> pip_task >> end_dummy
25 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/__init__.py:
--------------------------------------------------------------------------------
1 | from . import venues
2 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/airtasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .initial import start_task, end_task
2 | from . import timescale
3 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/airtasks/initial.py:
--------------------------------------------------------------------------------
1 | from airflow.operators.empty import EmptyOperator
2 | from typing import Optional
3 |
4 |
5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
6 | tid = "start" if task_id is None else task_id
7 | return EmptyOperator(task_id=tid, **kwargs)
8 |
9 |
10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
11 | tid = "end" if task_id is None else task_id
12 | return EmptyOperator(task_id=tid, **kwargs)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/airtasks/timescale/__init__.py:
--------------------------------------------------------------------------------
1 | from .ingester import ingest_data
2 | from .conn import retrieve_conn_id
3 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/airtasks/timescale/conn.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.models import Variable
4 |
5 | # create module logger
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | def retrieve_conn_id(id_key: str = "admin") -> str:
10 | """Retrieves timescale connection id."""
11 | try:
12 | if id_key == "admin":
13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN")
14 | elif id_key == "readonly":
15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY")
16 | else:
17 | raise ValueError("Unknown id_key. Select admin or readonly.")
18 | except Exception as exc:
19 |         logger.exception(f"Retrieving timescale connection id ({id_key}): failed. {exc}.")
20 | raise
21 | else:
22 |         logger.info(f"Retrieving timescale connection id ({id_key}): successful.")
23 | return conn_id
24 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/airtasks/timescale/ingester.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from psycopg2.extras import execute_values
5 | from psycopg2.extensions import connection
6 | from airflow.providers.postgres.hooks.postgres import PostgresHook
7 |
8 | # create module logger
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None:
13 | """Bulk insert to timescale."""
14 | try:
15 | # create a list of tuples from dataframe
16 | data_tuples = [tuple(x) for x in df_data.to_numpy()]
17 | # comma-separated dataframe columns
18 | cols = ','.join(list(df_data.columns))
19 | # SQL query to execute
20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
21 | with conn.cursor() as crs:
22 | execute_values(crs, query, data_tuples)
23 | conn.commit()
24 | except Exception as exc:
25 | logger.exception(f"Bulk insert: failed. {exc}.")
26 | raise
27 | else:
28 | logger.info("Bulk insert: successful.")
29 |
30 |
31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None:
32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
33 | _bulk_insert(conn, table_name, df_data)
34 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/__init__.py:
--------------------------------------------------------------------------------
1 | from . import binance
2 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure
2 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/base/base.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from dataclasses import dataclass, fields
3 | from datetime import datetime
4 |
5 |
6 | class Venue(Enum):
7 | """Crypto venues."""
8 | binance = "binance"
9 |
10 |
11 | class VenueAuthentication:
12 | """Base class to authenticate at a venue."""
13 | pass
14 |
15 |
16 | class VenueNet(Enum):
17 | """Production vs test environment."""
18 | mainnet = "mainnet"
19 | testnet = "testnet"
20 |
21 |
22 | class ContractType(Enum):
23 | """The contract type of traded instrument."""
24 | spot = "spot"
25 | future = "future"
26 |
27 |
28 | @dataclass
29 | class Instrument:
30 | """The traded instrument."""
31 | symbol: str
32 | venue: Venue
33 | contract_type: ContractType
34 | first_date: datetime
35 |
36 |
37 | @dataclass
38 | class MarketDataStructure:
39 | """Base class for market data API responses."""
40 |
41 | @classmethod
42 | def get_field_types(cls) -> dict:
43 | return {field.name: field.type for field in fields(cls)}
44 |
45 |
46 | @dataclass
47 | class RequestResultLimit:
48 | """Default and maximum limit on result of an API market data request."""
49 | default: int
50 | max: int
51 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/binance/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import BinanceAuth
2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api
3 | from .config import *
4 | from .types import Kline, FundingRate
5 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/binance/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import logging
3 | from datetime import datetime
4 | from requests import Response, HTTPError
5 | from typing import Optional, Dict, Any, List
6 | from tenacity import retry, stop_after_attempt, wait_exponential
7 | from time import sleep
8 |
9 | from libs.venues.base.base import ContractType, VenueNet
10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers
11 | import libs.venues.binance.config as binance_config
12 |
13 | # create module logger
14 | logger = logging.getLogger(__name__)
15 | # log messages from requests above level warning
16 | logging.getLogger('urllib3').setLevel(logging.WARNING)
17 |
18 | # module constants
19 | _KLINE_INTERVAL: str = "1m"
20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60
21 |
22 |
23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str:
24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL,
25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL},
26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL,
27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}}
28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet]
29 |
30 |
31 | def _get_kline_endpoint(contract_type: ContractType) -> str:
32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE,
33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE}
34 | return kline_ep_map[contract_type]
35 |
36 |
37 | def _get_ping_endpoint(contract_type: ContractType) -> str:
38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING,
39 | ContractType.future: binance_config.FUT_ENDPOINT_PING}
40 | return ping_ep_map[contract_type]
41 |
42 |
43 | def _raise_for_status(response: Response) -> None:
44 | try:
45 | response.raise_for_status()
46 | except HTTPError as http_err:
47 | if response.status_code == 429:
48 |             logger.exception(f"Binance rate limit reached (HTTP 429). "
49 |                              f"Sleeping {_RATE_LIMIT_SLEEPER_IN_SECS}s before re-raising to avoid an IP ban.")
50 |             sleep(_RATE_LIMIT_SLEEPER_IN_SECS)
51 | logger.exception(http_err)
52 | raise
53 |
54 |
55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
56 | def _fetch_api_data(auth: BinanceAuth,
57 | base_url: str,
58 | endpoint: str,
59 | symbol: Optional[str] = None,
60 | start_time: Optional[datetime] = None,
61 | end_time: Optional[datetime] = None,
62 | kline_interval: Optional[str] = None,
63 |                     request_result_limit: Optional[int] = None,
64 | request_timeout_in_secs: int = 10) -> Any:
65 | """Market data fetcher for Binance API."""
66 | request_url: str = f"{base_url}{endpoint}"
67 | headers: dict = prepare_binance_request_headers(auth)
68 |
69 | # build request url, if necessary
70 | if symbol is not None:
71 | request_url += f"?symbol={symbol}"
72 | if start_time is not None:
73 | request_url += f"&startTime={to_ms_int(start_time)}"
74 | if end_time is not None:
75 | request_url += f"&endTime={to_ms_int(end_time)}"
76 | if kline_interval is not None:
77 | request_url += f"&interval={kline_interval}"
78 | if request_result_limit is not None:
79 | request_url += f"&limit={request_result_limit}"
80 | # send get request
81 | response = requests.get(request_url,
82 | headers=headers,
83 | timeout=request_timeout_in_secs)
84 | _raise_for_status(response)
85 | return response.json()
86 |
87 |
88 | def fetch_spot_kline(auth: BinanceAuth,
89 | symbol: str,
90 | start_time: datetime,
91 | end_time: datetime,
92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default,
93 | testnet: bool = False) -> List[list]:
94 | """Fetches spot kline market data from Binance API."""
95 | return _fetch_api_data(auth=auth,
96 | base_url=_get_base_url(ContractType.spot, testnet),
97 | endpoint=_get_kline_endpoint(ContractType.spot),
98 | symbol=symbol,
99 | start_time=start_time,
100 | end_time=end_time,
101 | request_result_limit=request_result_limit,
102 | kline_interval=_KLINE_INTERVAL)
103 |
104 |
105 | def fetch_future_kline(auth: BinanceAuth,
106 | symbol: str,
107 | start_time: Optional[datetime] = None,
108 | end_time: Optional[datetime] = None,
109 | request_result_limit: int = binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default,
110 | testnet: bool = False) -> List[list]:
111 | """Fetches future kline market data from Binance API."""
112 | return _fetch_api_data(auth=auth,
113 | base_url=_get_base_url(ContractType.future, testnet),
114 | endpoint=_get_kline_endpoint(ContractType.future),
115 | symbol=symbol,
116 | start_time=start_time,
117 | end_time=end_time,
118 | request_result_limit=request_result_limit,
119 | kline_interval=_KLINE_INTERVAL)
120 |
121 |
122 | def fetch_funding_rate(auth: BinanceAuth,
123 | symbol: str,
124 | start_time: Optional[datetime] = None,
125 | end_time: Optional[datetime] = None,
126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default,
127 | testnet: bool = False) -> List[Dict[str, Any]]:
128 | """Fetches funding rate market data from Binance API."""
129 | return _fetch_api_data(auth=auth,
130 | base_url=_get_base_url(ContractType.future, testnet),
131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING,
132 | symbol=symbol,
133 | start_time=start_time,
134 | end_time=end_time,
135 | request_result_limit=request_result_limit)
136 |
137 |
138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict:
139 | """Tests connectivity to spot Binance API."""
140 | return _fetch_api_data(auth=auth,
141 | base_url=_get_base_url(ContractType.spot, testnet),
142 | endpoint=binance_config.SPOT_ENDPOINT_PING)
143 |
144 |
145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict:
146 | """Tests connectivity to future Binance API."""
147 | return _fetch_api_data(auth=auth,
148 | base_url=_get_base_url(ContractType.future, testnet),
149 | endpoint=binance_config.FUT_ENDPOINT_PING)
150 |
151 |
152 | def fetch_spot_exchange_info() -> Dict[str, Any]:
153 | raise NotImplementedError
154 |
155 |
156 | def fetch_fut_exchange_info() -> Dict[str, Any]:
157 | raise NotImplementedError
158 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/binance/common.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, asdict
2 | from datetime import datetime, timezone
3 | from typing import Dict, Any
4 |
5 | from libs.venues.base.base import VenueAuthentication
6 |
7 |
8 | @dataclass
9 | class BinanceAuth(VenueAuthentication):
10 | BINANCE_API_KEY: str
11 |
12 | @classmethod
13 | def from_dict(cls, auth_dict: Dict[str, str]):
14 | return cls(auth_dict["BINANCE_API_KEY"])
15 |
16 | def as_dict(self) -> Dict[str, str]:
17 | return asdict(self)
18 |
19 |
20 | def to_ms_int(dt: datetime) -> int:
21 | """Converts datetime timestamp to integer in ms."""
22 | return int(round(dt.timestamp() * 1000))
23 |
24 |
25 | def to_dt(ms_int: int) -> datetime:
26 | """Converts timestamp in ms (integer) to datetime."""
27 |     return datetime.fromtimestamp(ms_int / 1000, tz=timezone.utc)
28 |
29 |
30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]:
31 | """Creates headers for Binance REST API."""
32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY}
33 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/binance/config.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base.base import RequestResultLimit
2 |
3 |
4 | # spot base
5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info
6 | SPOT_MAINNET_URL: str = "https://api.binance.com"
7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision"
8 | SPOT_REQUEST_RATE_LIMIT: int = 6000
9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1
10 |
11 | # spot ping
12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity
13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping"
14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
15 |
16 | # spot exchange info
17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information
18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo"
19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20
20 |
21 | # spot kline
22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data
23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines"
24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2
25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000)
26 |
27 | # futures base
28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info
29 | FUT_MAINNET_URL: str = "https://fapi.binance.com"
30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com"
31 | FUT_REQUEST_RATE_LIMIT: int = 2400
32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1
33 |
34 | # future ping
35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity
36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping"
37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
38 |
39 | # future exchangeInfo
40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information
41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo"
42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1
43 |
44 | # future funding rate
45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history
46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate"
47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500
48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5
49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000)
50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption
51 |
52 | # future kline
53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data
54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines"
55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500)
56 |
57 |
58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int:
59 | """Returns the weight conditional on the request result limit."""
60 |     if 1 <= request_result_limit < 100:
61 |         weight = 1
62 |     elif 100 <= request_result_limit < 500:
63 |         weight = 2
64 |     elif 500 <= request_result_limit < 1000:
65 |         weight = 5
66 |     else:
67 |         weight = 10
68 | return weight
69 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/libs/venues/binance/types.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | from libs.venues.base.base import MarketDataStructure
5 |
6 |
7 | @dataclass
8 | class Kline(MarketDataStructure):
9 | open_time: int
10 | open: float
11 | high: float
12 | low: float
13 | close: float
14 | volume: float
15 | close_time: int
16 | quote_asset_volume: float
17 | number_of_trades: int
18 | taker_buy_base_asset_volume: float
19 | taker_buy_quote_asset_volume: float
20 | ignored: Any
21 |
22 |
23 | @dataclass
24 | class FundingRate(MarketDataStructure):
25 | symbol: str
26 | time: int
27 | funding_rate: float
28 | ignored: Any
29 |
30 | @staticmethod
31 | def get_rename_dict() -> dict:
32 | return {"symbol": "symbol",
33 | "fundingTime": "time",
34 | "fundingRate": "funding_rate",
35 | "markPrice": "ignored"}
36 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/dags/timescale_init/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/dag_timescale_roles.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_roles
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_roles",
13 | description="Timescale initialization pipeline for creating user roles.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create read only user role
20 | roles = create_roles("dags/timescale_init/process/create_roles.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> roles >> end_dummy
25 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/dag_timescale_tables.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_tables
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_tables",
13 | description="Timescale initialization pipeline for creating hypertables.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create hypertables
20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> tables >> end_dummy
25 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/process/__init__.py:
--------------------------------------------------------------------------------
1 | from .tsinit import create_roles, create_tables
2 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/process/create_hypertables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS binance_kline_spot (
2 | open_time TIMESTAMPTZ,
3 | symbol TEXT NOT NULL,
4 | open DOUBLE PRECISION,
5 | high DOUBLE PRECISION,
6 | low DOUBLE PRECISION,
7 | close DOUBLE PRECISION,
8 | volume DOUBLE PRECISION,
9 | close_time TIMESTAMPTZ,
10 | quote_asset_volume DOUBLE PRECISION,
11 | number_of_trades BIGINT,
12 | taker_buy_base_asset_volume DOUBLE PRECISION,
13 | taker_buy_quote_asset_volume DOUBLE PRECISION
14 | );
15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE);
16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC);
17 |
18 | CREATE TABLE IF NOT EXISTS binance_kline_future (
19 | open_time TIMESTAMPTZ,
20 | symbol TEXT NOT NULL,
21 | open DOUBLE PRECISION,
22 | high DOUBLE PRECISION,
23 | low DOUBLE PRECISION,
24 | close DOUBLE PRECISION,
25 | volume DOUBLE PRECISION,
26 | close_time TIMESTAMPTZ,
27 | quote_asset_volume DOUBLE PRECISION,
28 | number_of_trades BIGINT,
29 | taker_buy_base_asset_volume DOUBLE PRECISION,
30 | taker_buy_quote_asset_volume DOUBLE PRECISION
31 | );
32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE);
33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC);
34 |
35 |
36 | CREATE TABLE IF NOT EXISTS binance_funding_future (
37 | time TIMESTAMPTZ,
38 | symbol TEXT NOT NULL,
39 | funding_rate DOUBLE PRECISION
40 | );
41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE);
42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC);
43 |
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/process/create_roles.sql:
--------------------------------------------------------------------------------
1 | CREATE ROLE readaccess;
2 | GRANT USAGE ON SCHEMA public TO readaccess;
3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess;
4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess;
5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD};
6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME};
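7 | 
8 | -- note: the {TIMESCALE_READONLY_USERNAME} and {TIMESCALE_READONLY_PASSWORD} placeholders above are filled in
9 | -- at runtime by process/tsinit.py via psycopg2.sql.SQL(...).format(...) from the corresponding Airflow Variables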
--------------------------------------------------------------------------------
/part3/pipecraft/dags/timescale_init/process/tsinit.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.providers.postgres.hooks.postgres import PostgresHook
4 | from psycopg2 import sql
5 | from psycopg2.sql import Composable
6 | from airflow.models import Variable
7 | from airflow.decorators import task
8 | from typing import Union
9 |
10 | from libs.airtasks.timescale import retrieve_conn_id
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def _read_sql(path: str) -> str:
17 |     """Reads a SQL script."""
18 | try:
19 | with open(path, "r") as sql_script:
20 | sql_cmd_str = sql_script.read()
21 | except Exception as exc:
22 | logger.exception(f"Could not read sql file. {exc}")
23 | raise
24 | else:
25 |         logger.info(f"Read SQL file {path} successfully.")
26 | return sql_cmd_str
27 |
28 |
29 | def _get_roles_sql(path_str: str) -> Composable:
30 | """Constructs the sql script for creating roles."""
31 | # read file
32 | sql_cmd_str = _read_sql(path_str)
33 | try:
34 | # replace dummy variables with environmental variables
35 | sql_cmd = sql.SQL(sql_cmd_str).format(
36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")),
37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD"))
38 | )
39 |         # note: do not log the read-only credentials retrieved above; the
40 |         # password is a secret and must not appear in task logs
41 | except Exception as exc:
42 | logger.exception(f"Get create roles sql statement: failed. {exc}")
43 | raise
44 | else:
45 | logger.info("Get create roles sql statement: successful.")
46 | return sql_cmd
47 |
48 |
49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None:
50 | try:
51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}")
53 | with conn.cursor() as crs:
54 | # execute sql
55 | crs.execute(sql_cmd)
56 | # commit
57 | conn.commit()
58 | except Exception as exc:
59 | logger.exception(f"Executing query: failed. {exc}")
60 | raise
61 | else:
62 | logger.info(f"Executing query: successful.")
63 |
64 |
65 | @task
66 | def create_roles(path_str: str) -> None:
67 | """Creates roles."""
68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str))
69 |
70 |
71 | @task
72 | def create_tables(path_str: str) -> None:
73 | """Creates hypertables."""
74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str))
75 |
76 |
--------------------------------------------------------------------------------
/part3/pipecraft/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part3/pipecraft/plugins/__init__.py
--------------------------------------------------------------------------------
/part3/pipecraft/scripts/entry_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | airflow db migrate
4 |
5 | airflow users create \
6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \
7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \
8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \
9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \
10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \
11 |     --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true  # ignore failure if the admin user already exists (idempotent re-runs)
12 |
13 | echo "Airflow database initialization completed."
14 |
--------------------------------------------------------------------------------
/part3/pipecraft/scripts/gen_fernet_key.py:
--------------------------------------------------------------------------------
1 | from cryptography.fernet import Fernet
2 |
3 |
4 | def get_fernet_key():
5 | """Generates a fernet key."""
6 | return Fernet.generate_key().decode()
7 |
8 |
9 | def main():
10 | print(get_fernet_key())
11 |
12 |
13 | if __name__ == "__main__":
14 | main()
15 |
--------------------------------------------------------------------------------
/part3/requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography~=42.0.5
2 | apache-airflow~=2.8.1
3 | apache-airflow-providers-postgres~=5.10.0
4 | numpy~=1.24.4
5 | pandas~=2.0.3
6 | psycopg2-binary~=2.9.7
7 | requests~=2.31.0
8 | tenacity~=8.2.3
--------------------------------------------------------------------------------
/part4/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 |
3 | Follow these steps to set up the application using Docker Compose:
4 |
5 | 1. Change directory to `./part4/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key.
6 | 2. Change directory to `./part4/compose/` and create a `.env` file (see `.env.template`):
7 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1.
8 |    * Set the environment variable `BINANCE_API_KEY` with
9 |      your [Binance API key](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072).
10 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and
11 | `TIMESCALE_READONLY_PASSWORD`.
12 | 3. Open your terminal in the `./part4/compose/` directory.
13 | 4. Initialize Apache Airflow by executing ``docker compose up airflow-init``.
14 | 5. Start the data infrastructure in detached mode by executing ``docker compose up -d``.
15 | 6. Access the Airflow web interface in a browser at ``localhost:8080`` and complete the one-time
16 |    initialization of Timescale:
17 | - Create a connection to Timescale: Admin → Connections
18 | * Connection Id: timescale_conn_admin
19 | * Connection Type: Postgres
20 | * Host: host.docker.internal
21 | * Database: timescale
22 | * Login: admin
23 | * Password: password
24 | * Port: 5433
25 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles.
26 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables.
27 | 7. Unpause the Binance data pipeline DAGs in the Airflow web interface to start ingesting market data.
28 | 8. Access the Grafana web interface in a browser at ``localhost:3000``.
29 |
30 | A detailed guide can be found here: [SDS #4: Crypto Market Data Dashboard with Grafana](https://x.com/bylethquant/status/1833141733305295348).
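31 | 
32 | To sanity-check the read-only role and the ingested data outside Grafana, you can query Timescale directly
33 | from the host. A minimal sketch using `psycopg2` (e.g. `pip install psycopg2-binary`), assuming the default
34 | values from `.env.template` and the port mapping in `compose/docker-compose.yaml`; adjust credentials if you changed them:
35 | 
36 | ```python
37 | import psycopg2
38 | 
39 | # connect as the read-only user created by the 0_timescale_create_roles DAG
40 | conn = psycopg2.connect(host="localhost", port=5433, dbname="timescale",
41 |                         user="user", password="password")
42 | with conn, conn.cursor() as cur:
43 |     # count ingested rows per symbol in the spot kline hypertable
44 |     cur.execute("SELECT symbol, count(*) FROM binance_kline_spot GROUP BY symbol;")
45 |     print(cur.fetchall())
46 | conn.close()
47 | ```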
--------------------------------------------------------------------------------
/part4/compose/.env.template:
--------------------------------------------------------------------------------
1 | AIRFLOW_FERNET_KEY=
2 | BINANCE_API_KEY=
3 | # needed for setting grafana datasources.yaml correctly
4 | TIMESCALE_PORT=5433
5 | TIMESCALE_DATABASE_NAME=timescale
6 | TIMESCALE_READONLY_USERNAME=user
7 | TIMESCALE_READONLY_PASSWORD=password
--------------------------------------------------------------------------------
/part4/compose/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-part4
2 |
3 | x-airflow-common:
4 | &airflow-common
5 | image: apache/airflow:2.8.1-python3.11
6 | environment:
7 | &airflow-common-env
8 |     AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:5432/${AIRFLOW_DATABASE_NAME:-airflow}"
9 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}"
10 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}"
11 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}"
12 | _AIRFLOW_WWW_USER_ROLE: "Admin"
13 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}"
14 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}"
15 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}"
16 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}"
17 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}"
18 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}"
19 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}"
20 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part4}"
21 | AIRFLOW__CORE__EXECUTOR: LocalExecutor
22 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false"
23 | AIRFLOW__CORE__LOAD_EXAMPLES: "false"
24 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
25 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG"
26 | AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}"
27 | user: ${AIRFLOW_UID:-50000}
28 | depends_on:
29 | airflow-postgres:
30 | condition: service_healthy
31 | volumes:
32 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs
33 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags
34 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config
35 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins
36 |
37 | services:
38 |
39 | airflow-webserver:
40 | <<: *airflow-common
41 | container_name: airflow-webserver
42 | command: webserver
43 | ports:
44 | - "${AIRFLOW_WWW_PORT:-8080}:8080"
45 | restart: always
46 |
47 | airflow-scheduler:
48 | <<: *airflow-common
49 | container_name: airflow-scheduler
50 | command: scheduler
51 | restart: always
52 |
53 | airflow-postgres:
54 | container_name: airflow-postgres
55 | image: postgres:13
56 | environment:
57 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}"
58 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}"
59 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}"
60 | ports:
61 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432"
62 | volumes:
63 | - ../.storage/postgres:/var/lib/postgresql/data
64 | healthcheck:
65 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ]
66 | interval: 5s
67 | retries: 2
68 | start_period: 3s
69 | restart: unless-stopped
70 |
71 | airflow-init:
72 | <<: *airflow-common
73 | container_name: airflow-init
74 | environment:
75 | <<: *airflow-common-env
76 |       _AIRFLOW_DB_UPGRADE: "true"
77 |     restart: "no"
78 | entrypoint: /opt/airflow/scripts/entry_init.sh
79 | volumes:
80 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts
81 |
82 | timescale:
83 | container_name: timescale
84 | image: timescale/timescaledb:latest-pg15
85 | environment:
86 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}"
87 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}"
88 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}"
89 | ports:
90 | - "${TIMESCALE_PORT:-5433}:5432"
91 | volumes:
92 | - ../.storage/timescale:/var/lib/postgresql/data
93 | healthcheck:
94 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ]
95 | interval: 5s
96 | retries: 2
97 | start_period: 3s
98 | restart: unless-stopped
99 |
100 | grafana:
101 | container_name: grafana
102 | image: grafana/grafana:10.0.2
103 | environment:
104 | GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
105 | GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-password}"
106 | GF_DATABASE_SSL_MODE: disable
107 |       GF_ENABLE_GZIP: "true"
108 | env_file:
109 | - .env
110 | ports:
111 | - "${GRAFANA_PORT:-3000}:3000"
112 | depends_on:
113 | timescale:
114 | condition: service_healthy
115 | volumes:
116 | - ../grafana/provisioning:/etc/grafana/provisioning
117 | - ../grafana/dashboards:/var/lib/grafana/dashboards
118 | restart: unless-stopped
--------------------------------------------------------------------------------
/part4/grafana/provisioning/dashboards/dashboards.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'dashboards'
5 | orgId: 1
6 | folder: ''
7 | folderUid: ''
8 | type: file
9 | disableDeletion: true
10 | editable: true
11 | updateIntervalSeconds: 10
12 | allowUiUpdates: false
13 | options:
14 | path: /var/lib/grafana/dashboards
--------------------------------------------------------------------------------
/part4/grafana/provisioning/datasources/datasources.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | datasources:
4 | - name: timescale
5 | type: postgres
6 | url: "host.docker.internal:${TIMESCALE_PORT}"
7 | database: "${TIMESCALE_DATABASE_NAME}"
8 | user: "${TIMESCALE_READONLY_USERNAME}"
9 | secureJsonData:
10 | password: "${TIMESCALE_READONLY_PASSWORD}"
11 | jsonData:
12 | postgresVersion: 1500
13 | sslmode: "disable"
14 | timescaledb: true
15 | tlsAuth: false
16 | tlsAuthWithCACert: false
17 | tlsConfigurationMethod: "file-path"
18 | tlsSkipVerify: true
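19 |       # the ${...} values above are expanded by Grafana from environment variables, which the grafana
20 |       # service receives via `env_file: .env` in compose/docker-compose.yaml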
--------------------------------------------------------------------------------
/part4/pipecraft/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/config/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/.airflowignore:
--------------------------------------------------------------------------------
1 | libs/
--------------------------------------------------------------------------------
/part4/pipecraft/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/binance_market_data/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import SPOT, FUTURE
2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS
3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS
4 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/config/funding.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *"
4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future"
5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
6 | "retries": 2}
7 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/config/kline.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *"
4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot"
5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future"
6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
7 | "retries": 2}
8 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/config/symbols.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base import Instrument, Venue, ContractType
2 | from datetime import datetime, timezone
3 |
4 | SPOT = [
5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)),
6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)),
7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)),
8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)),
10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)),
13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)),
14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)),
15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)),
16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)),
17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)),
18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)),
19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc))
20 | ]
21 |
22 | FUTURE = [
23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)),
24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)),
25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)),
26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)),
27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)),
28 | Instrument("ETHUSDT", Venue.binance, ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)),
29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)),
31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)),
32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)),
33 | Instrument("LTCUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)),
34 | Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)),
35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)),
36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)),
37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc))
38 | ]
39 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def generate_binance_funding_rate_dag(dag_id: str,
17 | instrument: Instrument,
18 | schedule_interval: str,
19 | catchup: bool = False,
20 | testnet: bool = False) -> DAG:
21 | """Generates a DAG for binance funding rate data pipeline."""
22 | with DAG(dag_id=dag_id,
23 | description="Data ingestion pipeline for Binance funding rates.",
24 | start_date=instrument.first_date,
25 | catchup=catchup,
26 | schedule_interval=schedule_interval,
27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag:
28 | # task flow
29 | start_dummy = start_task()
30 | binance_keys = retrieve_binance_secrets()
31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet)
33 | transform = etl_funding_tasks.transform_data(extract)
34 | ingest = etl_funding_tasks.insert_data(transform)
35 | end_dummy = end_task()
36 |
37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
38 |
39 | return dag
40 |
41 |
42 | # create DAGs for funding rates
43 | for instr in dag_config.FUTURE:
44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}"
45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id,
46 | instrument=instr,
47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP)
48 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/dag_binance_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_kline as etl_kline_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 | # create module logger
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def generate_binance_candlestick_dag(dag_id: str,
16 | instrument: Instrument,
17 | schedule_interval: str,
18 | catchup: bool = False,
19 | testnet: bool = False) -> DAG:
20 | """Generates a DAG for binance candlestick data pipeline."""
21 | with DAG(dag_id=dag_id,
22 | description="Data ingestion pipeline for Binance candlestick data.",
23 | start_date=instrument.first_date,
24 | catchup=catchup,
25 | schedule_interval=schedule_interval,
26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag:
27 | # task flow
28 | # - create start task
29 | start_dummy = start_task()
30 | # - retrieve binance api keys
31 | binance_keys = retrieve_binance_secrets()
32 | # - test connectivity of binance api
33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
34 | # - fetch binance candlestick data
35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet)
36 | # - transform data
37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol)
38 | # - insert data to timescale database
39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform)
40 | # - create end task
41 | end_dummy = end_task()
42 |
43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
44 |
45 | return dag
46 |
47 |
48 | # create DAGs for kline
49 | for instr in dag_config.SPOT + dag_config.FUTURE:
50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}"
51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id,
52 | instrument=instr,
53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE)
54 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/process/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/binance_market_data/process/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/process/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from typing import Dict, Any
4 | from airflow.models import Variable
5 | from airflow.decorators import task
6 |
7 | from libs.venues import binance as binance_client
8 | from libs.venues.base import ContractType
9 |
10 | # module logger
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | @task
15 | def retrieve_binance_secrets() -> Dict[str, Any]:
16 | """Retrieves Binance API keys."""
17 | try:
18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY"))
19 | except Exception as exc:
20 | logger.exception(f"Retrieving Binance keys failed. Msg: {exc}.")
21 | raise
22 | else:
23 | logger.info(f"Retrieving Binance keys was successful.")
24 | return binance_keys.as_dict()
25 |
26 |
27 | @task
28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None:
29 | """Tests connectivity to the Rest API."""
30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api,
31 | ContractType.future: binance_client.ping_future_api}
32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet)
33 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/process/etl_funding_future.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, Dict, Any, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME
11 |
12 |
13 | # module logger
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | @task
18 | def fetch_data(auth: dict,
19 | symbol: str,
20 | testnet: bool = False,
21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]:
22 | """Fetches funding rate data."""
23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
24 | start_time = datetime(data_interval_start.year,
25 | data_interval_start.month,
26 | data_interval_start.day,
27 | data_interval_start.hour)
28 | end_time = start_time + timedelta(days=1)
29 | # fetch funding rate data
30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth),
31 | symbol=symbol,
32 | start_time=start_time,
33 | end_time=end_time,
34 | testnet=testnet)
35 | return response
36 |
37 |
38 | @task
39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame:
40 | """Transforms funding rate response from API. """
41 | try:
42 | # process funding rate
43 | field_types = binance_client.FundingRate.get_field_types()
44 | df = pd.DataFrame(data=response)
45 | # re-name columns
46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict())
47 | # remove ignore columns
48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
49 | # set type of each column that is kept
50 | for i_col in df.columns:
51 | df = df.astype({i_col: field_types[i_col]})
52 | # timestamp
53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True)
54 | except Exception as exc:
55 | logger.exception(f"Transformation of data: failed. {exc}")
56 | raise
57 | else:
58 | logger.info("Transformation of data: successful.")
59 | return df
60 |
61 |
62 | @task
63 | def insert_data(df: pd.DataFrame) -> None:
64 | """Inserts funding rate data to timescale."""
65 | try:
66 | conn_id = retrieve_conn_id()
67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df)
68 | except Exception as exc:
69 | logger.exception(f"Insert data to timescale: failed. {exc}")
70 | raise
71 | else:
72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.")
73 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/binance_market_data/process/etl_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from libs.venues.base import ContractType, Instrument
11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME
12 |
13 |
14 | # module logger
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @task
19 | def fetch_data(auth: dict,
20 | instrument: Instrument,
21 | testnet: bool = False,
22 | data_interval_start: Optional[datetime] = None) -> List[list]:
23 | """Sends get request to fetch candlestick data for the previous hour."""
24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline,
25 | ContractType.future: binance_client.fetch_future_kline}
26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
27 | start_time = datetime(data_interval_start.year,
28 | data_interval_start.month,
29 | data_interval_start.day,
30 | data_interval_start.hour)
31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1)
32 | # fetch candlestick data
33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth),
34 | symbol=instrument.symbol,
35 | start_time=start_time,
36 | end_time=end_time,
37 | testnet=testnet)
38 | return response
39 |
40 |
41 | @task
42 | def transform_data(response: list, symbol: str) -> pd.DataFrame:
43 | """Transforms the data and prepares to insert."""
44 | try:
45 | # process klines
46 | field_types = binance_client.Kline.get_field_types()
47 | df = pd.DataFrame(data=response, columns=list(field_types.keys()))
48 | # remove ignore columns
49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
50 | # set type of each column that is kept
51 | for i_col in df.columns:
52 | df = df.astype({i_col: field_types[i_col]})
53 | # set time
54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True)
55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True)
56 | # add symbol column
57 | df["symbol"] = symbol
58 | except Exception as exc:
59 | logger.exception(f"Transformation of data: failed. {exc}")
60 | raise
61 | else:
62 | logger.info("Transformation of data: successful.")
63 | return df
64 |
65 |
66 | @task
67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None:
68 | """Inserts data to timescale."""
69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME,
70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME}
71 | table_name = timescale_schema_map[contract_type]
72 | try:
73 | conn_id = retrieve_conn_id()
74 | ingest_data(conn_id, table_name, df)
75 | except Exception as exc:
76 | logger.exception(f"Insert data to timescale: failed. {exc}")
77 | raise
78 | else:
79 | logger.info(f"Insert data to timescale table {table_name}: successful.")
80 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/infopy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/infopy/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/infopy/dag_infopy.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | from libs.airtasks.initial import start_task, end_task
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_infopy",
13 | description="Show all installed python packages.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - execute pip freeze
20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze')
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> pip_task >> end_dummy
25 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/__init__.py:
--------------------------------------------------------------------------------
1 | from . import venues
2 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/airtasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .initial import start_task, end_task
2 | from . import timescale
3 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/airtasks/initial.py:
--------------------------------------------------------------------------------
1 | from airflow.operators.empty import EmptyOperator
2 | from typing import Optional
3 |
4 |
5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
6 | tid = "start" if task_id is None else task_id
7 | return EmptyOperator(task_id=tid, **kwargs)
8 |
9 |
10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
11 | tid = "end" if task_id is None else task_id
12 | return EmptyOperator(task_id=tid, **kwargs)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/airtasks/timescale/__init__.py:
--------------------------------------------------------------------------------
1 | from .ingester import ingest_data
2 | from .conn import retrieve_conn_id
3 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/airtasks/timescale/conn.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.models import Variable
4 |
5 | # create module logger
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | def retrieve_conn_id(id_key: str = "admin") -> str:
10 | """Retrieves timescale connection id."""
11 | try:
12 | if id_key == "admin":
13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN")
14 | elif id_key == "readonly":
15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY")
16 | else:
17 | raise ValueError("Unknown id_key. Select admin or readonly.")
18 | except Exception as exc:
19 | logger.exception(f"Retrieving timescale connection id for '{id_key}': failed. {exc}")
20 | raise
21 | else:
22 | logger.info(f"Retrieving timescale connection id for '{id_key}': successful.")
23 | return conn_id
24 |
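A minimal usage sketch (not part of the repo): assuming the `TIMESCALE_CONN_ID_*` Airflow Variables from the compose files are set, the two connection ids resolve as follows.

```
from libs.airtasks.timescale import retrieve_conn_id

admin_conn_id = retrieve_conn_id()               # reads Variable TIMESCALE_CONN_ID_ADMIN
readonly_conn_id = retrieve_conn_id("readonly")  # reads Variable TIMESCALE_CONN_ID_READONLY
```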
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/airtasks/timescale/ingester.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from psycopg2.extras import execute_values
5 | from psycopg2.extensions import connection
6 | from airflow.providers.postgres.hooks.postgres import PostgresHook
7 |
8 | # create module logger
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None:
13 | """Bulk insert to timescale."""
14 | try:
15 | # create a list of tuples from dataframe
16 | data_tuples = [tuple(x) for x in df_data.to_numpy()]
17 | # comma-separated dataframe columns
18 | cols = ','.join(list(df_data.columns))
19 | # SQL query to execute
20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
21 | with conn.cursor() as crs:
22 | execute_values(crs, query, data_tuples)
23 | conn.commit()
24 | except Exception as exc:
25 | logger.exception(f"Bulk insert: failed. {exc}.")
26 | raise
27 | else:
28 | logger.info("Bulk insert: successful.")
29 |
30 |
31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None:
32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
33 | _bulk_insert(conn, table_name, df_data)
34 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/__init__.py:
--------------------------------------------------------------------------------
1 | from . import binance
2 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure
2 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/base/base.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from dataclasses import dataclass, fields
3 | from datetime import datetime
4 |
5 |
6 | class Venue(Enum):
7 | """Crypto venues."""
8 | binance = "binance"
9 |
10 |
11 | class VenueAuthentication:
12 | """Base class to authenticate at a venue."""
13 | pass
14 |
15 |
16 | class VenueNet(Enum):
17 | """Production vs test environment."""
18 | mainnet = "mainnet"
19 | testnet = "testnet"
20 |
21 |
22 | class ContractType(Enum):
23 | """The contract type of traded instrument."""
24 | spot = "spot"
25 | future = "future"
26 |
27 |
28 | @dataclass
29 | class Instrument:
30 | """The traded instrument."""
31 | symbol: str
32 | venue: Venue
33 | contract_type: ContractType
34 | first_date: datetime
35 |
36 |
37 | @dataclass
38 | class MarketDataStructure:
39 | """Base class for market data API responses."""
40 |
41 | @classmethod
42 | def get_field_types(cls) -> dict:
43 | return {field.name: field.type for field in fields(cls)}
44 |
45 |
46 | @dataclass
47 | class RequestResultLimit:
48 | """Default and maximum limit on result of an API market data request."""
49 | default: int
50 | max: int
51 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/binance/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import BinanceAuth
2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api
3 | from .config import *
4 | from .types import Kline, FundingRate
5 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/binance/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import logging
3 | from datetime import datetime
4 | from requests import Response, HTTPError
5 | from typing import Optional, Dict, Any, List
6 | from tenacity import retry, stop_after_attempt, wait_exponential
7 | from time import sleep
8 |
9 | from libs.venues.base.base import ContractType, VenueNet
10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers
11 | import libs.venues.binance.config as binance_config
12 |
13 | # create module logger
14 | logger = logging.getLogger(__name__)
15 | # log messages from requests above level warning
16 | logging.getLogger('urllib3').setLevel(logging.WARNING)
17 |
18 | # module constants
19 | _KLINE_INTERVAL: str = "1m"
20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60
21 |
22 |
23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str:
24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL,
25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL},
26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL,
27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}}
28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet]
29 |
30 |
31 | def _get_kline_endpoint(contract_type: ContractType) -> str:
32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE,
33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE}
34 | return kline_ep_map[contract_type]
35 |
36 |
37 | def _get_ping_endpoint(contract_type: ContractType) -> str:
38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING,
39 | ContractType.future: binance_config.FUT_ENDPOINT_PING}
40 | return ping_ep_map[contract_type]
41 |
42 |
43 | def _raise_for_status(response: Response) -> None:
44 | try:
45 | response.raise_for_status()
46 | except HTTPError as http_err:
47 | if response.status_code == 429:
48 | logger.exception("Binance rate limit reached. "
49 | "Sleeping before re-raising to avoid an IP ban.")
50 | sleep(_RATE_LIMIT_SLEEPER_IN_SECS)
51 | logger.exception(http_err)
52 | raise
53 |
54 |
55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
56 | def _fetch_api_data(auth: BinanceAuth,
57 | base_url: str,
58 | endpoint: str,
59 | symbol: Optional[str] = None,
60 | start_time: Optional[datetime] = None,
61 | end_time: Optional[datetime] = None,
62 | kline_interval: Optional[str] = None,
63 | request_result_limit: Optional[int] = None,
64 | request_timeout_in_secs: int = 10) -> Any:
65 | """Market data fetcher for Binance API."""
66 | request_url: str = f"{base_url}{endpoint}"
67 | headers: dict = prepare_binance_request_headers(auth)
68 |
69 | # build request url, if necessary
70 | if symbol is not None:
71 | request_url += f"?symbol={symbol}"
72 | if start_time is not None:
73 | request_url += f"&startTime={to_ms_int(start_time)}"
74 | if end_time is not None:
75 | request_url += f"&endTime={to_ms_int(end_time)}"
76 | if kline_interval is not None:
77 | request_url += f"&interval={kline_interval}"
78 | if request_result_limit is not None:
79 | request_url += f"&limit={request_result_limit}"
80 | # send get request
81 | response = requests.get(request_url,
82 | headers=headers,
83 | timeout=request_timeout_in_secs)
84 | _raise_for_status(response)
85 | return response.json()
86 |
87 |
88 | def fetch_spot_kline(auth: BinanceAuth,
89 | symbol: str,
90 | start_time: datetime,
91 | end_time: datetime,
92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default,
93 | testnet: bool = False) -> List[list]:
94 | """Fetches spot kline market data from Binance API."""
95 | return _fetch_api_data(auth=auth,
96 | base_url=_get_base_url(ContractType.spot, testnet),
97 | endpoint=_get_kline_endpoint(ContractType.spot),
98 | symbol=symbol,
99 | start_time=start_time,
100 | end_time=end_time,
101 | request_result_limit=request_result_limit,
102 | kline_interval=_KLINE_INTERVAL)
103 |
104 |
105 | def fetch_future_kline(auth: BinanceAuth,
106 | symbol: str,
107 | start_time: Optional[datetime] = None,
108 | end_time: Optional[datetime] = None,
109 | request_result_limit: int = binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default,
110 | testnet: bool = False) -> List[list]:
111 | """Fetches future kline market data from Binance API."""
112 | return _fetch_api_data(auth=auth,
113 | base_url=_get_base_url(ContractType.future, testnet),
114 | endpoint=_get_kline_endpoint(ContractType.future),
115 | symbol=symbol,
116 | start_time=start_time,
117 | end_time=end_time,
118 | request_result_limit=request_result_limit,
119 | kline_interval=_KLINE_INTERVAL)
120 |
121 |
122 | def fetch_funding_rate(auth: BinanceAuth,
123 | symbol: str,
124 | start_time: Optional[datetime] = None,
125 | end_time: Optional[datetime] = None,
126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default,
127 | testnet: bool = False) -> List[Dict[str, Any]]:
128 | """Fetches funding rate market data from Binance API."""
129 | return _fetch_api_data(auth=auth,
130 | base_url=_get_base_url(ContractType.future, testnet),
131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING,
132 | symbol=symbol,
133 | start_time=start_time,
134 | end_time=end_time,
135 | request_result_limit=request_result_limit)
136 |
137 |
138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict:
139 | """Tests connectivity to spot Binance API."""
140 | return _fetch_api_data(auth=auth,
141 | base_url=_get_base_url(ContractType.spot, testnet),
142 | endpoint=binance_config.SPOT_ENDPOINT_PING)
143 |
144 |
145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict:
146 | """Tests connectivity to future Binance API."""
147 | return _fetch_api_data(auth=auth,
148 | base_url=_get_base_url(ContractType.future, testnet),
149 | endpoint=binance_config.FUT_ENDPOINT_PING)
150 |
151 |
152 | def fetch_spot_exchange_info() -> Dict[str, Any]:
153 | raise NotImplementedError
154 |
155 |
156 | def fetch_fut_exchange_info() -> Dict[str, Any]:
157 | raise NotImplementedError
158 |
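A hedged usage sketch of the fetchers above (the symbol, time window, and key value are illustrative; in the DAGs the real key comes from the BINANCE_API_KEY Airflow Variable):

```
from datetime import datetime, timezone
from libs.venues.binance import BinanceAuth, fetch_spot_kline

auth = BinanceAuth(BINANCE_API_KEY="example-key")  # hypothetical key for illustration
rows = fetch_spot_kline(auth,
                        symbol="BTCUSDT",
                        start_time=datetime(2024, 1, 1, tzinfo=timezone.utc),
                        end_time=datetime(2024, 1, 1, 8, tzinfo=timezone.utc))
# each row is a raw 1m kline list:
# [open_time, open, high, low, close, volume, close_time, quote_asset_volume, ...]
```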
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/binance/common.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, asdict
2 | from datetime import datetime, timezone
3 | from typing import Dict, Any
4 |
5 | from libs.venues.base.base import VenueAuthentication
6 |
7 |
8 | @dataclass
9 | class BinanceAuth(VenueAuthentication):
10 | BINANCE_API_KEY: str
11 |
12 | @classmethod
13 | def from_dict(cls, auth_dict: Dict[str, str]):
14 | return cls(auth_dict["BINANCE_API_KEY"])
15 |
16 | def as_dict(self) -> Dict[str, str]:
17 | return asdict(self)
18 |
19 |
20 | def to_ms_int(dt: datetime) -> int:
21 | """Converts datetime timestamp to integer in ms."""
22 | return int(round(dt.timestamp() * 1000))
23 |
24 |
25 | def to_dt(ms_int: int) -> datetime:
26 | """Converts timestamp in ms (integer) to datetime."""
27 | return datetime.fromtimestamp(ms_int / 1000, tz=timezone.utc)
28 |
29 |
30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]:
31 | """Creates headers for Binance REST API."""
32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY}
33 |
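For reference, a minimal round trip through the two timestamp helpers above (the example instant is arbitrary):

```
from datetime import datetime, timezone
from libs.venues.binance.common import to_ms_int, to_dt

dt = datetime(2024, 1, 1, tzinfo=timezone.utc)
ms = to_ms_int(dt)      # 1704067200000
assert to_dt(ms) == dt  # converting back recovers the original UTC timestamp
```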
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/binance/config.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base.base import RequestResultLimit
2 |
3 |
4 | # spot base
5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info
6 | SPOT_MAINNET_URL: str = "https://api.binance.com"
7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision"
8 | SPOT_REQUEST_RATE_LIMIT: int = 6000
9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1
10 |
11 | # spot ping
12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity
13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping"
14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
15 |
16 | # spot exchange info
17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information
18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo"
19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20
20 |
21 | # spot kline
22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data
23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines"
24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2
25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000)
26 |
27 | # futures base
28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info
29 | FUT_MAINNET_URL: str = "https://fapi.binance.com"
30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com"
31 | FUT_REQUEST_RATE_LIMIT: int = 2400
32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1
33 |
34 | # future ping
35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity
36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping"
37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
38 |
39 | # future exchangeInfo
40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information
41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo"
42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1
43 |
44 | # future funding rate
45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history
46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate"
47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500
48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5
49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000)
50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption
51 |
52 | # future kline
53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data
54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines"
55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500)
56 |
57 |
58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int:
59 | """Returns the weight conditional on the request result limit."""
60 | if (request_result_limit >= 1) & (request_result_limit < 100):
61 | weight = 1
62 | elif (request_result_limit >= 100) & (request_result_limit < 500):
63 | weight = 2
64 | elif (request_result_limit >= 500) & (request_result_limit < 1000):
65 | weight = 5
66 | else:
67 | weight = 10
68 | return weight
69 |
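A quick sanity check of the weight tiers implemented above (illustrative values only):

```
from libs.venues.binance.config import fut_endpoint_kline_request_weight

assert fut_endpoint_kline_request_weight(99) == 1
assert fut_endpoint_kline_request_weight(499) == 2
assert fut_endpoint_kline_request_weight(999) == 5
assert fut_endpoint_kline_request_weight(1500) == 10
```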
--------------------------------------------------------------------------------
/part4/pipecraft/dags/libs/venues/binance/types.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | from libs.venues.base.base import MarketDataStructure
5 |
6 |
7 | @dataclass
8 | class Kline(MarketDataStructure):
9 | open_time: int
10 | open: float
11 | high: float
12 | low: float
13 | close: float
14 | volume: float
15 | close_time: int
16 | quote_asset_volume: float
17 | number_of_trades: int
18 | taker_buy_base_asset_volume: float
19 | taker_buy_quote_asset_volume: float
20 | ignored: Any
21 |
22 |
23 | @dataclass
24 | class FundingRate(MarketDataStructure):
25 | symbol: str
26 | time: int
27 | funding_rate: float
28 | ignored: Any
29 |
30 | @staticmethod
31 | def get_rename_dict() -> dict:
32 | return {"symbol": "symbol",
33 | "fundingTime": "time",
34 | "fundingRate": "funding_rate",
35 | "markPrice": "ignored"}
36 |
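For reference, the column-to-type map consumed by the kline transform step comes from get_field_types() on the base class; a minimal sketch:

```
from libs.venues.binance import Kline

field_types = Kline.get_field_types()
# e.g. {'open_time': <class 'int'>, 'open': <class 'float'>, ..., 'ignored': typing.Any}
print(field_types["open"])  # <class 'float'>
```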
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/dags/timescale_init/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/dag_timescale_roles.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_roles
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_roles",
13 | description="Timescale initialization pipeline for creating user roles.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create read only user role
20 | roles = create_roles("dags/timescale_init/process/create_roles.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> roles >> end_dummy
25 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/dag_timescale_tables.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_tables
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_tables",
13 | description="Timescale initialization pipeline for creating hypertables.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create hypertables
20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> tables >> end_dummy
25 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/process/__init__.py:
--------------------------------------------------------------------------------
1 | from .tsinit import create_roles, create_tables
2 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/process/create_hypertables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS binance_kline_spot (
2 | open_time TIMESTAMPTZ,
3 | symbol TEXT NOT NULL,
4 | open DOUBLE PRECISION,
5 | high DOUBLE PRECISION,
6 | low DOUBLE PRECISION,
7 | close DOUBLE PRECISION,
8 | volume DOUBLE PRECISION,
9 | close_time TIMESTAMPTZ,
10 | quote_asset_volume DOUBLE PRECISION,
11 | number_of_trades BIGINT,
12 | taker_buy_base_asset_volume DOUBLE PRECISION,
13 | taker_buy_quote_asset_volume DOUBLE PRECISION
14 | );
15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE);
16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC);
17 |
18 | CREATE TABLE IF NOT EXISTS binance_kline_future (
19 | open_time TIMESTAMPTZ,
20 | symbol TEXT NOT NULL,
21 | open DOUBLE PRECISION,
22 | high DOUBLE PRECISION,
23 | low DOUBLE PRECISION,
24 | close DOUBLE PRECISION,
25 | volume DOUBLE PRECISION,
26 | close_time TIMESTAMPTZ,
27 | quote_asset_volume DOUBLE PRECISION,
28 | number_of_trades BIGINT,
29 | taker_buy_base_asset_volume DOUBLE PRECISION,
30 | taker_buy_quote_asset_volume DOUBLE PRECISION
31 | );
32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE);
33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC);
34 |
35 |
36 | CREATE TABLE IF NOT EXISTS binance_funding_future (
37 | time TIMESTAMPTZ,
38 | symbol TEXT NOT NULL,
39 | funding_rate DOUBLE PRECISION
40 | );
41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE);
42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC);
43 |
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/process/create_roles.sql:
--------------------------------------------------------------------------------
1 | CREATE ROLE readaccess;
2 | GRANT USAGE ON SCHEMA public TO readaccess;
3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess;
4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess;
5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD};
6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME};
--------------------------------------------------------------------------------
/part4/pipecraft/dags/timescale_init/process/tsinit.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.providers.postgres.hooks.postgres import PostgresHook
4 | from psycopg2 import sql
5 | from psycopg2.sql import Composable
6 | from airflow.models import Variable
7 | from airflow.decorators import task
8 | from typing import Union
9 |
10 | from libs.airtasks.timescale import retrieve_conn_id
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def _read_sql(path: str) -> str:
17 | """Reads an sql script."""
18 | try:
19 | with open(path, "r") as sql_script:
20 | sql_cmd_str = sql_script.read()
21 | except Exception as exc:
22 | logger.exception(f"Could not read sql file. {exc}")
23 | raise
24 | else:
25 | logger.info(f"Read sql file successfully.")
26 | return sql_cmd_str
27 |
28 |
29 | def _get_roles_sql(path_str: str) -> Composable:
30 | """Constructs the sql script for creating roles."""
31 | # read file
32 | sql_cmd_str = _read_sql(path_str)
33 | try:
34 | # replace dummy variables with environmental variables
35 | sql_cmd = sql.SQL(sql_cmd_str).format(
36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")),
37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD"))
38 | )
39 | # note: the resolved read-only credentials are intentionally not logged,
40 | # so that secrets do not end up in the Airflow task logs
41 | except Exception as exc:
42 | logger.exception(f"Get create roles sql statement: failed. {exc}")
43 | raise
44 | else:
45 | logger.info("Get create roles sql statement: successful.")
46 | return sql_cmd
47 |
48 |
49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None:
50 | try:
51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}")
53 | with conn.cursor() as crs:
54 | # execute sql
55 | crs.execute(sql_cmd)
56 | # commit
57 | conn.commit()
58 | except Exception as exc:
59 | logger.exception(f"Executing query: failed. {exc}")
60 | raise
61 | else:
62 | logger.info(f"Executing query: successful.")
63 |
64 |
65 | @task
66 | def create_roles(path_str: str) -> None:
67 | """Creates roles."""
68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str))
69 |
70 |
71 | @task
72 | def create_tables(path_str: str) -> None:
73 | """Creates hypertables."""
74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str))
75 |
76 |
--------------------------------------------------------------------------------
/part4/pipecraft/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part4/pipecraft/plugins/__init__.py
--------------------------------------------------------------------------------
/part4/pipecraft/scripts/entry_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | airflow db migrate
4 |
5 | airflow users create \
6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \
7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \
8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \
9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \
10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \
11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true
12 |
13 | echo "Airflow database initialization completed."
14 |
--------------------------------------------------------------------------------
/part4/pipecraft/scripts/gen_fernet_key.py:
--------------------------------------------------------------------------------
1 | from cryptography.fernet import Fernet
2 |
3 |
4 | def get_fernet_key():
5 | """Generates a fernet key."""
6 | return Fernet.generate_key().decode()
7 |
8 |
9 | def main():
10 | print(get_fernet_key())
11 |
12 |
13 | if __name__ == "__main__":
14 | main()
15 |
--------------------------------------------------------------------------------
/part4/requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography~=42.0.5
2 | apache-airflow~=2.8.1
3 | apache-airflow-providers-postgres~=5.10.0
4 | numpy~=1.24.4
5 | pandas~=2.0.3
6 | psycopg2-binary~=2.9.7
7 | requests~=2.31.0
8 | tenacity~=8.2.3
--------------------------------------------------------------------------------
/part5/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start
2 |
3 | Follow these steps to set up the application using Docker Compose:
4 |
5 | 1. Change directory to `./part5/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key.
6 | 2. Change directory to `./part5/compose/` and create an `.env` file (see template `.env.template`):
7 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1.
8 | * Set the environment variable `BINANCE_API_KEY` with
9 | your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072).
10 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and
11 | `TIMESCALE_READONLY_PASSWORD`.
12 | 3. Open your terminal.
13 | 4. Create common network for Traefik and data-infra services
14 | ```
15 | docker network create traefik-net
16 | ```
17 | 5. Start Traefik service
18 |
19 | ```
20 | docker compose -f compose.traefik.core.yaml -f compose.traefik.dev.yaml --env-file ./.env up -d
21 | ```
22 | 6. Initialize Apache Airflow
23 |
24 | ```
25 | docker compose -f compose.infra.core.yaml -f compose.infra.dev.yaml --env-file ./.env up airflow-init
26 | ```
27 |
28 | 7. Start the data infrastructure
29 |
30 | ```
31 | docker compose -f compose.infra.core.yaml -f compose.infra.dev.yaml --env-file ./.env up -d
32 | ```
33 |
34 | 8. Access Airflow web interface through a browser at ``airflow.localhost``. Complete the one-time
35 | initialization of Timescale:
36 | - Create a connection to Timescale: Admin → Connections
37 | * Connection Id: timescale_conn_admin
38 | * Connection Type: Postgres
39 | * Host: host.docker.internal
40 | * Database: timescale
41 | * Login: admin
42 | * Password: password
43 | * Port: 5433
44 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles.
45 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables.
46 | 9. Start the Binance data pipelines.
47 | 10. Access Grafana web interface through a browser at ``grafana.localhost``.
48 |
49 | A detailed guide can be found
50 | here:
51 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627)
52 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387)
53 |
--------------------------------------------------------------------------------
/part5/QUICK_START_PROD.md:
--------------------------------------------------------------------------------
1 | # Quick Start PROD
2 |
3 | Preliminary steps:
4 |
5 | * Create [Hetzner Cloud](https://www.hetzner.com/cloud/) server with Docker Compose application installed
6 | * Register domain and link domain to server
7 |
8 | Follow these steps to set up the application using Docker Compose:
9 |
10 | 1. Change directory to `./part5/pipecraft/scripts/` and execute the Python script `gen_fernet_key.py`. Copy the key.
11 | 2. Change directory to `./part5/compose/` and create a `.env.prod` file (see template `.env.prod.template`):
12 | * Set the environment variable `AIRFLOW_FERNET_KEY` with the fernet key created in step 1.
13 | * Set the environment variable `BINANCE_API_KEY` with
14 | your [Binance API keys](https://www.binance.com/en/support/faq/how-to-create-api-keys-on-binance-360002502072).
15 | * Set the environment variables `TIMESCALE_PORT`, `TIMESCALE_DATABASE_NAME`, `TIMESCALE_READONLY_USERNAME`, and
16 | `TIMESCALE_READONLY_PASSWORD`.
17 | * Set the domain you registered `DOMAIN_NAME`. For example: `DOMAIN_NAME=mydomain.com`
18 | * We use [Let's Encrypt](https://letsencrypt.org/) to obtain an SSL/TLS certificate. The certificate is used to enable
19 | HTTPS (SSL/TLS) and secure browser-to-server communication. [Let's Encrypt](https://letsencrypt.org/) issues the
20 | certificate automatically. We need to provide a valid email via `ACME_EMAIL`, which is used for important
21 | communications related to the issued certificate. For example, this email is used to alert you of
22 | impending certificate expirations.
23 | 3. Create custom docker images for Apache Airflow and Grafana that we push
24 | to [GitHub Container Registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) (
25 | GHCR):
26 | 1. Create a GitHub access token to push docker images to GHCR. Store it in a file `.ghcr.secret`
27 | at `./part5/compose/`.
28 | 2. Open your terminal, navigate to `./part5/compose/` and log into GitHub Container Registry with your generated
29 | access token
30 | ```
31 | cat .ghcr.secret | docker login --username <your-github-username> --password-stdin ghcr.io
32 | ```
33 | 3. Navigate to `./part5/pipecraft/` and execute `pipecraft_build_and_push.sh`. We build the custom docker image
34 | using `pipecraft.Dockerfile` and push it to GHCR.
35 | ```
36 | ./pipecraft_build_and_push.sh
37 | ```
38 | 4. Navigate to `./part5/grafana/` and execute `grafana_build_and_push.sh`. We build the custom docker image
39 | using `grafana.Dockerfile` and push it to GHCR.
40 | ```
41 | ./grafana_build_and_push.sh
42 | ```
43 | 4. Connect with the root user to the Hetzner Cloud server and do the following initial steps
44 | 1. Create folders and `acme.json` file (the file is prepared to store sensitive data securely, such as SSL/TLS
45 | certificates from Let's Encrypt)
46 | ```
47 | mkdir -p /docker/part5/storage/traefik
48 | mkdir -p /docker/part5/pipecraft/logs
49 | mkdir -p /docker/part5/compose
50 | touch /docker/part5/storage/traefik/acme.json
51 | chmod 600 /docker/part5/storage/traefik/acme.json
52 | ```
53 | 2. Navigate to `/docker/part5/` and set ownership to the airflow user (important: use the `AIRFLOW_UID`
54 | from the `.env.prod`
55 | file, default=50000)
56 | ```
57 | chown -R 50000:50000 pipecraft
58 | ```
59 | 3. Set read and write access for pipecraft folder (used to write logs by Airflow)
60 | ```
61 | chmod -R u+rwX pipecraft
62 | ```
63 | 4. Copy the local docker compose files, `.env.prod`, and `.ghcr.secret` to the server directory `/docker/part5/compose`.
64 | 5. Log in to the GitHub Container Registry using your access token
65 | ```
66 | cat .ghcr.secret | docker login --username <your-github-username> --password-stdin ghcr.io
67 | ```
68 | 6. Navigate to `/docker/part5/compose` and create a common network
69 | ```
70 | docker network create traefik-net
71 | ```
72 | 5. Start Traefik service
73 | ```
74 | docker compose -f compose.traefik.core.yaml -f compose.traefik.prod.yaml --env-file ./.env.prod up -d
75 | ```
76 | 6. Initialize Apache Airflow
77 |
78 | ```
79 | docker compose -f compose.infra.core.yaml -f compose.infra.prod.yaml --env-file ./.env.prod up airflow-init
80 | ```
81 | 7. Start the data infrastructure
82 | ```
83 | docker compose -f compose.infra.core.yaml -f compose.infra.prod.yaml --env-file ./.env.prod up -d
84 | ```
85 | 8. Access Airflow web interface through a browser at ``airflow.mydomain.com``. Complete the one-time
86 | initialization of Timescale:
87 | - Create a connection to Timescale: Admin → Connections
88 | * Connection Id: timescale_conn_admin
89 | * Connection Type: Postgres
90 | * Host: mydomain.com
91 | * Database: timescale
92 | * Login: admin
93 | * Password: password
94 | * Port: 5433
95 | - Execute the Airflow DAG `0_timescale_create_roles` to create read-only user roles.
96 | - Execute the Airflow DAG `0_timescale_create_tables` to create hypertables.
97 | 9. Start the Binance data pipelines.
98 | 10. Access Grafana web interface through a browser at ``grafana.mydomain.com``.
99 |
100 | A detailed guide can be found
101 | here:
102 | * [SDS #5-1: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1835662178571190627)
103 | * [SDS #5-2: How to Set Up the Data Stack in the Cloud](https://x.com/bylethquant/status/1836390688524767387)
104 |
--------------------------------------------------------------------------------
/part5/compose/.env.prod.template:
--------------------------------------------------------------------------------
1 | AIRFLOW_FERNET_KEY=
2 | BINANCE_API_KEY=
3 | # needed for setting grafana datasources.yaml correctly
4 | TIMESCALE_PORT=5433
5 | TIMESCALE_DATABASE_NAME=timescale
6 | TIMESCALE_READONLY_USERNAME=user
7 | TIMESCALE_READONLY_PASSWORD=password
8 | # set domain
9 | DOMAIN_NAME=
10 | ACME_EMAIL=
--------------------------------------------------------------------------------
/part5/compose/.env.template:
--------------------------------------------------------------------------------
1 | AIRFLOW_FERNET_KEY=
2 | BINANCE_API_KEY=
3 | # needed for setting grafana datasources.yaml correctly
4 | TIMESCALE_PORT=5433
5 | TIMESCALE_DATABASE_NAME=timescale
6 | TIMESCALE_READONLY_USERNAME=user
7 | TIMESCALE_READONLY_PASSWORD=password
--------------------------------------------------------------------------------
/part5/compose/compose.infra.core.yaml:
--------------------------------------------------------------------------------
1 | x-airflow-common:
2 | &airflow-common
3 | image: ghcr.io/bylethquant/sds-pipecraft:latest
4 | environment:
5 | &airflow-common-env
6 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${AIRFLOW_DATABASE_USERNAME:-admin}:${AIRFLOW_DATABASE_PASSWORD:-password}@airflow-postgres:${AIRFLOW_DATABASE_PORT:-5432}/${AIRFLOW_DATABASE_NAME:-airflow}"
7 | AIRFLOW__CORE__FERNET_KEY: "${AIRFLOW_FERNET_KEY}"
8 | _AIRFLOW_WWW_USER_USERNAME: "${AIRFLOW_WWW_USER_USERNAME:-admin}"
9 | _AIRFLOW_WWW_USER_PASSWORD: "${AIRFLOW_WWW_USER_PASSWORD:-password}"
10 | _AIRFLOW_WWW_USER_ROLE: "Admin"
11 | _AIRFLOW_WWW_USER_FIRSTNAME: "${AIRFLOW_WWW_USER_FIRSTNAME:-firstname}"
12 | _AIRFLOW_WWW_USER_LASTNAME: "${AIRFLOW_WWW_USER_LASTNAME:-lastname}"
13 | _AIRFLOW_WWW_USER_EMAIL: "${AIRFLOW_WWW_USER_EMAIL:-admin@example.com}"
14 | AIRFLOW_VAR_TIMESCALE_READONLY_USERNAME: "${TIMESCALE_READONLY_USERNAME:-user}"
15 | AIRFLOW_VAR_TIMESCALE_READONLY_PASSWORD: "${TIMESCALE_READONLY_PASSWORD:-password}"
16 | AIRFLOW_VAR_TIMESCALE_CONN_ID_ADMIN: "${TIMESCALE_CONN_ID_ADMIN:-timescale_conn_admin}"
17 | AIRFLOW_VAR_TIMESCALE_CONN_ID_READONLY: "${TIMESCALE_CONN_ID_READONLY:-timescale_conn_readonly}"
18 | AIRFLOW_VAR_ROOT_PROJ_NAME: "${ROOT_PROJ_NAME:-part5}"
19 | AIRFLOW_VAR_BINANCE_API_KEY: "${BINANCE_API_KEY}"
20 | user: ${AIRFLOW_UID:-50000}
21 | depends_on:
22 | airflow-postgres:
23 | condition: service_healthy
24 | volumes:
25 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/logs:/opt/airflow/logs
26 |
27 |
28 | services:
29 |
30 | airflow-webserver:
31 | <<: *airflow-common
32 | container_name: airflow-webserver
33 | command: webserver
34 |
35 | airflow-scheduler:
36 | <<: *airflow-common
37 | container_name: airflow-scheduler
38 | command: scheduler
39 |
40 | airflow-postgres:
41 | container_name: airflow-postgres
42 | image: postgres:13
43 | environment:
44 | POSTGRES_DB: "${AIRFLOW_DATABASE_NAME:-airflow}"
45 | POSTGRES_USER: "${AIRFLOW_DATABASE_USERNAME:-admin}"
46 | POSTGRES_PASSWORD: "${AIRFLOW_DATABASE_PASSWORD:-password}"
47 |
48 | airflow-init:
49 | <<: *airflow-common
50 | container_name: airflow-init
51 | environment:
52 | <<: *airflow-common-env
53 | _AIRFLOW_DB_UPGRADE: true
54 | restart: "no"
55 |
56 | timescale:
57 | container_name: timescale
58 | image: timescale/timescaledb:latest-pg15
59 | environment:
60 | POSTGRES_DB: "${TIMESCALE_DATABASE_NAME:-timescale}"
61 | POSTGRES_USER: "${TIMESCALE_ADMIN_USERNAME:-admin}"
62 | POSTGRES_PASSWORD: "${TIMESCALE_ADMIN_PASSWORD:-password}"
63 |
64 | grafana:
65 | container_name: grafana
66 | image: ghcr.io/bylethquant/sds-grafana:latest
67 | environment:
68 | GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
69 | GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-password}"
70 | depends_on:
71 | timescale:
72 | condition: service_healthy
73 |
74 | networks:
75 | default:
76 | name: traefik-net
77 | external: true
--------------------------------------------------------------------------------
/part5/compose/compose.infra.dev.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-dev
2 |
3 | x-airflow-common-dev:
4 | &airflow-common-dev
5 | image: apache/airflow:2.8.1-python3.11
6 | environment:
7 | AIRFLOW__CORE__EXECUTOR: LocalExecutor
8 | AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS: "false"
9 | AIRFLOW__CORE__LOAD_EXAMPLES: "false"
10 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: "true"
11 | AIRFLOW__LOGGING__LOGGING_LEVEL: "DEBUG"
12 | volumes:
13 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/dags:/opt/airflow/dags
14 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/config:/opt/airflow/config
15 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/plugins:/opt/airflow/plugins
16 |
17 | services:
18 |
19 | airflow-webserver:
20 | <<: *airflow-common-dev
21 | healthcheck:
22 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ]
23 | interval: 30s
24 | timeout: 10s
25 | retries: 5
26 | start_period: 30s
27 | restart: unless-stopped
28 | labels:
29 | - "traefik.enable=true"
30 | - "traefik.http.routers.airflow-webserver.rule=Host(`airflow.${DOMAIN_NAME:-localhost}`)"
31 |
32 | airflow-scheduler:
33 | <<: *airflow-common-dev
34 | restart: unless-stopped
35 |
36 | airflow-postgres:
37 | ports:
38 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432"
39 | volumes:
40 | - ../.storage/postgres:/var/lib/postgresql/data
41 | healthcheck:
42 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME:-admin}" ]
43 | interval: 5s
44 | retries: 2
45 | start_period: 3s
46 | restart: unless-stopped
47 |
48 | airflow-init:
49 | <<: *airflow-common-dev
50 | entrypoint: /opt/airflow/scripts/entry_init.sh
51 | volumes:
52 | - ${AIRFLOW_PROJ_DIR:-../pipecraft}/scripts:/opt/airflow/scripts
53 |
54 | timescale:
55 | ports:
56 | - "${TIMESCALE_PORT:-5433}:5432"
57 | volumes:
58 | - ../.storage/timescale:/var/lib/postgresql/data
59 | healthcheck:
60 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ]
61 | interval: 5s
62 | retries: 2
63 | start_period: 3s
64 | restart: unless-stopped
65 |
66 | grafana:
67 | image: grafana/grafana:10.0.2
68 | environment:
69 | GF_DATABASE_SSL_MODE: disable
70 | GF_ENABLE_GZIP: true
71 | env_file:
72 | - .env
73 | volumes:
74 | - ../grafana/dev/provisioning:/etc/grafana/provisioning
75 | - ../grafana/dev/dashboards:/var/lib/grafana/dashboards
76 | restart: unless-stopped
77 | labels:
78 | - "traefik.enable=true"
79 | - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN_NAME:-localhost}`)"
--------------------------------------------------------------------------------
/part5/compose/compose.infra.prod.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-prod
2 |
3 | services:
4 |
5 | airflow-webserver:
6 | healthcheck:
7 | test: [ "CMD", "curl", "--fail", "http://localhost:8080/health" ]
8 | interval: 60s
9 | timeout: 10s
10 | retries: 5
11 | start_period: 30s
12 | restart: unless-stopped
13 | labels:
14 | - "traefik.enable=true"
15 | - "traefik.http.routers.airflow-webserver.rule=Host(`airflow.${DOMAIN_NAME}`)"
16 | - "traefik.http.routers.airflow-webserver.entrypoints=websecure"
17 | - "traefik.http.routers.airflow-webserver.tls.certresolver=leresolver"
18 |
19 | airflow-scheduler:
20 | restart: unless-stopped
21 |
22 | airflow-postgres:
23 | ports:
24 | - "${AIRFLOW_DATABASE_PORT:-5432}:5432"
25 | volumes:
26 | - ../storage/postgres:/var/lib/postgresql/data
27 | healthcheck:
28 | test: [ "CMD", "pg_isready", "-q", "-d", "${AIRFLOW_DATABASE_NAME:-airflow}", "-U", "${AIRFLOW_DATABASE_USERNAME_admin}" ]
29 | interval: 60s
30 | retries: 10
31 | start_period: 10s
32 | restart: unless-stopped
33 |
34 | airflow-init:
35 | entrypoint: /opt/airflow/scripts/entry_init.sh
36 |
37 | timescale:
38 | ports:
39 | - "${TIMESCALE_PORT:-5433}:5432"
40 | volumes:
41 | - ../storage/timescale:/var/lib/postgresql/data
42 | healthcheck:
43 | test: [ "CMD", "pg_isready", "-q", "-d", "${TIMESCALE_DATABASE_NAME:-timescale}", "-U", "${TIMESCALE_ADMIN_USERNAME:-admin}" ]
44 | interval: 60s
45 | retries: 10
46 | start_period: 10s
47 | restart: unless-stopped
48 |
49 | grafana:
50 | env_file:
51 | - .env.prod
52 | restart: unless-stopped
53 | labels:
54 | - "traefik.enable=true"
55 | - "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN_NAME}`)"
56 | - "traefik.http.routers.grafana.entrypoints=websecure"
57 | - "traefik.http.routers.grafana.tls.certresolver=leresolver"
--------------------------------------------------------------------------------
/part5/compose/compose.traefik.core.yaml:
--------------------------------------------------------------------------------
1 | services:
2 |
3 | traefik:
4 | image: traefik:v3.0
5 | container_name: traefik
6 |
7 | networks:
8 | default:
9 | name: traefik-net
10 | external: true
--------------------------------------------------------------------------------
/part5/compose/compose.traefik.dev.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-traefik-dev
2 |
3 | services:
4 |
5 | traefik:
6 | command:
7 | - "--api.insecure=true"
8 | - "--providers.docker=true"
9 | - "--log.level=DEBUG"
10 | ports:
11 | - "80:80"
12 | - "${TRAEFIK_PORT:-8080}:8080" # traefik dashboard
13 | volumes:
14 | - /var/run/docker.sock:/var/run/docker.sock:ro
--------------------------------------------------------------------------------
/part5/compose/compose.traefik.prod.yaml:
--------------------------------------------------------------------------------
1 | name: data-infra-traefik-prod
2 |
3 | services:
4 |
5 | traefik:
6 | command:
7 | # configure entrypoint
8 | - "--entrypoints.web.address=:80"
9 | - "--entrypoints.websecure.address=:443"
10 | # configure docker
11 | - "--providers.docker"
12 | - "--providers.docker.exposedbydefault=false"
13 | - "--providers.docker.network=traefik-net"
14 | # configure logs
15 | - "--log.level=ERROR"
16 | # configure SSL
17 | - "--certificatesresolvers.leresolver.acme.httpchallenge=true"
18 | - "--certificatesresolvers.leresolver.acme.httpchallenge.entrypoint=web"
19 | - "--certificatesresolvers.leresolver.acme.email=${ACME_EMAIL}"
20 | - "--certificatesresolvers.leresolver.acme.storage=/le/acme.json"
21 | # global HTTP -> HTTPS
22 | - "--entrypoints.web.http.redirections.entryPoint.to=websecure"
23 | - "--entrypoints.web.http.redirections.entryPoint.scheme=https"
24 | ports:
25 | - "80:80"
26 | - "443:443"
27 | volumes:
28 | - /var/run/docker.sock:/var/run/docker.sock:ro
29 | - ../storage/traefik/acme.json:/le/acme.json
--------------------------------------------------------------------------------
/part5/grafana/dev/provisioning/dashboards/dashboards.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'dashboards'
5 | orgId: 1
6 | folder: ''
7 | folderUid: ''
8 | type: file
9 | disableDeletion: true
10 | editable: true
11 | updateIntervalSeconds: 10
12 | allowUiUpdates: false
13 | options:
14 | path: /var/lib/grafana/dashboards
--------------------------------------------------------------------------------
/part5/grafana/dev/provisioning/datasources/datasources.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | datasources:
4 | - name: timescale
5 | type: postgres
6 | url: "host.docker.internal:${TIMESCALE_PORT}"
7 | database: "${TIMESCALE_DATABASE_NAME}"
8 | user: "${TIMESCALE_READONLY_USERNAME}"
9 | secureJsonData:
10 | password: "${TIMESCALE_READONLY_PASSWORD}"
11 | jsonData:
12 | postgresVersion: 1500
13 | sslmode: "disable"
14 | timescaledb: true
15 | tlsAuth: false
16 | tlsAuthWithCACert: false
17 | tlsConfigurationMethod: "file-path"
18 | tlsSkipVerify: true
--------------------------------------------------------------------------------
/part5/grafana/grafana.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM grafana/grafana:10.0.2
2 |
3 | # set environment variables
4 | ENV GF_DATABASE_SSL_MODE=disable
5 | ENV GF_ENABLE_GZIP=true
6 |
7 | # copy provisioning and dashboards configurations
8 | COPY prod/dashboards /var/lib/grafana/dashboards
9 | COPY prod/provisioning /etc/grafana/provisioning
10 |
11 | # expose port 3000 for Grafana UI
12 | EXPOSE 3000
13 |
14 | # connect docker image to your repo (not required)
15 | # LABEL org.opencontainers.image.source https://github.com/bylethquant/substack-data-infra
16 |
17 | # start grafana
18 | CMD ["grafana-server", "--config", "/etc/grafana/grafana.ini"]
19 |
--------------------------------------------------------------------------------
/part5/grafana/grafana_build_and_push.sh:
--------------------------------------------------------------------------------
1 | # define the image name, tag, and dockerfile name
2 | CONTAINER_REGISTRY="ghcr.io/bylethquant/"
3 | IMAGE_NAME="sds-grafana"
4 | TAG="latest"
5 | DOCKERFILE_NAME="grafana.Dockerfile"
6 |
7 | # build the docker image
8 | docker build -t $CONTAINER_REGISTRY$IMAGE_NAME:$TAG -f $DOCKERFILE_NAME .
9 |
10 | # push the docker image to the repository
11 | docker push $CONTAINER_REGISTRY$IMAGE_NAME:$TAG
12 |
--------------------------------------------------------------------------------
/part5/grafana/prod/provisioning/dashboards/dashboards.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | providers:
4 | - name: 'dashboards'
5 | orgId: 1
6 | folder: ''
7 | folderUid: ''
8 | type: file
9 | disableDeletion: true
10 | editable: true
11 | updateIntervalSeconds: 10
12 | allowUiUpdates: false
13 | options:
14 | path: /var/lib/grafana/dashboards
--------------------------------------------------------------------------------
/part5/grafana/prod/provisioning/datasources/datasources.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: 1
2 |
3 | datasources:
4 | - name: timescale
5 | type: postgres
6 | url: "${DOMAIN_NAME}:${TIMESCALE_PORT}"
7 | database: "${TIMESCALE_DATABASE_NAME}"
8 | user: "${TIMESCALE_READONLY_USERNAME}"
9 | secureJsonData:
10 | password: "${TIMESCALE_READONLY_PASSWORD}"
11 | jsonData:
12 | postgresVersion: 1500
13 | sslmode: "disable"
14 | timescaledb: true
15 | tlsAuth: false
16 | tlsAuthWithCACert: false
17 | tlsConfigurationMethod: "file-path"
18 | tlsSkipVerify: true
--------------------------------------------------------------------------------
/part5/pipecraft/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/config/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/.airflowignore:
--------------------------------------------------------------------------------
1 | libs/
--------------------------------------------------------------------------------
/part5/pipecraft/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/binance_market_data/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/config/__init__.py:
--------------------------------------------------------------------------------
1 | from .symbols import SPOT, FUTURE
2 | from .kline import DAG_SCHEDULE_INTERVAL_KLINE, TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME, DAG_KLINE_DEFAULT_ARGS
3 | from .funding import DAG_SCHEDULE_INTERVAL_FUNDING_PERP, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, DAG_FUNDING_DEFAULT_ARGS
4 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/config/funding.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_FUNDING_PERP: str = "5 0 * * *"
4 | TIMESCALE_FUNDING_FUTURE_TABLE_NAME: str = "binance_funding_future"
5 | DAG_FUNDING_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
6 | "retries": 2}
7 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/config/kline.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | DAG_SCHEDULE_INTERVAL_KLINE: str = "5 * * * *"
4 | TIMESCALE_KLINE_SPOT_TABLE_NAME: str = "binance_kline_spot"
5 | TIMESCALE_KLINE_FUTURE_TABLE_NAME: str = "binance_kline_future"
6 | DAG_KLINE_DEFAULT_ARGS: dict = {"retry_delay": timedelta(minutes=1),
7 | "retries": 2}
8 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/config/symbols.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base import Instrument, Venue, ContractType
2 | from datetime import datetime, timezone
3 |
4 | SPOT = [
5 | Instrument("ADAUSDT", Venue.binance, ContractType.spot, datetime(2018, 4, 18, 0, tzinfo=timezone.utc)),
6 | Instrument("ATOMUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 30, 0, tzinfo=timezone.utc)),
7 | Instrument("AVAXUSDT", Venue.binance, ContractType.spot, datetime(2020, 9, 23, 0, tzinfo=timezone.utc)),
8 | Instrument("BTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
9 | Instrument("DOGEUSDT", Venue.binance, ContractType.spot, datetime(2019, 7, 6, 0, tzinfo=timezone.utc)),
10 | Instrument("ETHUSDT", Venue.binance, ContractType.spot, datetime(2017, 8, 18, 0, tzinfo=timezone.utc)),
11 | Instrument("FTMUSDT", Venue.binance, ContractType.spot, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
12 | Instrument("SOLUSDT", Venue.binance, ContractType.spot, datetime(2020, 8, 12, 0, tzinfo=timezone.utc)),
13 | Instrument("MATICUSDT", Venue.binance, ContractType.spot, datetime(2019, 4, 27, 0, tzinfo=timezone.utc)),
14 | Instrument("LINKUSDT", Venue.binance, ContractType.spot, datetime(2019, 1, 17, 0, tzinfo=timezone.utc)),
15 | Instrument("LTCUSDT", Venue.binance, ContractType.spot, datetime(2017, 12, 14, 0, tzinfo=timezone.utc)),
16 | Instrument("TRXUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 12, 0, tzinfo=timezone.utc)),
17 | Instrument("VETUSDT", Venue.binance, ContractType.spot, datetime(2018, 7, 26, 0, tzinfo=timezone.utc)),
18 | Instrument("XLMUSDT", Venue.binance, ContractType.spot, datetime(2018, 6, 1, 0, tzinfo=timezone.utc)),
19 | Instrument("XRPUSDT", Venue.binance, ContractType.spot, datetime(2019, 3, 16, 0, tzinfo=timezone.utc))
20 | ]
21 |
22 | FUTURE = [
23 | Instrument("ADAUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 1, 0, tzinfo=timezone.utc)),
24 | Instrument("ATOMUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 8, 0, tzinfo=timezone.utc)),
25 | Instrument("AVAXUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 24, 0, tzinfo=timezone.utc)),
26 | Instrument("BTCUSDT", Venue.binance, ContractType.future, datetime(2019, 9, 9, 0, tzinfo=timezone.utc)),
27 | Instrument("DOGEUSDT", Venue.binance, ContractType.future, datetime(2020, 7, 11, 0, tzinfo=timezone.utc)),
28 | Instrument("ETHUSDT", Venue.binance, ContractType.future, datetime(2019, 11, 28, 0, tzinfo=timezone.utc)),
29 | Instrument("FTMUSDT", Venue.binance, ContractType.future, datetime(2019, 6, 12, 0, tzinfo=timezone.utc)),
30 | Instrument("SOLUSDT", Venue.binance, ContractType.future, datetime(2020, 9, 15, 0, tzinfo=timezone.utc)),
31 | Instrument("MATICUSDT", Venue.binance, ContractType.future, datetime(2020, 10, 23, 0, tzinfo=timezone.utc)),
32 | Instrument("LINKUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 18, 0, tzinfo=timezone.utc)),
33 | Instrument("LTCUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 10, 0, tzinfo=timezone.utc)),
34 | Instrument("TRXUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 16, 0, tzinfo=timezone.utc)),
35 | Instrument("VETUSDT", Venue.binance, ContractType.future, datetime(2020, 2, 15, 0, tzinfo=timezone.utc)),
36 | Instrument("XLMUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 21, 0, tzinfo=timezone.utc)),
37 | Instrument("XRPUSDT", Venue.binance, ContractType.future, datetime(2020, 1, 7, 0, tzinfo=timezone.utc))
38 | ]
39 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/dag_binance_funding_rate.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_funding_future as etl_funding_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def generate_binance_funding_rate_dag(dag_id: str,
17 | instrument: Instrument,
18 | schedule_interval: str,
19 | catchup: bool = False,
20 | testnet: bool = False) -> DAG:
21 |     """Generates a DAG for the Binance funding rate data pipeline."""
22 | with DAG(dag_id=dag_id,
23 | description="Data ingestion pipeline for Binance funding rates.",
24 | start_date=instrument.first_date,
25 | catchup=catchup,
26 | schedule_interval=schedule_interval,
27 | default_args=dag_config.DAG_FUNDING_DEFAULT_ARGS) as dag:
28 | # task flow
29 | start_dummy = start_task()
30 | binance_keys = retrieve_binance_secrets()
31 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
32 | extract = etl_funding_tasks.fetch_data(binance_keys, instrument.symbol, testnet=testnet)
33 | transform = etl_funding_tasks.transform_data(extract)
34 | ingest = etl_funding_tasks.insert_data(transform)
35 | end_dummy = end_task()
36 |
37 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
38 |
39 | return dag
40 |
41 |
42 | # create DAGs for funding rates
43 | for instr in dag_config.FUTURE:
44 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}"
45 | globals()[dag_instance_id] = generate_binance_funding_rate_dag(dag_id=dag_instance_id,
46 | instrument=instr,
47 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP)
48 |
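A minimal sketch (not part of the repository) of how the DAG factory above could be exercised in isolation, for example from a Python shell inside the Airflow container where the dags folder is on the import path; the import paths mirror this repository's layout, the rest is illustrative:

    from datetime import datetime, timezone

    import binance_market_data.config as dag_config
    from binance_market_data.dag_binance_funding_rate import generate_binance_funding_rate_dag
    from libs.venues.base import ContractType, Instrument, Venue

    # build one funding-rate DAG for a single instrument
    instr = Instrument("BTCUSDT", Venue.binance, ContractType.future,
                       datetime(2019, 9, 9, 0, tzinfo=timezone.utc))
    dag = generate_binance_funding_rate_dag(
        dag_id=f"{instr.venue.value}_{instr.symbol}_funding_{instr.contract_type.value}",
        instrument=instr,
        schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_FUNDING_PERP,
    )
    print(dag.dag_id, len(dag.tasks))  # binance_BTCUSDT_funding_future 7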
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/dag_binance_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow import DAG
4 |
5 | import binance_market_data.process.etl_kline as etl_kline_tasks
6 | import binance_market_data.config as dag_config
7 | from libs.airtasks.initial import start_task, end_task
8 | from binance_market_data.process.common import retrieve_binance_secrets, test_api_connectivity
9 | from libs.venues.base import Instrument
10 |
11 | # create module logger
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def generate_binance_candlestick_dag(dag_id: str,
16 | instrument: Instrument,
17 | schedule_interval: str,
18 | catchup: bool = False,
19 | testnet: bool = False) -> DAG:
20 |     """Generates a DAG for the Binance candlestick data pipeline."""
21 | with DAG(dag_id=dag_id,
22 | description="Data ingestion pipeline for Binance candlestick data.",
23 | start_date=instrument.first_date,
24 | catchup=catchup,
25 | schedule_interval=schedule_interval,
26 | default_args=dag_config.DAG_KLINE_DEFAULT_ARGS) as dag:
27 | # task flow
28 | # - create start task
29 | start_dummy = start_task()
30 | # - retrieve binance api keys
31 | binance_keys = retrieve_binance_secrets()
32 | # - test connectivity of binance api
33 | ping_api = test_api_connectivity(binance_keys, testnet, instrument.contract_type)
34 | # - fetch binance candlestick data
35 | extract = etl_kline_tasks.fetch_data(binance_keys, instrument, testnet=testnet)
36 | # - transform data
37 | transform = etl_kline_tasks.transform_data(extract, instrument.symbol)
38 | # - insert data to timescale database
39 | ingest = etl_kline_tasks.insert_data(instrument.contract_type, transform)
40 | # - create end task
41 | end_dummy = end_task()
42 |
43 | start_dummy >> binance_keys >> ping_api >> extract >> transform >> ingest >> end_dummy
44 |
45 | return dag
46 |
47 |
48 | # create DAGs for kline
49 | for instr in dag_config.SPOT + dag_config.FUTURE:
50 | dag_instance_id = f"{instr.venue.value}_{instr.symbol}_kline_{instr.contract_type.value}"
51 | globals()[dag_instance_id] = generate_binance_candlestick_dag(dag_id=dag_instance_id,
52 | instrument=instr,
53 | schedule_interval=dag_config.DAG_SCHEDULE_INTERVAL_KLINE)
54 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/process/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/binance_market_data/process/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/process/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from typing import Dict, Any
4 | from airflow.models import Variable
5 | from airflow.decorators import task
6 |
7 | from libs.venues import binance as binance_client
8 | from libs.venues.base import ContractType
9 |
10 | # module logger
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | @task
15 | def retrieve_binance_secrets() -> Dict[str, Any]:
16 | """Retrieves Binance API keys."""
17 | try:
18 | binance_keys = binance_client.BinanceAuth(Variable.get("BINANCE_API_KEY"))
19 | except Exception as exc:
20 | logger.exception(f"Retrieving Binance keys failed. Msg: {exc}.")
21 | raise
22 | else:
23 | logger.info(f"Retrieving Binance keys was successful.")
24 | return binance_keys.as_dict()
25 |
26 |
27 | @task
28 | def test_api_connectivity(auth: dict, testnet: bool, contract_type: ContractType) -> None:
29 |     """Tests connectivity to the Binance REST API."""
30 | connectivity_map = {ContractType.spot: binance_client.ping_spot_api,
31 | ContractType.future: binance_client.ping_future_api}
32 | connectivity_map[contract_type](binance_client.BinanceAuth.from_dict(auth), testnet)
33 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/process/etl_funding_future.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, Dict, Any, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from binance_market_data.config import TIMESCALE_FUNDING_FUTURE_TABLE_NAME
11 |
12 |
13 | # module logger
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | @task
18 | def fetch_data(auth: dict,
19 | symbol: str,
20 | testnet: bool = False,
21 | data_interval_start: Optional[datetime] = None) -> List[Dict[str, Any]]:
22 | """Fetches funding rate data."""
23 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
24 | start_time = datetime(data_interval_start.year,
25 | data_interval_start.month,
26 | data_interval_start.day,
27 | data_interval_start.hour)
28 | end_time = start_time + timedelta(days=1)
29 | # fetch funding rate data
30 | response = binance_client.fetch_funding_rate(auth=binance_client.BinanceAuth.from_dict(auth),
31 | symbol=symbol,
32 | start_time=start_time,
33 | end_time=end_time,
34 | testnet=testnet)
35 | return response
36 |
37 |
38 | @task
39 | def transform_data(response: List[Dict[str, Any]]) -> pd.DataFrame:
40 |     """Transforms the funding rate response from the API."""
41 | try:
42 | # process funding rate
43 | field_types = binance_client.FundingRate.get_field_types()
44 | df = pd.DataFrame(data=response)
45 | # re-name columns
46 | df = df.rename(columns=binance_client.FundingRate.get_rename_dict())
47 | # remove ignore columns
48 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
49 | # set type of each column that is kept
50 | for i_col in df.columns:
51 | df = df.astype({i_col: field_types[i_col]})
52 | # timestamp
53 | df.time = pd.to_datetime(df.time, unit="ms", utc=True)
54 | except Exception as exc:
55 | logger.exception(f"Transformation of data: failed. {exc}")
56 | raise
57 | else:
58 | logger.info("Transformation of data: successful.")
59 | return df
60 |
61 |
62 | @task
63 | def insert_data(df: pd.DataFrame) -> None:
64 | """Inserts funding rate data to timescale."""
65 | try:
66 | conn_id = retrieve_conn_id()
67 | ingest_data(conn_id, TIMESCALE_FUNDING_FUTURE_TABLE_NAME, df)
68 | except Exception as exc:
69 | logger.exception(f"Insert data to timescale: failed. {exc}")
70 | raise
71 | else:
72 | logger.info(f"Insert data to timescale table {TIMESCALE_FUNDING_FUTURE_TABLE_NAME}: successful.")
73 |
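For orientation, a small sketch (not part of the repository) of the record shape that transform_data above consumes and the frame it produces; the raw field names follow FundingRate.get_rename_dict() in libs/venues/binance/types.py, while the sample values are invented:

    import pandas as pd

    # one raw funding-rate record as returned by the funding endpoint
    response = [{"symbol": "BTCUSDT",
                 "fundingTime": 1704067200000,   # ms since epoch (2024-01-01 00:00 UTC)
                 "fundingRate": "0.00010000",
                 "markPrice": "42000.00"}]

    df = (pd.DataFrame(response)
          .rename(columns={"fundingTime": "time",
                           "fundingRate": "funding_rate",
                           "markPrice": "ignored"})
          .drop(columns=["ignored"])
          .astype({"funding_rate": float}))
    df["time"] = pd.to_datetime(df["time"], unit="ms", utc=True)
    print(df.dtypes)  # symbol: object, time: datetime64[ns, UTC], funding_rate: float64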
--------------------------------------------------------------------------------
/part5/pipecraft/dags/binance_market_data/process/etl_kline.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from airflow.decorators import task
5 | from datetime import datetime, timedelta
6 | from typing import Optional, List
7 |
8 | from libs.airtasks.timescale import ingest_data, retrieve_conn_id
9 | from libs.venues import binance as binance_client
10 | from libs.venues.base import ContractType, Instrument
11 | from binance_market_data.config import TIMESCALE_KLINE_SPOT_TABLE_NAME, TIMESCALE_KLINE_FUTURE_TABLE_NAME
12 |
13 |
14 | # module logger
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | @task
19 | def fetch_data(auth: dict,
20 | instrument: Instrument,
21 | testnet: bool = False,
22 | data_interval_start: Optional[datetime] = None) -> List[list]:
23 | """Sends get request to fetch candlestick data for the previous hour."""
24 | fetch_data_map = {ContractType.spot: binance_client.fetch_spot_kline,
25 | ContractType.future: binance_client.fetch_future_kline}
26 | # reminder: data_interval_start will be set from airflow based on scheduler and schedule time!
27 | start_time = datetime(data_interval_start.year,
28 | data_interval_start.month,
29 | data_interval_start.day,
30 | data_interval_start.hour)
31 | end_time = start_time + timedelta(hours=1) - timedelta(minutes=1)
32 | # fetch candlestick data
33 | response = fetch_data_map[instrument.contract_type](auth=binance_client.BinanceAuth.from_dict(auth),
34 | symbol=instrument.symbol,
35 | start_time=start_time,
36 | end_time=end_time,
37 | testnet=testnet)
38 | return response
39 |
40 |
41 | @task
42 | def transform_data(response: list, symbol: str) -> pd.DataFrame:
43 | """Transforms the data and prepares to insert."""
44 | try:
45 | # process klines
46 | field_types = binance_client.Kline.get_field_types()
47 | df = pd.DataFrame(data=response, columns=list(field_types.keys()))
48 | # remove ignore columns
49 | df = df.drop(df.columns[df.columns.str.contains('ignore')], axis=1)
50 | # set type of each column that is kept
51 | for i_col in df.columns:
52 | df = df.astype({i_col: field_types[i_col]})
53 | # set time
54 | df.open_time = pd.to_datetime(df.open_time, unit="ms", utc=True)
55 | df.close_time = pd.to_datetime(df.close_time, unit="ms", utc=True)
56 | # add symbol column
57 | df["symbol"] = symbol
58 | except Exception as exc:
59 | logger.exception(f"Transformation of data: failed. {exc}")
60 | raise
61 | else:
62 | logger.info("Transformation of data: successful.")
63 | return df
64 |
65 |
66 | @task
67 | def insert_data(contract_type: ContractType, df: pd.DataFrame) -> None:
68 | """Inserts data to timescale."""
69 | timescale_schema_map = {ContractType.spot: TIMESCALE_KLINE_SPOT_TABLE_NAME,
70 | ContractType.future: TIMESCALE_KLINE_FUTURE_TABLE_NAME}
71 | table_name = timescale_schema_map[contract_type]
72 | try:
73 | conn_id = retrieve_conn_id()
74 | ingest_data(conn_id, table_name, df)
75 | except Exception as exc:
76 | logger.exception(f"Insert data to timescale: failed. {exc}")
77 | raise
78 | else:
79 | logger.info(f"Insert data to timescale table {table_name}: successful.")
80 |
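Similarly, a sketch (not part of the repository) of the raw kline row that transform_data above expects: one 12-element list per candle, in the column order given by the Kline dataclass in libs/venues/binance/types.py; the values are invented:

    import pandas as pd

    columns = ["open_time", "open", "high", "low", "close", "volume", "close_time",
               "quote_asset_volume", "number_of_trades",
               "taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignored"]
    row = [1704067200000, "42000.0", "42100.0", "41900.0", "42050.0", "12.3",
           1704067259999, "517000.0", 321, "6.1", "256000.0", "0"]

    df = pd.DataFrame([row], columns=columns).drop(columns=["ignored"])
    df["open_time"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)
    df["close_time"] = pd.to_datetime(df["close_time"], unit="ms", utc=True)
    df["symbol"] = "BTCUSDT"
    print(df[["symbol", "open_time", "close_time", "close"]])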
--------------------------------------------------------------------------------
/part5/pipecraft/dags/infopy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/infopy/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/infopy/dag_infopy.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 | from airflow.operators.bash import BashOperator
6 |
7 | from libs.airtasks.initial import start_task, end_task
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_infopy",
13 | description="Show all installed python packages.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - execute pip freeze
20 | pip_task = BashOperator(task_id="pip_task", bash_command='pip freeze')
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> pip_task >> end_dummy
25 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/__init__.py:
--------------------------------------------------------------------------------
1 | from . import venues
2 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/airtasks/__init__.py:
--------------------------------------------------------------------------------
1 | from .initial import start_task, end_task
2 | from . import timescale
3 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/airtasks/initial.py:
--------------------------------------------------------------------------------
1 | from airflow.operators.empty import EmptyOperator
2 | from typing import Optional
3 |
4 |
5 | def start_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
6 | tid = "start" if task_id is None else task_id
7 | return EmptyOperator(task_id=tid, **kwargs)
8 |
9 |
10 | def end_task(task_id: Optional[str] = None, **kwargs) -> EmptyOperator:
11 | tid = "end" if task_id is None else task_id
12 | return EmptyOperator(task_id=tid, **kwargs)
13 |
14 |
15 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/airtasks/timescale/__init__.py:
--------------------------------------------------------------------------------
1 | from .ingester import ingest_data
2 | from .conn import retrieve_conn_id
3 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/airtasks/timescale/conn.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.models import Variable
4 |
5 | # create module logger
6 | logger = logging.getLogger(__name__)
7 |
8 |
9 | def retrieve_conn_id(id_key: str = "admin") -> str:
10 | """Retrieves timescale connection id."""
11 | try:
12 | if id_key == "admin":
13 | conn_id = Variable.get("TIMESCALE_CONN_ID_ADMIN")
14 | elif id_key == "readonly":
15 | conn_id = Variable.get("TIMESCALE_CONN_ID_READONLY")
16 | else:
17 | raise ValueError("Unknown id_key. Select admin or readonly.")
18 | except Exception as exc:
19 |         logger.exception(f"Retrieving timescale connection id for '{id_key}': failed. {exc}.")
20 |         raise
21 |     else:
22 |         logger.info(f"Retrieving timescale connection id for '{id_key}': successful.")
23 | return conn_id
24 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/airtasks/timescale/ingester.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import pandas as pd
3 |
4 | from psycopg2.extras import execute_values
5 | from psycopg2.extensions import connection
6 | from airflow.providers.postgres.hooks.postgres import PostgresHook
7 |
8 | # create module logger
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def _bulk_insert(conn: connection, table_name: str, df_data: pd.DataFrame) -> None:
13 | """Bulk insert to timescale."""
14 | try:
15 | # create a list of tuples from dataframe
16 | data_tuples = [tuple(x) for x in df_data.to_numpy()]
17 | # comma-separated dataframe columns
18 | cols = ','.join(list(df_data.columns))
19 | # SQL query to execute
20 | query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
21 | with conn.cursor() as crs:
22 | execute_values(crs, query, data_tuples)
23 | conn.commit()
24 | except Exception as exc:
25 | logger.exception(f"Bulk insert: failed. {exc}.")
26 | raise
27 | else:
28 | logger.info("Bulk insert: successful.")
29 |
30 |
31 | def ingest_data(conn_id: str, table_name: str, df_data: pd.DataFrame) -> None:
32 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
33 | _bulk_insert(conn, table_name, df_data)
34 |
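For reference, a sketch (not part of the repository) of the INSERT statement that _bulk_insert composes before handing the row tuples to psycopg2's execute_values, shown for the funding hypertable used elsewhere in this part:

    # table and column names match the binance_funding_future hypertable
    table_name = "binance_funding_future"
    cols = ",".join(["time", "symbol", "funding_rate"])
    query = "INSERT INTO %s(%s) VALUES %%s" % (table_name, cols)
    print(query)  # INSERT INTO binance_funding_future(time,symbol,funding_rate) VALUES %s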
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/__init__.py:
--------------------------------------------------------------------------------
1 | from . import binance
2 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/base/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import Venue, VenueAuthentication, ContractType, Instrument, RequestResultLimit, VenueNet, MarketDataStructure
2 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/base/base.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from dataclasses import dataclass, fields
3 | from datetime import datetime
4 |
5 |
6 | class Venue(Enum):
7 | """Crypto venues."""
8 | binance = "binance"
9 |
10 |
11 | class VenueAuthentication:
12 | """Base class to authenticate at a venue."""
13 | pass
14 |
15 |
16 | class VenueNet(Enum):
17 | """Production vs test environment."""
18 | mainnet = "mainnet"
19 | testnet = "testnet"
20 |
21 |
22 | class ContractType(Enum):
23 | """The contract type of traded instrument."""
24 | spot = "spot"
25 | future = "future"
26 |
27 |
28 | @dataclass
29 | class Instrument:
30 | """The traded instrument."""
31 | symbol: str
32 | venue: Venue
33 | contract_type: ContractType
34 | first_date: datetime
35 |
36 |
37 | @dataclass
38 | class MarketDataStructure:
39 | """Base class for market data API responses."""
40 |
41 | @classmethod
42 | def get_field_types(cls) -> dict:
43 | return {field.name: field.type for field in fields(cls)}
44 |
45 |
46 | @dataclass
47 | class RequestResultLimit:
48 | """Default and maximum limit on result of an API market data request."""
49 | default: int
50 | max: int
51 |
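A small sketch (not part of the repository) of what get_field_types() returns for a MarketDataStructure subclass; Quote is a made-up example, and the import assumes the dags folder is on the import path:

    from dataclasses import dataclass

    from libs.venues.base import MarketDataStructure

    @dataclass
    class Quote(MarketDataStructure):  # hypothetical structure, for illustration only
        time: int
        bid: float
        ask: float

    # the ETL tasks feed this mapping straight into DataFrame.astype()
    print(Quote.get_field_types())  # {'time': <class 'int'>, 'bid': <class 'float'>, 'ask': <class 'float'>}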
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/binance/__init__.py:
--------------------------------------------------------------------------------
1 | from .common import BinanceAuth
2 | from .client import fetch_spot_kline, fetch_future_kline, fetch_funding_rate, ping_spot_api, ping_future_api
3 | from .config import *
4 | from .types import Kline, FundingRate
5 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/binance/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import logging
3 | from datetime import datetime
4 | from requests import Response, HTTPError
5 | from typing import Optional, Dict, Any, List
6 | from tenacity import retry, stop_after_attempt, wait_exponential
7 | from time import sleep
8 |
9 | from libs.venues.base.base import ContractType, VenueNet
10 | from libs.venues.binance.common import BinanceAuth, to_ms_int, prepare_binance_request_headers
11 | import libs.venues.binance.config as binance_config
12 |
13 | # create module logger
14 | logger = logging.getLogger(__name__)
15 | # log messages from requests above level warning
16 | logging.getLogger('urllib3').setLevel(logging.WARNING)
17 |
18 | # module constants
19 | _KLINE_INTERVAL: str = "1m"
20 | _RATE_LIMIT_SLEEPER_IN_SECS: int = 5*60
21 |
22 |
23 | def _get_base_url(contract_type: ContractType, testnet: bool) -> str:
24 | api_url_map: dict = {ContractType.spot: {VenueNet.testnet: binance_config.SPOT_TESTNET_URL,
25 | VenueNet.mainnet: binance_config.SPOT_MAINNET_URL},
26 | ContractType.future: {VenueNet.testnet: binance_config.FUT_TESTNET_URL,
27 | VenueNet.mainnet: binance_config.FUT_MAINNET_URL}}
28 | return api_url_map[contract_type][VenueNet.testnet if testnet else VenueNet.mainnet]
29 |
30 |
31 | def _get_kline_endpoint(contract_type: ContractType) -> str:
32 | kline_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_KLINE,
33 | ContractType.future: binance_config.FUT_ENDPOINT_KLINE}
34 | return kline_ep_map[contract_type]
35 |
36 |
37 | def _get_ping_endpoint(contract_type: ContractType) -> str:
38 | ping_ep_map: dict = {ContractType.spot: binance_config.SPOT_ENDPOINT_PING,
39 | ContractType.future: binance_config.FUT_ENDPOINT_PING}
40 | return ping_ep_map[contract_type]
41 |
42 |
43 | def _raise_for_status(response: Response) -> None:
44 | try:
45 | response.raise_for_status()
46 | except HTTPError as http_err:
47 | if response.status_code == 429:
48 |             logger.exception("Binance rate limit reached (HTTP 429). "
49 |                              f"Sleeping for {_RATE_LIMIT_SLEEPER_IN_SECS} seconds to avoid an IP ban.")
50 |             sleep(_RATE_LIMIT_SLEEPER_IN_SECS)
51 | logger.exception(http_err)
52 | raise
53 |
54 |
55 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, max=10))
56 | def _fetch_api_data(auth: BinanceAuth,
57 | base_url: str,
58 | endpoint: str,
59 | symbol: Optional[str] = None,
60 | start_time: Optional[datetime] = None,
61 | end_time: Optional[datetime] = None,
62 | kline_interval: Optional[str] = None,
63 |                     request_result_limit: Optional[int] = None,
64 | request_timeout_in_secs: int = 10) -> Any:
65 | """Market data fetcher for Binance API."""
66 | request_url: str = f"{base_url}{endpoint}"
67 | headers: dict = prepare_binance_request_headers(auth)
68 |
69 | # build request url, if necessary
70 | if symbol is not None:
71 | request_url += f"?symbol={symbol}"
72 | if start_time is not None:
73 | request_url += f"&startTime={to_ms_int(start_time)}"
74 | if end_time is not None:
75 | request_url += f"&endTime={to_ms_int(end_time)}"
76 | if kline_interval is not None:
77 | request_url += f"&interval={kline_interval}"
78 | if request_result_limit is not None:
79 | request_url += f"&limit={request_result_limit}"
80 | # send get request
81 | response = requests.get(request_url,
82 | headers=headers,
83 | timeout=request_timeout_in_secs)
84 | _raise_for_status(response)
85 | return response.json()
86 |
87 |
88 | def fetch_spot_kline(auth: BinanceAuth,
89 | symbol: str,
90 | start_time: datetime,
91 | end_time: datetime,
92 | request_result_limit: int = binance_config.SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default,
93 | testnet: bool = False) -> List[list]:
94 | """Fetches spot kline market data from Binance API."""
95 | return _fetch_api_data(auth=auth,
96 | base_url=_get_base_url(ContractType.spot, testnet),
97 | endpoint=_get_kline_endpoint(ContractType.spot),
98 | symbol=symbol,
99 | start_time=start_time,
100 | end_time=end_time,
101 | request_result_limit=request_result_limit,
102 | kline_interval=_KLINE_INTERVAL)
103 |
104 |
105 | def fetch_future_kline(auth: BinanceAuth,
106 | symbol: str,
107 | start_time: Optional[datetime] = None,
108 | end_time: Optional[datetime] = None,
109 | request_result_limit: int = binance_config.FUT_ENDPOINT_KLINE_RESULT_LIMIT.default,
110 | testnet: bool = False) -> List[list]:
111 | """Fetches future kline market data from Binance API."""
112 | return _fetch_api_data(auth=auth,
113 | base_url=_get_base_url(ContractType.future, testnet),
114 | endpoint=_get_kline_endpoint(ContractType.future),
115 | symbol=symbol,
116 | start_time=start_time,
117 | end_time=end_time,
118 | request_result_limit=request_result_limit,
119 | kline_interval=_KLINE_INTERVAL)
120 |
121 |
122 | def fetch_funding_rate(auth: BinanceAuth,
123 | symbol: str,
124 | start_time: Optional[datetime] = None,
125 | end_time: Optional[datetime] = None,
126 | request_result_limit: int = binance_config.FUT_FUNDING_RESULT_LIMIT.default,
127 | testnet: bool = False) -> List[Dict[str, Any]]:
128 | """Fetches funding rate market data from Binance API."""
129 | return _fetch_api_data(auth=auth,
130 | base_url=_get_base_url(ContractType.future, testnet),
131 | endpoint=binance_config.FUT_ENDPOINT_FUNDING,
132 | symbol=symbol,
133 | start_time=start_time,
134 | end_time=end_time,
135 | request_result_limit=request_result_limit)
136 |
137 |
138 | def ping_spot_api(auth: BinanceAuth, testnet: bool) -> dict:
139 | """Tests connectivity to spot Binance API."""
140 | return _fetch_api_data(auth=auth,
141 | base_url=_get_base_url(ContractType.spot, testnet),
142 | endpoint=binance_config.SPOT_ENDPOINT_PING)
143 |
144 |
145 | def ping_future_api(auth: BinanceAuth, testnet: bool) -> dict:
146 | """Tests connectivity to future Binance API."""
147 | return _fetch_api_data(auth=auth,
148 | base_url=_get_base_url(ContractType.future, testnet),
149 | endpoint=binance_config.FUT_ENDPOINT_PING)
150 |
151 |
152 | def fetch_spot_exchange_info() -> Dict[str, Any]:
153 | raise NotImplementedError
154 |
155 |
156 | def fetch_fut_exchange_info() -> Dict[str, Any]:
157 | raise NotImplementedError
158 |
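To make the request construction above concrete, the URL that _fetch_api_data ends up building for a mainnet spot kline call looks roughly like this (a sketch, not part of the repository; the timestamps are examples, the constants come from libs/venues/binance/config.py):

    base_url = "https://api.binance.com"   # binance_config.SPOT_MAINNET_URL
    endpoint = "/api/v3/klines"            # binance_config.SPOT_ENDPOINT_KLINE
    url = (f"{base_url}{endpoint}"
           f"?symbol=BTCUSDT"
           f"&startTime=1704067200000"
           f"&endTime=1704070740000"
           f"&interval=1m"                 # _KLINE_INTERVAL
           f"&limit=500")                  # SPOT_ENDPOINT_KLINE_RESULT_LIMIT.default
    print(url)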
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/binance/common.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, asdict
2 | from datetime import datetime, timezone
3 | from typing import Dict, Any
4 |
5 | from libs.venues.base.base import VenueAuthentication
6 |
7 |
8 | @dataclass
9 | class BinanceAuth(VenueAuthentication):
10 | BINANCE_API_KEY: str
11 |
12 | @classmethod
13 | def from_dict(cls, auth_dict: Dict[str, str]):
14 | return cls(auth_dict["BINANCE_API_KEY"])
15 |
16 | def as_dict(self) -> Dict[str, str]:
17 | return asdict(self)
18 |
19 |
20 | def to_ms_int(dt: datetime) -> int:
21 | """Converts datetime timestamp to integer in ms."""
22 | return int(round(dt.timestamp() * 1000))
23 |
24 |
25 | def to_dt(ms_int: int) -> datetime:
26 | """Converts timestamp in ms (integer) to datetime."""
27 | return datetime.utcfromtimestamp(ms_int / 1000).replace(tzinfo=timezone.utc)
28 |
29 |
30 | def prepare_binance_request_headers(auth: BinanceAuth) -> Dict[str, Any]:
31 | """Creates headers for Binance REST API."""
32 | return {"content-type": "application/json", "X-MBX-APIKEY": auth.BINANCE_API_KEY}
33 |
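A quick round-trip check (a sketch, not part of the repository) of the two timestamp helpers above, assuming the module is importable, e.g. from inside the Airflow container:

    from datetime import datetime, timezone

    from libs.venues.binance.common import to_ms_int, to_dt

    dt = datetime(2024, 1, 1, tzinfo=timezone.utc)
    assert to_ms_int(dt) == 1704067200000
    assert to_dt(1704067200000) == dt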
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/binance/config.py:
--------------------------------------------------------------------------------
1 | from libs.venues.base.base import RequestResultLimit
2 |
3 |
4 | # spot base
5 | # https://binance-docs.github.io/apidocs/spot/en/#general-info
6 | SPOT_MAINNET_URL: str = "https://api.binance.com"
7 | SPOT_TESTNET_URL: str = "https://testnet.binance.vision"
8 | SPOT_REQUEST_RATE_LIMIT: int = 6000
9 | SPOT_REQUEST_INTERVAL_IN_MIN: int = 1
10 |
11 | # spot ping
12 | # https://binance-docs.github.io/apidocs/spot/en/#test-connectivity
13 | SPOT_ENDPOINT_PING: str = "/api/v3/ping"
14 | SPOT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
15 |
16 | # spot exchange info
17 | # https://binance-docs.github.io/apidocs/spot/en/#exchange-information
18 | SPOT_ENDPOINT_EXCHANGE_INFO: str = "/api/v3/exchangeInfo"
19 | SPOT_ENDPOINT_EXCHANGE_INFO_REQUEST_WEIGHT: int = 20
20 |
21 | # spot kline
22 | # https://binance-docs.github.io/apidocs/spot/en/#kline-candlestick-data
23 | SPOT_ENDPOINT_KLINE: str = "/api/v3/klines"
24 | SPOT_ENDPOINT_KLINE_REQUEST_WEIGHT: int = 2
25 | SPOT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1000)
26 |
27 | # futures base
28 | # https://binance-docs.github.io/apidocs/futures/en/#general-info
29 | FUT_MAINNET_URL: str = "https://fapi.binance.com"
30 | FUT_TESTNET_URL: str = "https://testnet.binancefuture.com"
31 | FUT_REQUEST_RATE_LIMIT: int = 2400
32 | FUT_REQUEST_INTERVAL_IN_MIN: int = 1
33 |
34 | # future ping
35 | # https://binance-docs.github.io/apidocs/futures/en/#test-connectivity
36 | FUT_ENDPOINT_PING: str = "/fapi/v1/ping"
37 | FUT_ENDPOINT_PING_REQUEST_WEIGHT: int = 1
38 |
39 | # future exchangeInfo
40 | # https://binance-docs.github.io/apidocs/futures/en/#exchange-information
41 | FUT_ENDPOINT_EXCHANGEINFO: str = "/fapi/v1/exchangeInfo"
42 | FUT_ENDPOINT_EXCHANGEINFO_REQUEST_WEIGHT: int = 1
43 |
44 | # future funding rate
45 | # https://binance-docs.github.io/apidocs/futures/en/#get-funding-rate-history
46 | FUT_ENDPOINT_FUNDING: str = "/fapi/v1/fundingRate"
47 | FUT_FUNDING_REQUEST_RATE_LIMIT: int = 500
48 | FUT_FUNDING_REQUEST_INTERVAL_IN_MIN: int = 5
49 | FUT_FUNDING_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(100, 1000)
50 | FUT_FUNDING_REQUEST_WEIGHT: int = 1 # assumption
51 |
52 | # future kline
53 | # https://binance-docs.github.io/apidocs/futures/en/#kline-candlestick-data
54 | FUT_ENDPOINT_KLINE: str = "/fapi/v1/klines"
55 | FUT_ENDPOINT_KLINE_RESULT_LIMIT: RequestResultLimit = RequestResultLimit(500, 1500)
56 |
57 |
58 | def fut_endpoint_kline_request_weight(request_result_limit: int) -> int:
59 | """Returns the weight conditional on the request result limit."""
60 |     if 1 <= request_result_limit < 100:
61 |         weight = 1
62 |     elif 100 <= request_result_limit < 500:
63 |         weight = 2
64 |     elif 500 <= request_result_limit < 1000:
65 | weight = 5
66 | else:
67 | weight = 10
68 | return weight
69 |
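And a quick check (a sketch, not part of the repository) of the weight tiers implemented by fut_endpoint_kline_request_weight above:

    from libs.venues.binance.config import fut_endpoint_kline_request_weight

    assert fut_endpoint_kline_request_weight(99) == 1
    assert fut_endpoint_kline_request_weight(499) == 2
    assert fut_endpoint_kline_request_weight(999) == 5
    assert fut_endpoint_kline_request_weight(1500) == 10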
--------------------------------------------------------------------------------
/part5/pipecraft/dags/libs/venues/binance/types.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Any
3 |
4 | from libs.venues.base.base import MarketDataStructure
5 |
6 |
7 | @dataclass
8 | class Kline(MarketDataStructure):
9 | open_time: int
10 | open: float
11 | high: float
12 | low: float
13 | close: float
14 | volume: float
15 | close_time: int
16 | quote_asset_volume: float
17 | number_of_trades: int
18 | taker_buy_base_asset_volume: float
19 | taker_buy_quote_asset_volume: float
20 | ignored: Any
21 |
22 |
23 | @dataclass
24 | class FundingRate(MarketDataStructure):
25 | symbol: str
26 | time: int
27 | funding_rate: float
28 | ignored: Any
29 |
30 | @staticmethod
31 | def get_rename_dict() -> dict:
32 | return {"symbol": "symbol",
33 | "fundingTime": "time",
34 | "fundingRate": "funding_rate",
35 | "markPrice": "ignored"}
36 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/dags/timescale_init/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/dag_timescale_roles.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_roles
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_roles",
13 | description="Timescale initialization pipeline for creating user roles.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create read only user role
20 | roles = create_roles("dags/timescale_init/process/create_roles.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> roles >> end_dummy
25 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/dag_timescale_tables.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from datetime import datetime, timezone
4 | from airflow import DAG
5 |
6 | from libs.airtasks.initial import start_task, end_task
7 | from timescale_init.process import create_tables
8 |
9 | # create module logger
10 | logger = logging.getLogger(__name__)
11 |
12 | with DAG(dag_id="0_timescale_create_tables",
13 | description="Timescale initialization pipeline for creating hypertables.",
14 | start_date=datetime(2024, 1, 1, tzinfo=timezone.utc),
15 | catchup=False,
16 | schedule_interval=None) as dag:
17 | # - create start task
18 | start_dummy = start_task()
19 | # - create hypertables
20 | tables = create_tables("dags/timescale_init/process/create_hypertables.sql")
21 | # - create end task
22 | end_dummy = end_task()
23 |
24 | start_dummy >> tables >> end_dummy
25 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/process/__init__.py:
--------------------------------------------------------------------------------
1 | from .tsinit import create_roles, create_tables
2 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/process/create_hypertables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS binance_kline_spot (
2 | open_time TIMESTAMPTZ,
3 | symbol TEXT NOT NULL,
4 | open DOUBLE PRECISION,
5 | high DOUBLE PRECISION,
6 | low DOUBLE PRECISION,
7 | close DOUBLE PRECISION,
8 | volume DOUBLE PRECISION,
9 | close_time TIMESTAMPTZ,
10 | quote_asset_volume DOUBLE PRECISION,
11 | number_of_trades BIGINT,
12 | taker_buy_base_asset_volume DOUBLE PRECISION,
13 | taker_buy_quote_asset_volume DOUBLE PRECISION
14 | );
15 | SELECT create_hypertable('binance_kline_spot', 'open_time', if_not_exists => TRUE);
16 | CREATE INDEX IF NOT EXISTS idx_symbol_time_spot ON binance_kline_spot (symbol, open_time DESC);
17 |
18 | CREATE TABLE IF NOT EXISTS binance_kline_future (
19 | open_time TIMESTAMPTZ,
20 | symbol TEXT NOT NULL,
21 | open DOUBLE PRECISION,
22 | high DOUBLE PRECISION,
23 | low DOUBLE PRECISION,
24 | close DOUBLE PRECISION,
25 | volume DOUBLE PRECISION,
26 | close_time TIMESTAMPTZ,
27 | quote_asset_volume DOUBLE PRECISION,
28 | number_of_trades BIGINT,
29 | taker_buy_base_asset_volume DOUBLE PRECISION,
30 | taker_buy_quote_asset_volume DOUBLE PRECISION
31 | );
32 | SELECT create_hypertable('binance_kline_future', 'open_time', if_not_exists => TRUE);
33 | CREATE INDEX IF NOT EXISTS idx_symbol_time_future ON binance_kline_future (symbol, open_time DESC);
34 |
35 |
36 | CREATE TABLE IF NOT EXISTS binance_funding_future (
37 | time TIMESTAMPTZ,
38 | symbol TEXT NOT NULL,
39 | funding_rate DOUBLE PRECISION
40 | );
41 | SELECT create_hypertable('binance_funding_future', 'time', if_not_exists => TRUE);
42 | CREATE INDEX IF NOT EXISTS idx_symbol_time_funding_future ON binance_funding_future (symbol, time DESC);
43 |
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/process/create_roles.sql:
--------------------------------------------------------------------------------
1 | CREATE ROLE readaccess;
2 | GRANT USAGE ON SCHEMA public TO readaccess;
3 | GRANT SELECT ON ALL TABLES IN SCHEMA public TO readaccess;
4 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO readaccess;
5 | CREATE USER {TIMESCALE_READONLY_USERNAME} WITH PASSWORD {TIMESCALE_READONLY_PASSWORD};
6 | GRANT readaccess TO {TIMESCALE_READONLY_USERNAME};
--------------------------------------------------------------------------------
/part5/pipecraft/dags/timescale_init/process/tsinit.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from airflow.providers.postgres.hooks.postgres import PostgresHook
4 | from psycopg2 import sql
5 | from psycopg2.sql import Composable
6 | from airflow.models import Variable
7 | from airflow.decorators import task
8 | from typing import Union
9 |
10 | from libs.airtasks.timescale import retrieve_conn_id
11 |
12 | # create module logger
13 | logger = logging.getLogger(__name__)
14 |
15 |
16 | def _read_sql(path: str) -> str:
17 | """Reads an sql script."""
18 | try:
19 | with open(path, "r") as sql_script:
20 | sql_cmd_str = sql_script.read()
21 | except Exception as exc:
22 | logger.exception(f"Could not read sql file. {exc}")
23 | raise
24 | else:
25 | logger.info(f"Read sql file successfully.")
26 | return sql_cmd_str
27 |
28 |
29 | def _get_roles_sql(path_str: str) -> Composable:
30 | """Constructs the sql script for creating roles."""
31 | # read file
32 | sql_cmd_str = _read_sql(path_str)
33 | try:
34 | # replace dummy variables with environmental variables
35 | sql_cmd = sql.SQL(sql_cmd_str).format(
36 | TIMESCALE_READONLY_USERNAME=sql.Identifier(Variable.get("TIMESCALE_READONLY_USERNAME")),
37 | TIMESCALE_READONLY_PASSWORD=sql.Literal(Variable.get("TIMESCALE_READONLY_PASSWORD"))
38 | )
39 |         # note: do not log the readonly credentials themselves; secret values
40 |         #       retrieved from Airflow Variables would end up in the task logs in plain text
41 | except Exception as exc:
42 | logger.exception(f"Get create roles sql statement: failed. {exc}")
43 | raise
44 | else:
45 | logger.info("Get create roles sql statement: successful.")
46 | return sql_cmd
47 |
48 |
49 | def _execute_sql(conn_id: str, sql_cmd: Union[str, Composable]) -> None:
50 | try:
51 | with PostgresHook(postgres_conn_id=conn_id).get_conn() as conn:
52 | logger.info(f"Executing query. {sql_cmd if isinstance(sql_cmd, str) else sql_cmd.as_string(conn)}")
53 | with conn.cursor() as crs:
54 | # execute sql
55 | crs.execute(sql_cmd)
56 | # commit
57 | conn.commit()
58 | except Exception as exc:
59 | logger.exception(f"Executing query: failed. {exc}")
60 | raise
61 | else:
62 | logger.info(f"Executing query: successful.")
63 |
64 |
65 | @task
66 | def create_roles(path_str: str) -> None:
67 | """Creates roles."""
68 | _execute_sql(retrieve_conn_id(), _get_roles_sql(path_str))
69 |
70 |
71 | @task
72 | def create_tables(path_str: str) -> None:
73 | """Creates hypertables."""
74 | _execute_sql(retrieve_conn_id(), _read_sql(path_str))
75 |
76 |
--------------------------------------------------------------------------------
/part5/pipecraft/pipecraft.Dockerfile:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:2.8.1-python3.11
2 |
3 | # install additional requirements (if needed)
4 | # COPY requirements.txt /
5 | # RUN pip install --no-cache-dir -r /requirements.txt
6 |
7 | # set environment variables for Airflow
8 | ENV AIRFLOW__CORE__EXECUTOR=LocalExecutor
9 | ENV AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS=false
10 | ENV AIRFLOW__CORE__LOAD_EXAMPLES=false
11 | ENV AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION=true
12 | ENV AIRFLOW__LOGGING__LOGGING_LEVEL=INFO
13 |
14 | # copy DAGs and other configurations into the image
15 | COPY ./dags /opt/airflow/dags
16 | COPY ./config /opt/airflow/config
17 | COPY ./plugins /opt/airflow/plugins
18 | COPY ./scripts /opt/airflow/scripts
19 |
20 | # connect docker image to your repo (not required)
21 | # LABEL org.opencontainers.image.source https://github.com/bylethquant/substack-data-infra
22 |
23 | # expose port 8080 for the Airflow UI
24 | EXPOSE 8080
25 |
--------------------------------------------------------------------------------
/part5/pipecraft/pipecraft_build_and_push.sh:
--------------------------------------------------------------------------------
1 | # define the image name, tag, and dockerfile name
2 | CONTAINER_REGISTRY="ghcr.io/bylethquant/"
3 | IMAGE_NAME="sds-pipecraft"
4 | TAG="latest"
5 | DOCKERFILE_NAME="pipecraft.Dockerfile"
6 |
7 | # build the docker image
8 | docker build -t $CONTAINER_REGISTRY$IMAGE_NAME:$TAG -f $DOCKERFILE_NAME .
9 |
10 | # push the docker image to the repository
11 | docker push $CONTAINER_REGISTRY$IMAGE_NAME:$TAG
12 |
--------------------------------------------------------------------------------
/part5/pipecraft/plugins/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bylethquant/simple-data-stack/b6b107c0b1db37a5d003122b75da4017d6af56f8/part5/pipecraft/plugins/__init__.py
--------------------------------------------------------------------------------
/part5/pipecraft/scripts/entry_init.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | airflow db migrate
4 |
5 | airflow users create \
6 | --username "${_AIRFLOW_WWW_USER_USERNAME}" \
7 | --firstname "${_AIRFLOW_WWW_USER_FIRSTNAME}" \
8 | --lastname "${_AIRFLOW_WWW_USER_LASTNAME}" \
9 | --role "${_AIRFLOW_WWW_USER_ROLE}" \
10 | --email "${_AIRFLOW_WWW_USER_EMAIL}" \
11 | --password "${_AIRFLOW_WWW_USER_PASSWORD}" || true
12 |
13 | echo "Airflow database initialization completed."
14 |
--------------------------------------------------------------------------------
/part5/pipecraft/scripts/gen_fernet_key.py:
--------------------------------------------------------------------------------
1 | from cryptography.fernet import Fernet
2 |
3 |
4 | def get_fernet_key():
5 | """Generates a fernet key."""
6 | return Fernet.generate_key().decode()
7 |
8 |
9 | def main():
10 | print(get_fernet_key())
11 |
12 |
13 | if __name__ == "__main__":
14 | main()
15 |
--------------------------------------------------------------------------------
/part5/requirements.txt:
--------------------------------------------------------------------------------
1 | cryptography~=42.0.5
2 | apache-airflow~=2.8.1
3 | apache-airflow-providers-postgres~=5.10.0
4 | numpy~=1.24.4
5 | pandas~=2.0.3
6 | psycopg2-binary~=2.9.7
7 | requests~=2.31.0
8 | tenacity~=8.2.3
--------------------------------------------------------------------------------