├── .gitignore
├── LICENSE
├── README.md
├── images
    ├── benchmarks.png
    ├── dashboard.png
    ├── demo.gif
    ├── flow.png
    ├── s3.png
    ├── serverless.png
    └── streamlit.png
└── src
    ├── Makefile
    ├── benchmark.py
    ├── dashboard
        ├── dashboard.py
        └── dbt
        │   ├── analysis
        │       └── .gitkeep
        │   ├── dbt_project.yml
        │   ├── macros
        │       └── .gitkeep
        │   ├── models
        │       └── taxi
        │       │   ├── top_pickup_locations.sql
        │       │   └── trips_by_pickup_location.sql
        │   ├── snapshots
        │       └── .gitkeep
        │   └── tests
        │       └── .gitkeep
    ├── data
        └── .gitkeep
    ├── local.env
    ├── package-lock.json
    ├── package.json
    ├── quack.py
    ├── requirements.txt
    ├── run_me_first.py
    ├── serverless.yml
    └── serverless
        ├── Dockerfile
        └── app.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | *.parquet
132 | 
133 | .serverless/
134 | node_modules/
135 | 
136 | .DS_Store
137 | .node-version
138 | .python-version


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Bauplan Labs
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Quack-reduce
  2 | A playground for running duckdb as a stateless query engine over a data lake. 
  3 | The idea is to have a zero-maintenance, [very fast](https://www.loom.com/share/96f1fd938c814d0a825facb215546f03) and almost free data engine for small analytics apps. 
  4 | This repo is the companion code for this [blog post](https://towardsdatascience.com/a-serverless-query-engine-from-spare-parts-bd6320f10353).
  5 | 
  6 | Please refer to the blog post for more background information and details on the use case.
  7 | 
  8 | ## Quick Start ..ε=(｡ﾉ･ω･)ﾉ 
  9 | 
 10 | If you read the [blog post](https://towardsdatascience.com/a-serverless-query-engine-from-spare-parts-bd6320f10353) and know already what we are up to, follow the quick setup steps below to run everything in no time.
 11 | 
 12 | ### Setup your account
 13 | 
 14 | Make sure you have:
 15 | 
 16 | - A working AWS account and an access key with [sufficient privileges to deploy a lambda instance](https://www.serverless.com/framework/docs/providers/aws/guide/credentials) -- this could be the `AdministratorAccess` policy in AWS IAM, or something more fine-grained;
 17 | - [Docker](https://docs.docker.com/get-docker/) installed and running on your machine;
 18 | - Python 3.9+ and Node.js properly installed on your machine;
 19 | - A `profiles.yml` file on your local machine to run the dbt project.
 20 | 
 21 | In the `src` folder, you should copy `local.env` to `.env` (do *not* commit it) and fill it with proper values:
 22 | 
 23 | | value                 | type | description                                          |                   example |
 24 | |-----------------------|------|------------------------------------------------------|--------------------------:|
 25 | | AWS_ACCESS_KEY_ID     | str  | User key for AWS access                              | AKIAIO...                 |
 26 | | AWS_SECRET_ACCESS_KEY | str  | Secret key for AWS access                            | wJalr/...                 |
 27 | | AWS_DEFAULT_REGION    | str  | AWS region for the deployment                        | us-east-1                 |
 28 | | S3_BUCKET_NAME        | str  | Bucket to host the data (must be unique)             | my-duck-bucket-130dcqda0u |
 29 | 
 30 | These variables will be used by the setup script and the runner to communicate with AWS services. Make sure the user has the permissions to:
 31 | 
 32 | - create a bucket and upload files to it;
 33 | - invoke the lambda below.
 34 | 
 35 | ### Run the project
 36 | From the `src` folder:
 37 | 
 38 | >**1. Create the DuckDB Lambda:** run `make nodejs-init` and then `make serverless-deploy`.  Note that `src/serverless.yml` is configured to use `arm64`. This does a local Docker build, so if you're on an `x86_64` machine, it will fail.  Replace `arm64` with `x86_64` as needed.  After deployment, you can test the lambda is working from the [console](https://www.loom.com/share/97785a387af84924b830b9e0f35d8a1e).
 39 | 
 40 | >**2. Build the Python env:** run `make python-init`.
 41 | 
 42 | >**3. Download the data and upload it to S3:** run `make run_me_first` (check your S3 bucket and make sure you find a `partitioned` folder with [this structure](images/s3.png)).
 43 | 
 44 | >**4. Test the serverless query engine:** run `make test`.
 45 | 
 46 | >**5. Set up your dbt profile:** to run dbt locally, set up a dbt [profile](https://docs.getdbt.com/docs/core/connection-profiles) named `duckdb-taxi` (see [here](https://github.com/jwills/dbt-duckdb) for examples):
 47 | ```yaml
 48 | # ~/.dbt/profiles.yml
 49 | duckdb-taxi:
 50 |   outputs:
 51 |    dev:
 52 |      type: duckdb
 53 |      path: ':memory:'
 54 |      extensions:
 55 |         - httpfs
 56 |         - parquet
 57 |   target: dev
 58 | ```
 59 | >**6. Run the dbt project:** run `make dbt-run`.
 60 | 
 61 | >**7. Run the Analytics app:** run `make dashboard`.
 62 | 
 63 | <img src="images/demo.gif" width="448">
 64 | 
 65 | Note that every time the input field in the dashboard changes, we run a full round-trip on our engine in the back: it can be *this* fast!
 66 | 
 67 | ***
 68 | 
 69 | ## Project Overview
 70 | 
 71 | If you want to give it a try, follow the instructions in the `Quick Start` section above to get the system up and running. The rest of the README explores in more details the various components:
 72 | 
 73 | * the lambda function running the queries;
 74 | * interactions from a local script;
 75 | * a serverless BI application;
 76 | * running massive parallel workloads through the engine.
 77 | 
 78 | <img src="images/flow.png" width="448">
 79 | 
 80 | > NOTE: this project (including this README!) is written for pedagogical purposes and it is not production-ready (or even well tested!): our main goal is to provide a reference implementation of few key concepts as a starting point for future projects - so, sorry for being a bit verbose and perhaps pedantic at times.
 81 | 
 82 | ### Duckdb lambda
 83 | 
 84 | The `src/serverless` folder is a standard [serverless](https://www.serverless.com/framework/) project, to build an AWS lambda function with Duckdb on it. It has three main components:
 85 | 
 86 | - a Dockerfile, which starts from the public AWS lambda image for Python (`public.ecr.aws/lambda/python:3.9`) and adds the few dependencies we need;
 87 | - an `app.py` file, containing the actual code our lambda will execute;
 88 | - a `../serverless.yml` file, which ties all these things together in the infra-as-code fashion, and allows us to deploy and manage the function from the CLI.
 89 | 
 90 | The cloud setup is done for you when you run `make nodejs-init` and  `make serverless-deploy` (Step 1 in the setup list above). The first time, deployment will take a while as it needs to create the image, ship it to AWS and [create the stack](images/serverless.png) - note that this is _a "one-off" thing_.
 91 | 
 92 | > NOTE: you may get a `403 Forbidden` error when building the image: in our experience, this usually goes away with `aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws`.
 93 | 
 94 | ### Interacting with the engine
 95 | 
 96 | We can use a simple Python script to interact with our engine. First, we can test the system with a hard-coded query. Make sure you run Step 2 and Step 3 in the quick start list (to setup Python and the dataset): now, we can test everything is working with `make test`.
 97 | 
 98 | If all looks good, you can now run arbitrary queries (replacing `MY_BUCKET_NAME` with your value) just by using the provided `python quack.py` script; make sure to manually activate your venv with `source ./.venv/bin/activate`, e.g. you can run
 99 | 
100 | `python quack.py -q "SELECT pickup_location_id AS location_id, COUNT(*) AS counts FROM read_parquet(['s3://MY_BUCKET_NAME/dataset/taxi_2019_04.parquet']) WHERE pickup_at >= '2019-04-01' AND pickup_at < '2019-04-03' GROUP BY 1 ORDER BY 2 DESC"`
101 | 
102 | to get the most popular pickup location (IDs) for the first few days of April. 
103 | 
104 | Since the amount of data that can be returned by a lambda is limited, the lambda will automatically limit your rows if you don't specify a limit in the script. You can get more data back with:
105 | 
106 | `python quack.py -q ... -limit 100`
107 | 
108 | but be mindful of the infrastructure constraints!
109 | 
110 | ### Serverless BI architecture (Optional)
111 | 
112 | If you want to see how this architecture can bridge the gap between offline pipelines preparing artifacts, and real-time querying for BI (or other use cases), you can simulate how a dbt project may prepare a view that is queryable in a dashboard, through our engine (check our blog post for some more context on this use case).
113 | 
114 | The quickest setup is running dbt locally, so you need to set up a dbt [profile](https://docs.getdbt.com/docs/core/connection-profiles) named `duckdb-taxi` (see [here](https://github.com/jwills/dbt-duckdb) for examples):
115 | 
116 | ```yaml
117 | # ~/.dbt/profiles.yml
118 | duckdb-taxi:
119 |   outputs:
120 |    dev:
121 |      type: duckdb
122 |      path: ':memory:'
123 |      extensions:
124 |         - httpfs
125 |         - parquet
126 |   target: dev
127 | ```
128 | 
129 | > NOTE: since we run dbt through `make`, there is no need to add credentials to the `extensions`. If you prefer to run it manually, your dbt profile should look more like this:
130 | 
131 | ```yaml
132 | # ~/.dbt/profiles.yml
133 | duckdb-taxi:
134 |   outputs:
135 |    dev:
136 |      type: duckdb
137 |      path: ':memory:'
138 |      extensions:
139 |         - httpfs
140 |         - parquet
141 |      settings:
142 |         s3_region: us-east-1
143 |         s3_access_key_id: YOUR_S3_USER
144 |         s3_secret_access_key: YOUR_S3_KEY
145 |   target: dev
146 | ```
147 | 
148 | After the dbt setup is completed, you can use again the `make` file to run a "batch pipeline" that produces an artifact in S3 from raw data: just type `make dbt-run` to materialize our view as a parquet file:
149 | 
150 | <img src="images/dashboard.png" width="448">
151 | 
152 | > NOTE: different warehouses would need different configurations to export the node to the same location, e.g. [Snowflake](https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet). 
153 | 
154 | To run the front-end (a dashboard built with streamlit querying the view we materialized) run `make dashboard`. A page should open in the browser, displaying a chart:
155 | 
156 | <img src="images/streamlit.png" width="448">
157 | 
158 | You can use the form to interact in real time with the dataset (video [here](https://www.loom.com/share/9d5de3ba822a445d9d117225c1b0307f)), through the serverless infrastructure we built.
159 | 
160 | ### From quack to quack-reduce (Optional)
161 | 
162 | The stateless execution of SQL over an object storage (and therefore, using duckdb not really as a db, but basically as "just" a query engine) coupled with the parallel nature of AWS lambdas opens up interesting optimization possibilities.
163 | 
164 | In particular, we could rephrase (some) SQL queries through a map-reduce programming pattern *with other SQL queries*, and execute them all at the same time. To consider a trivial example, a query such as:
165 | 
166 | `SELECT COUNT(*) FROM myTable WHERE DATE BETWEEN 04/01/2022 AND 04/05/2022`
167 | 
168 | can be rewritten as the SUM of the results of these smaller queries:
169 | 
170 | `SELECT COUNT(*) FROM myTable WHERE DATE BETWEEN 04/01/2022 AND 04/02/2022` +
171 | `SELECT COUNT(*) FROM myTable WHERE DATE BETWEEN 04/02/2022 AND 04/03/2022` +
172 | ...
173 | 
174 | As the number of files increases (as in a typical hive-partitioned data lake), scanning the object storage (in duckdb syntax `parquet_scan('folder/', HIVE_PARTITIONING=1)`) may take much longer than reading single _k_ files directly through ideally _k_ parallel functions, drastically improving query performances.
175 | 
176 | To test out this hypothesis, we built a script that compares the same engine across different deployment patterns - local, remote etc. You can run the benchmarks with default values with `make benchmark`. The script is minimal, but should be enough to give you a feeling of how the different setups perform compared to each other, and the trade-offs involved (check the code for how it's built, but don't expect much!).
177 | 
178 | [A typical run](https://www.loom.com/share/18a060b89a6a4f6d814e06ffa2674b13) will result in something like this [table](images/benchmarks.png) (numbers will vary).
179 | 
180 | Please refer to the blogpost for more musings on this opportunity (and the non-trivial associated challenges).
181 | 
182 | > NOTE: if you have never raised your concurrency limits on AWS lambda, you may need to request an increase in parallel executions through the console, otherwise AWS will not allow the function to scale out.
183 | 
184 | ## What's next?
185 | 
186 | If you like what you've seen so far, you may wonder what you could do next! There's a million ways to improve this design, some more obvious than others - as a non-exhaustive list ("left as an exercise to the reader"), this is where we would start:
187 | 
188 | * if you always query the same table (say, a view for your dashboard), you may want to leverage the `cold` / `warm` pattern in the lambda code to store the table in memory when cold, and read from there (instead of parquet) when warm;
189 | 
190 | * when you move from one file to multiple files, scanning parquet folders is a huge overhead: wouldn't it be nice to know where to look? While HIVE partitioning is great, modern table formats (e.g. Iceberg) are even better, so you could think of combining their table scan properties with our serverless engine. Performance aside, if you have run queries through `quack.py`, you know how tedious it is to remember the full file name every time: leveraging catalogs like Iceberg, Glue, Nessie etc. would make the experience more "database-like";
191 | 
192 | * try out other use cases! For example, consider this recent [event collection](https://github.com/fal-ai/fal-events) platform. If you modify it to a dump-to-s3-then-query pattern (leveraging the engine we built with this repo), you end up with a lambda-only version of the [Snowflake architecture](https://github.com/jacopotagliabue/paas-data-ingestion) we open sourced some time ago - an end-to-end analytics platform running without servers;
193 | 
194 | * while we now run the query in memory and return a subset of rows from the lambda, this pattern is certainly not perfect: on the one hand, sometimes we may wish to write back the result of a query (dbt-style, so to speak); on the other, even if analytics queries are often aggregates, result tables may still grow big (row-wise): writing them to s3 and having the client stream back rows from there may be a nice feature to add!
195 | 
196 | ## License
197 | 
198 | All the code is released without warranty, "as is" under an MIT License.
199 | 
200 | This started as a fun week-end project and should be treated with the appropriate sense of humour.
201 | 


--------------------------------------------------------------------------------
/images/benchmarks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/benchmarks.png


--------------------------------------------------------------------------------
/images/dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/dashboard.png


--------------------------------------------------------------------------------
/images/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/demo.gif


--------------------------------------------------------------------------------
/images/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/flow.png


--------------------------------------------------------------------------------
/images/s3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/s3.png


--------------------------------------------------------------------------------
/images/serverless.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/serverless.png


--------------------------------------------------------------------------------
/images/streamlit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/images/streamlit.png


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
# Task runner for the project: each target below wraps one step of the setup
# (Node/serverless deployment, Python environment, data preparation, tests,
# benchmark, dbt and the dashboard). All Python targets activate ./.venv first.

# Load the variables defined in .env as make variables (S3_BUCKET_NAME is
# referenced by several recipes below).
include .env

# need bash because we use the "source" command (otherwise it fails when make defaults to /bin/sh)
SHELL=bash

# Install the Node.js dependencies.
nodejs-init:
	npm install
.PHONY: nodejs-init

# Deploy the project with the serverless CLI.
serverless-deploy:
	npx serverless deploy
.PHONY: serverless-deploy

# Create the virtualenv in ./.venv and install the Python requirements into it.
python-init:
	python3 -m venv ./.venv && source ./.venv/bin/activate && pip install -r requirements.txt
.PHONY: python-init

# Run the setup script (run_me_first.py) inside the virtualenv.
run_me_first:
	source ./.venv/bin/activate && python3 run_me_first.py
.PHONY: run_me_first

# Run quack.py with no arguments.
test:
	source ./.venv/bin/activate && python3 quack.py
.PHONY: test

# Run quack.py with an explicit query: pickup counts per location for
# 2019-04-01 and 2019-04-02, read from the dataset file in S3_BUCKET_NAME.
test-distinct:
	source ./.venv/bin/activate && python3 quack.py -q "SELECT pickup_location_id AS location_id, COUNT(*) AS counts FROM read_parquet(['s3://${S3_BUCKET_NAME}/dataset/taxi_2019_04.parquet']) WHERE pickup_at >= '2019-04-01' AND pickup_at < '2019-04-03' GROUP BY 1 ORDER BY 2 DESC"
.PHONY: test-distinct

# Run the benchmark script with its default arguments.
benchmark:
	source ./.venv/bin/activate && python3 benchmark.py
.PHONY: benchmark

# Run the dbt project in dashboard/dbt, passing S3_BUCKET_NAME through the environment.
dbt-run:
	source ./.venv/bin/activate && cd dashboard/dbt && S3_BUCKET_NAME=${S3_BUCKET_NAME} dbt run
.PHONY: dbt-run

# Generate and then serve the dbt docs; note that S3_BUCKET_NAME is set only
# for the `dbt docs generate` command, not for `dbt docs serve`.
dbt-docs:
	source ./.venv/bin/activate && cd dashboard/dbt && S3_BUCKET_NAME=${S3_BUCKET_NAME} dbt docs generate && dbt docs serve
.PHONY: dbt-docs

# Start the streamlit dashboard from the dashboard folder.
dashboard:
	source ./.venv/bin/activate && cd dashboard && streamlit run dashboard.py
.PHONY: dashboard
45 | 


--------------------------------------------------------------------------------
/src/benchmark.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | This Python script benchmarks the performance of running queries using duckdb as engine and 
  4 | an object storage as data source. 
  5 | 
  6 | The script is minimal and not very configurable, but should be enough to give you a 
  7 | feeling of how the different setups perform and the trade-offs involved.
  8 | 
  9 | Please note we basically start with 2019-04-01 and based on the -d flag we add days to the date.
 10 | So by increasing -d you will increase the amount of data to be processed. The map reduce version
 11 | manually unpacks the queries into queries-by-date and then runs them in parallel.
 12 | 
 13 | Due to cold start, the first run of the serverless version may be slower than the others, so you should
 14 | re-run the same script multiple times to get a better idea of the performance.
 15 | 
 16 | """
 17 | 
 18 | import os
 19 | import duckdb
 20 | import json
 21 | import statistics
 22 | import time
 23 | from fastcore.parallel import parallel
 24 | from quack import invoke_lambda, display_table
 25 | from dotenv import load_dotenv
 26 | from collections import defaultdict
 27 | from rich.console import Console
 28 | 
 29 | 
 30 | # get the environment variables from the .env file
 31 | load_dotenv()
 32 | 
 33 | 
 34 | def run_benchmarks(
 35 |     bucket: str,
 36 |     repetitions: int,
 37 |     threads: int,
 38 |     days: int,
 39 |     is_debug: bool = False
 40 | ):
 41 |     test_location_id = 237
 42 |     execution_times = []
 43 |     # NOTE: as usual we re-use the same naming convention as in the setup script
 44 |     # and all the others
 45 |     partitioned_dataset_scan = f"s3://{bucket}/partitioned/*/*.parquet"
 46 |     # run the map reduce version
 47 |     repetition_times = []
 48 |     print("\n====> Running map reduce version")
 49 |     for i in range(repetitions):
 50 |         start_time = time.time()
 51 |         map_reduce_results = run_map_reduce(
 52 |             bucket=bucket,
 53 |             days=days,
 54 |             threads=threads,
 55 |             is_debug=is_debug
 56 |         )
 57 |         repetition_times.append(time.time() - start_time)
 58 |         time.sleep(3)
 59 |     
 60 |     execution_times.append({
 61 |         'type': 'map_reduce',
 62 |         'mean': round(sum(repetition_times) / len(repetition_times), 3),
 63 |         'std': round(statistics.stdev(repetition_times), 3),
 64 |         'test location': map_reduce_results[test_location_id]
 65 |     })
 66 |     # run the standard serverless version
 67 |     repetition_times = []
 68 |     print("\n====> Running serverless duckdb")
 69 |     for i in range(repetitions):
 70 |         start_time = time.time()
 71 |         results = run_serverless_lambda(
 72 |             partitioned_dataset_scan=partitioned_dataset_scan,
 73 |             days=days,
 74 |             is_debug=is_debug
 75 |         )
 76 |         repetition_times.append(time.time() - start_time)
 77 |         time.sleep(3)
 78 | 
 79 |     execution_times.append({
 80 |         'type': 'serverless',
 81 |         'mean': round(sum(repetition_times) / len(repetition_times), 3),
 82 |         'std': round(statistics.stdev(repetition_times), 3),
 83 |         'test location': results[test_location_id]
 84 |     })
 85 |     # run a local db querying the data lake
 86 |     repetition_times = []
 87 |     print("\n====> Running local duckdb")
 88 |     for i in range(repetitions):
 89 |         start_time = time.time()
 90 |         # just re-use the code inside the lambda without thinking too much ;-)
 91 |         con = duckdb.connect(database=':memory:')
 92 |         con.execute(f"""
 93 |             INSTALL httpfs;
 94 |             LOAD httpfs;
 95 |             SET s3_region='{os.environ.get('AWS_DEFAULT_REGION', 'us-east-1')}';
 96 |             SET s3_access_key_id='{os.environ['AWS_ACCESS_KEY_ID']}';
 97 |             SET s3_secret_access_key='{os.environ['AWS_SECRET_ACCESS_KEY']}';
 98 |         """)
 99 |         local_results = run_local_db(
100 |             con=con,
101 |             partitioned_dataset_scan=partitioned_dataset_scan,
102 |             days=days,
103 |             is_debug=is_debug
104 |         )
105 |         del con
106 |         repetition_times.append(time.time() - start_time)
107 |         time.sleep(3)
108 |     
109 |     execution_times.append({
110 |         'type': 'local',
111 |         'mean': round(sum(repetition_times) / len(repetition_times), 3),
112 |         'std': round(statistics.stdev(repetition_times), 3),
113 |         'test location': local_results[test_location_id]
114 |     })
115 |     # make sure the results are the same
116 |     assert results[test_location_id] == map_reduce_results[test_location_id] == local_results[test_location_id], "The results are not the same!"
117 | 
118 |     # display results in a table
119 |     console = Console()
120 |     display_table(console, execution_times, title="Benchmarks", color="cyan")
121 | 
122 |     # all done, say goodbye
123 |     print("All done! See you, duck cowboy!")
124 |     return     
125 | 
126 | 
def run_local_db(
    con,
    partitioned_dataset_scan: str,
    days: int,
    is_debug: bool
):
    """
    Run the per-location pickup count on the given duckdb connection.

    Args:
        con: an open duckdb connection, already configured to read from S3.
        partitioned_dataset_scan: glob over the hive-partitioned parquet files.
        days: number of days to include, starting from 2019-04-01.
        is_debug: when True, the generated query is printed.

    Returns:
        A dict mapping each location id to its count (at most 1000 rows are kept).
    """
    # local import, so the block does not depend on the module-level imports
    from datetime import date, timedelta

    # exclusive upper bound computed with real date arithmetic: a plain "1 + days"
    # day-of-month would produce the non-existent 2019-04-31 when days=30
    end_date = date(2019, 4, 1) + timedelta(days=days)
    single_query = """
        SELECT 
            pickup_location_id AS location_id, 
            COUNT(*) AS counts 
        FROM 
            parquet_scan('{}', HIVE_PARTITIONING=1)
        WHERE 
            DATE >= '2019-04-01' AND DATE < '{}'
        GROUP BY 1
    """.format(
        partitioned_dataset_scan,
        end_date.isoformat()
    )
    if is_debug:
        print(single_query)
    # just re-use the code inside the lambda with no particular changes
    _df = con.execute(single_query).df()
    _df = _df.head(1000)
    records = _df.to_dict('records')

    return { row['location_id']: row['counts'] for row in records }
154 | 
155 | 
def run_serverless_lambda(
    partitioned_dataset_scan: str,
    days: int,
    is_debug: bool
):
    """
    Run the per-location pickup count through a single lambda invocation.

    Args:
        partitioned_dataset_scan: glob over the hive-partitioned parquet files.
        days: number of days to include, starting from 2019-04-01.
        is_debug: when True, the generated query is printed.

    Returns:
        A dict mapping each location id to its count, built from the records
        in the lambda response (the request asks for at most 1000 rows).

    Raises:
        Exception: if the lambda response contains an 'errorMessage' key.
    """
    # local import, so the block does not depend on the module-level imports
    from datetime import date, timedelta

    # exclusive upper bound computed with real date arithmetic: a plain "1 + days"
    # day-of-month would produce the non-existent 2019-04-31 when days=30
    end_date = date(2019, 4, 1) + timedelta(days=days)
    single_query = """
        SELECT 
            pickup_location_id AS location_id, 
            COUNT(*) AS counts 
        FROM 
            parquet_scan('{}', HIVE_PARTITIONING=1)
        WHERE 
            DATE >= '2019-04-01' AND DATE < '{}'
        GROUP BY 1
    """.format(
        partitioned_dataset_scan,
        end_date.isoformat()
    )
    if is_debug:
        print(single_query)
    response = invoke_lambda(json.dumps({ 'q': single_query, 'limit': 1000}))
    if 'errorMessage' in response:
        # surface the remote error before failing
        print(response['errorMessage'])
        raise Exception("There was an error in the serverless invocation")
    records = response['data']['records']

    return { row['location_id']: row['counts'] for row in records }
183 | 
184 | 
def run_map_reduce(
    bucket: str,
    days: int,
    threads: int,
    is_debug: bool
):
    """
    Run the per-location pickup count as one lambda invocation per day, then sum.

    The "map" step sends one query per daily partition to the lambda, using
    `threads` parallel workers; the "reduce" step adds up the per-day counts
    of each location in Python.

    Returns:
        A defaultdict mapping each location id to its total count
        (a missing location reads as 0).

    Raises:
        Exception: if any lambda response contains an 'errorMessage' key.
    """
    query_template = """
        SELECT 
            pickup_location_id AS location_id, 
            COUNT(*) AS counts 
        FROM 
            read_parquet('{}', HIVE_PARTITIONING=1)
        WHERE 
            DATE >= '2019-04-{}' AND DATE < '2019-04-{}'
        GROUP BY 1
    """.strip()
    # map step: one query per day
    daily_queries = prepare_map_queries(query_template, bucket, days)
    if is_debug:
        print(daily_queries[:3])
    assert len(daily_queries) == days, "The number of queries is not correct"
    # fan the queries out to the lambda in parallel
    payloads = [json.dumps({'q': daily_query, 'limit': 1000 }) for daily_query in daily_queries]
    responses = parallel(invoke_lambda, payloads, n_workers=threads)
    # a single failed invocation invalidates the whole run: report the first error
    failed_responses = [response for response in responses if 'errorMessage' in response]
    if failed_responses:
        print(failed_responses[0]['errorMessage'])
        raise Exception("There was an error in the parallel invocation")
    # reduce step: sum the per-day counts by location
    totals = defaultdict(int)
    for response in responses:
        for row in response['data']['records']:
            totals[row['location_id']] += row['counts']

    return totals
225 | 
226 | 
def prepare_map_queries(
        query: str,
        bucket: str,
        days: int
        ):
    """
    Build one query per day of April 2019, starting from the 1st.

    `query` is a template with three positional `{}` placeholders that are
    filled, in order, with:
      1. the glob matching the parquet files of that day's partition
      2. the (zero-padded) start day
      3. the (zero-padded) day after, used as exclusive upper bound

    Returns the list of formatted queries (empty when `days` is 0 or less).
    Raises ValueError when `days` is greater than 30, since April has no
    further days and the matching partitions cannot exist.
    """
    days_in_april = 30
    if days > days_in_april:
        raise ValueError(
            f"April has only {days_in_april} days, cannot prepare queries for {days} days"
        )

    queries = []
    for day in range(1, days + 1):
        start_day_as_str = f"{day:02d}"
        # NOTE(review): for day 30 this renders as "31", which is not a real
        # date - presumably fine as long as the template compares it as text;
        # confirm against the query template used by the caller
        end_day_as_str = f"{day + 1:02d}"
        parquet_scan = f"s3://{bucket}/partitioned/date=2019-04-{start_day_as_str}/*.parquet"
        queries.append(query.format(parquet_scan, start_day_as_str, end_day_as_str))

    return queries
241 | 
if __name__ == "__main__":
    import argparse

    # the benchmark needs the bucket and the AWS credentials: fail early if missing
    for required_env in ('S3_BUCKET_NAME', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
        assert required_env in os.environ, f"Please set the {required_env} environment variable"

    # command line options (all optional, with defaults)
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", type=int, default=3, help="number of repetitions")
    # note: without reserved concurrency, too much concurrency will cause errors
    parser.add_argument("-t", type=int, default=20, help="concurrent queries for map reduce")
    parser.add_argument("-d", type=int, default=10, help="number of days in April to query")
    parser.add_argument("--debug", action="store_true", default=False, help="increase output verbosity")
    args = parser.parse_args()

    # hand over to the benchmark entry point
    run_benchmarks(
        bucket=os.environ['S3_BUCKET_NAME'],
        repetitions=args.n,
        threads=args.t,
        days=args.d,
        is_debug=args.debug,
    )


--------------------------------------------------------------------------------
/src/dashboard/dashboard.py:
--------------------------------------------------------------------------------
 1 | """
 2 | 
 3 | Simple dashboard for the taxi data based on Streamlit.
 4 | 
 5 | It re-uses, through ugly imports, the code from the quack.py script, and uses seaborn to plot the data.
 6 | 
 7 | """
 8 | 
 9 | import sys
10 | import os
11 | import matplotlib.pyplot as plt
12 | import pandas as pd
13 | import seaborn as sns
14 | import streamlit as st
15 | from dotenv import load_dotenv
16 | 
17 | 
18 | # get the environment variables from the .env file
19 | load_dotenv('../.env')
20 | assert 'S3_BUCKET_NAME' in os.environ, "Please set the S3_BUCKET_NAME environment variable"
21 | S3_BUCKET_NAME = os.environ['S3_BUCKET_NAME']
22 | # note: this is the same file we exported in the top_pickup_locations.sql query
23 | # as part of our data transformation pipeline
24 | PARQUET_FILE = f"s3://{S3_BUCKET_NAME}/dashboard/my_view.parquet"
25 | 
26 | # import the querying function from the runner
27 | sys.path.insert(0,'..')
28 | from quack import fetch_all
# build up the dashboard
st.markdown("# Trip Dashboard")
st.write("This dashboard shows KPIs for our taxi business.")
st.header("Top pickup locations (map id) by number of trips")
# hardcode the columns
COLS = ['PICKUP_LOCATION_ID', 'TRIPS']

# get the total row count
query = f"SELECT COUNT(*) AS C FROM read_parquet(['{PARQUET_FILE}'])"
df, metadata = fetch_all(query, limit=1, display=False, is_debug=False)
st.write(f"Total row count: {df['C'][0]}")

# get the interactive chart
base_query = f"""
    SELECT 
        location_id AS {COLS[0]}, 
        counts AS {COLS[1]} 
    FROM 
        read_parquet(['{PARQUET_FILE}'])
    """.strip()
top_k = st.text_input('# of pickup locations', '5')
# the text box is free-form user input: parse it to an integer BEFORE it gets
# anywhere near the SQL string, so that only a number can end up in the query
try:
    top_k_rows = int(top_k)
except ValueError:
    top_k_rows = None

if top_k_rows is None or top_k_rows < 1:
    # nothing sensible to query: ask for a valid value instead of crashing
    st.write("Please enter a positive whole number of pickup locations.")
else:
    # add a limit to the query based on the (validated) user input
    final_query = "{} LIMIT {};".format(base_query, top_k_rows)
    df, metadata = fetch_all(final_query, limit=top_k_rows, display=False, is_debug=False)

    # if no error is returned, we plot the data
    if df is not None:
        fig = plt.figure(figsize=(10,5))
        sns.barplot(
            x = COLS[0],
            y = COLS[1],
            data = df,
            # bars sorted by number of trips, busiest location first
            order=df.sort_values(COLS[1],ascending = False)[COLS[0]])
        plt.xticks(rotation=70)
        plt.tight_layout()
        st.pyplot(fig)
    else:
        st.write("Sorry, something went wrong :-(")

    # display metadata
    st.write(f"Roundtrip ms: {metadata['roundtrip_time']}")
    st.write(f"Query exec. time ms: {metadata['timeMs']}")
    st.write(f"Lambda is warm: {metadata['warm']}")
72 |         
73 | 


--------------------------------------------------------------------------------
/src/dashboard/dbt/analysis/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/src/dashboard/dbt/analysis/.gitkeep


--------------------------------------------------------------------------------
/src/dashboard/dbt/dbt_project.yml:
--------------------------------------------------------------------------------
 1 | name: 'taxi_dashboard'
 2 | version: '1.0.0'
 3 | 
 4 | config-version: 2
 5 | profile: 'duckdb-taxi'
 6 | source-paths: ["models"]
 7 | analysis-paths: ["analysis"]
 8 | test-paths: ["tests"]
 9 | data-paths: ["data"]
10 | macro-paths: ["macros"]
11 | snapshot-paths: ["snapshots"]
12 | 
13 | target-path: "target" 
14 | clean-targets: 
15 |   - "target"
16 |   - "dbt_modules"
17 | 
18 | models:
19 |   taxi:
20 |     foundation:
21 |       +materialized: view
22 | 


--------------------------------------------------------------------------------
/src/dashboard/dbt/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/src/dashboard/dbt/macros/.gitkeep


--------------------------------------------------------------------------------
/src/dashboard/dbt/models/taxi/top_pickup_locations.sql:
--------------------------------------------------------------------------------
1 | {{ config(materialized='external', location="s3://{{ env_var('S3_BUCKET_NAME') }}/dashboard/my_view.parquet") }}
2 | 
3 | SELECT 
4 |     location_id,
5 |     counts
6 | FROM
7 |     {{ ref('trips_by_pickup_location') }}
8 | ORDER BY 2 DESC
9 | LIMIT 200


--------------------------------------------------------------------------------
/src/dashboard/dbt/models/taxi/trips_by_pickup_location.sql:
--------------------------------------------------------------------------------
1 | SELECT 
2 |     pickup_location_id AS location_id, 
3 |     COUNT(*) AS counts 
4 | FROM 
5 |     read_parquet(['s3://{{ env_var('S3_BUCKET_NAME') }}/dataset/taxi_2019_04.parquet']) 
6 | GROUP BY 1


--------------------------------------------------------------------------------
/src/dashboard/dbt/snapshots/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/src/dashboard/dbt/snapshots/.gitkeep


--------------------------------------------------------------------------------
/src/dashboard/dbt/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/src/dashboard/dbt/tests/.gitkeep


--------------------------------------------------------------------------------
/src/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BauplanLabs/quack-reduce/aac1326941c16e5131d20be1751ad29664def698/src/data/.gitkeep


--------------------------------------------------------------------------------
/src/local.env:
--------------------------------------------------------------------------------
1 | AWS_ACCESS_KEY_ID=
2 | AWS_SECRET_ACCESS_KEY=
3 | AWS_DEFAULT_REGION=us-east-1
4 | S3_BUCKET_NAME=


--------------------------------------------------------------------------------
/src/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "dependencies": {
3 |     "serverless": "^3.30.1",
4 |     "serverless-iam-roles-per-function": "^3.2.0"
5 |   }
6 | }
7 | 


--------------------------------------------------------------------------------
/src/quack.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | Python script to interact with the serverless architecture. Query and limit parameters
  4 | can be passed through the command line, or the script can be run without parameters
  5 | to check the status of the lambda (it will return the results from a pre-defined query).
  6 | 
  7 | 
  8 | Check the README.md for more details.
  9 | 
 10 | """
 11 | 
 12 | 
 13 | import os
 14 | import time
 15 | import boto3
 16 | import pandas as pd
 17 | import json
 18 | from rich.console import Console
 19 | from rich.table import Table
 20 | from dotenv import load_dotenv
 21 | 
 22 | 
 23 | # get the environment variables from the .env file
 24 | load_dotenv()
 25 | # we don't allow to display more than 10 rows in the terminal
 26 | MAX_ROWS_IN_TERMINAL = 10
 27 | # instantiate the boto3 client to communicate with the lambda
 28 | lambda_client = boto3.client('lambda')
 29 | 
 30 | 
 31 | def invoke_lambda(json_payload_as_str: str):
 32 |     """
 33 |     Invoke our duckdb lambda function. Note that the payload is a string,
 34 |     so the method should be called with json.dumps(payload)
 35 |     """
 36 |     response = lambda_client.invoke(
 37 |         # the name of the lambda function should match what you have in your console
 38 |         # if you did not change the serverless.yml file, it should be this one:
 39 |         FunctionName='quack-reduce-lambda-dev-duckdb',
 40 |         InvocationType='RequestResponse',
 41 |         LogType='Tail',
 42 |         Payload=json_payload_as_str
 43 |     )
 44 | 
 45 |     # return response as dict
 46 |     return json.loads(response['Payload'].read().decode("utf-8"))
 47 | 
 48 | 
 49 | def fetch_all(
 50 |     query: str,
 51 |     limit: int,
 52 |     display: bool=False,
 53 |     is_debug = False
 54 | )-> pd.DataFrame:
 55 |     """
 56 |     Get results from lambda and display them
 57 |     """
 58 |     if is_debug:
 59 |         print(f"Running query: {query}, with limit: {limit}")
 60 |     # run the query
 61 |     start_time = time.time()
 62 |     response = invoke_lambda(json.dumps({'q': query, 'limit': limit}))
 63 |     roundtrip_time =  int((time.time() - start_time) * 1000.0)
 64 |     # check for errors first
 65 |     if 'errorMessage' in response:
 66 |         print(f"Error: {response['errorMessage']}")
 67 |         # just raise an exception now as we don't have a proper error handling
 68 |         raise Exception(response['errorMessage'])
 69 |     # no error returned, display the results
 70 |     if is_debug:
 71 |         print(f"Debug reponse: {response}")
 72 | 
 73 |     rows = response['data']['records']
 74 |     # add the roundtrip time to the metadata
 75 |     response['metadata']['roundtrip_time'] = roundtrip_time
 76 |     # display in the console if required
 77 |     if display:
 78 |         console = Console()
 79 |         display_query_metadata(console, response['metadata'])
 80 |         display_table(console, rows)
 81 |     
 82 |     # return the results as a pandas dataframe and metadata
 83 |     return pd.DataFrame(rows), response['metadata']
 84 | 
 85 | 
 86 | def display_query_metadata(
 87 |         console: Console, 
 88 |         metadata: dict
 89 |         ):
 90 |     """
 91 |     Display the metadata returned by the lambda - we receive a dictionary with
 92 |     few properties (total time, echo of the query, is warm, etc.)
 93 |     """
 94 |     # NOTE: we cut to 25 max the field values, to avoid the table to be too wide
 95 |     values = [{ 'Field': k, 'Value': str(v)[:50] } for k, v in metadata.items()]
 96 |     display_table(console, values, title="Metadata", color="cyan")
 97 | 
 98 |     return
 99 | 
100 | 
def display_table(
        console: Console,
        rows: list, 
        title: str="My query", 
        color: str="green"
        ):
    """
    We receive a list of rows, each row is a dict with the column names as keys.
    We use rich (https://rich.readthedocs.io/en/stable/tables.html) to display a nice table in the terminal.

    The column names are taken from the first row, and at most MAX_ROWS_IN_TERMINAL
    rows are printed. If there are no rows, a short notice is printed instead.
    """
    # a query can legitimately return no rows: there is no first row to get
    # the column names from, so we just say it and stop here
    if not rows:
        console.print(f"{title}: no rows to display")
        return

    # build the table
    table = Table(title=title)
    # build the header
    cols = list(rows[0].keys())
    for col in cols:
        table.add_column(col, justify="left", style=color, no_wrap=True)
    # add the rows
    for row in rows[:MAX_ROWS_IN_TERMINAL]:
        # NOTE: we need to render str
        table.add_row(*[str(row[col]) for col in cols])
    # display the table
    console.print(table)

    return
125 | 
126 | 
def runner(
    bucket: str,
    query: str=None,
    limit: int=10,
    is_debug: bool = False
):
    """
    Send a query to our serverless (and stateless) database and print the results.

    duckdb is used here as a query engine more than as a database: the data
    (e.g. the tables) lives as artifacts in object storage.

    When no query is given, a row count over the taxi dataset is run instead,
    as a quick check that the lambda is up and running.
    """
    if query is None:
        # NOTE: the file path, after the bucket, should be the same as the one we have
        # in the run_me_first.py script. If you changed it there, you should change it here
        target_file = f"s3://{bucket}/dataset/taxi_2019_04.parquet"
        query = f"SELECT COUNT(*) AS COUNTS FROM read_parquet(['{target_file}'])"
        # since this is a test query, we force debug to be True
        is_debug = True

    # results are displayed by fetch_all itself, nothing to do with the return values
    fetch_all(query, limit, display=True, is_debug=is_debug)

    return
154 | 
155 | 
if __name__ == "__main__":
    import argparse

    assert 'S3_BUCKET_NAME' in os.environ, "Please set the S3_BUCKET_NAME environment variable"

    # command line options: all optional, without a query a test query is run
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", type=str, default=None, help="query")
    parser.add_argument("-limit", type=int, default=10, help="max rows to return from the lambda")
    parser.add_argument("--debug", action="store_true", default=False, help="increase output verbosity")
    args = parser.parse_args()

    # hand over to the main function
    runner(
        bucket=os.environ['S3_BUCKET_NAME'],
        query=args.q,
        limit=args.limit,
        is_debug=args.debug,
    )


--------------------------------------------------------------------------------
/src/requirements.txt:
--------------------------------------------------------------------------------
 1 | requests==2.28.2
 2 | streamlit==1.20.0
 3 | python-dotenv==1.0.0
 4 | fsspec==2023.4.0
 5 | s3fs==2023.4.0
 6 | dbt-duckdb==1.4.1
 7 | fastcore==1.5.27
 8 | boto3==1.26.3
 9 | matplotlib==3.6.3
10 | seaborn==0.12.0


--------------------------------------------------------------------------------
/src/run_me_first.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | Python script to run a one-time setup for testing the serverless duckdb architecture.
  4 | 
  5 | Check the README.md for more details and for the prerequisites.
  6 | 
  7 | """
  8 | 
  9 | 
 10 | import os
 11 | 
 12 | import boto3
 13 | import requests
 14 | import pandas as pd
 15 | from dotenv import load_dotenv
 16 | 
 17 | 
 18 | # get the environment variables from the .env file
 19 | load_dotenv()
 20 | 
 21 | 
 22 | def donwload_data(url: str, target_file: str):
 23 |     """
 24 |     Download a file from a url and save it to a target file.
 25 |     """
 26 |     r = requests.get(url)
 27 |     open(target_file, 'wb').write(r.content)
 28 | 
 29 |     return True
 30 | 
 31 | 
 32 | def download_taxi_data():
 33 |     """
 34 |     Download the taxi data from the duckdb repo - if the file disappears, 
 35 |     you can of course replace it with any other version of the same dataset.
 36 |     """
 37 |     print('Downloading the taxi dataset')
 38 |     
 39 |     url = 'https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_04.parquet'
 40 |     file_name = 'data/taxi_2019_04.parquet'
 41 |     donwload_data(url, file_name)
 42 | 
 43 |     return file_name
 44 | 
 45 | 
 46 | def upload_file_to_bucket(s3_client, file_name, bucket, object_name=None):
 47 |     """
 48 |     Upload a file to an S3 bucket.
 49 |     """
 50 |     from botocore.exceptions import ClientError
 51 |     
 52 |     try:
 53 |         print(f"Uploading {object_name}")
 54 |         response = s3_client.upload_file(file_name, bucket, object_name)
 55 |     except ClientError as e:
 56 |         print(f"Error uploading file {file_name} to bucket {bucket} with error {e}")
 57 |         return False
 58 |     
 59 |     return True
 60 | 
 61 | 
 62 | def upload_datasets(s3_client, bucket: str, taxi_dataset_path: str):
 63 |     """
 64 |     Upload the datasets to the bucket, first as one parquet file, then as
 65 |     a directory of parquet files with hive partitioning.
 66 |     """
 67 |     file_name = os.path.basename(taxi_dataset_path)
 68 |     # upload file as is, a single parquet file in the data/ folder of the target bucket
 69 |     is_uploaded = upload_file_to_bucket(
 70 |         s3_client, 
 71 |         taxi_dataset_path,
 72 |         bucket,
 73 |         object_name=f"dataset/{file_name}"
 74 |     )
 75 |     is_uploaded = upload_partioned_dataset(
 76 |         bucket, 
 77 |         taxi_dataset_path
 78 |         )
 79 | 
 80 |     return
 81 | 
 82 | 
 83 | def upload_partioned_dataset(
 84 |         bucket: str,
 85 |         taxi_dataset_path: str,
 86 |         partition_col: str = 'date'
 87 |         ):
 88 |     """
 89 |     Use pandas to read the parquet file, then save it again as a directory
 90 |     on our s3 bucket. The final directory will have a subdirectory for each
 91 |     value of the partition column, and each subdirectory will contain parquet files.
 92 |     """
 93 | 
 94 |     df = pd.read_parquet(taxi_dataset_path)
 95 |     df[partition_col] = pd.to_datetime(df['pickup_at']).dt.date
 96 |     target_folder = os.path.join('s3://', bucket, 'partitioned')
 97 |     print(f"Saving data with hive partitioning ({partition_col}) in {target_folder}")
 98 |     df.to_parquet(target_folder, partition_cols=[partition_col])
 99 |     
100 |     return True
101 | 
102 | 
def setup_project():
    """
    One-time setup: check the environment, download the taxi dataset and
    upload it to the S3 bucket.
    """
    env = os.environ
    # check vars are ok: we need a bucket, and either a key pair or a profile
    assert 'S3_BUCKET_NAME' in env, "Please set the S3_BUCKET_NAME environment variable"
    has_key_pair = env.get('AWS_ACCESS_KEY_ID') and env.get('AWS_SECRET_ACCESS_KEY')
    assert has_key_pair or env.get('AWS_PROFILE'), "Please set the AWS_ACCESS_KEY_ID & AWS_SECRET_ACCESS_KEY (or the AWS_PROFILE) environment variables"

    # first get the data locally, then push it to the bucket
    taxi_dataset_path = download_taxi_data()
    upload_datasets(
        boto3.client('s3'),
        env['S3_BUCKET_NAME'],
        taxi_dataset_path
    )
    # all done
    print("All done! See you, duck cowboy!")
    return


if __name__ == "__main__":
    setup_project()


--------------------------------------------------------------------------------
/src/serverless.yml:
--------------------------------------------------------------------------------
 1 | service: quack-reduce-lambda
 2 | useDotenv: true
 3 | 
 4 | provider:
 5 |   name: aws
 6 |   region: ${env:AWS_DEFAULT_REGION, 'us-east-1'}
 7 |   architecture: arm64
 8 |   memorySize: 3008
 9 |   timeout: 600
10 |   ecr:
11 |     images:
12 |       quackimageblog:
13 |         path: ./serverless
14 |         platform: linux/arm64
15 | 
16 | functions:
17 |   duckdb:
18 |     ephemeralStorageSize: 3008
19 |     image:
20 |       name: quackimageblog
21 |     iamRoleStatements:
22 |       - Effect: Allow
23 |         Action:
24 |           - s3:GetObject
25 |           - s3:PutObject
26 |         Resource:
27 |           - Fn::Join:
28 |             - ''
29 |             - - Fn::GetAtt:
30 |                 - S3Bucket
31 |                 - Arn
32 |               - '/*'
33 |       - Effect: Allow
34 |         Action:
35 |           - s3:ListBucket
36 |         Resource:
37 |           - Fn::GetAtt:
38 |             - S3Bucket
39 |             - Arn
40 | 
41 | resources:
42 |   Resources:
43 |     S3Bucket:
44 |       Type: AWS::S3::Bucket
45 |       DeletionPolicy: Retain
46 |       Properties:
47 |         BucketName: ${env:S3_BUCKET_NAME}
48 | 
49 | plugins:
50 |   - serverless-iam-roles-per-function
51 | 


--------------------------------------------------------------------------------
/src/serverless/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/lambda/python:3.9
 2 | 
 3 | # Install pip and other dependencies
 4 | RUN pip3 install --upgrade pip \
 5 |     && yum install gcc gcc-c++ -y \
 6 |     && pip3 install pandas==1.5.3 duckdb==0.7.1 --target "${LAMBDA_TASK_ROOT}"
 7 | 
 8 | ENV HOME=/home/aws
 9 | 
10 | RUN mkdir /home/aws && python3 -c "import duckdb; duckdb.query('INSTALL httpfs;');"
11 | 
12 | COPY app.py ${LAMBDA_TASK_ROOT}
13 | 
14 | # Set the CMD to the lambda handler
15 | CMD [ "app.handler" ]


--------------------------------------------------------------------------------
/src/serverless/app.py:
--------------------------------------------------------------------------------
 1 | import uuid
 2 | import os
 3 | import time
 4 | import duckdb
 5 | import pandas as pd
 6 | 
 7 | 
 8 | con = None # global conn object - we re-use this across calls
 9 | DEFAULT_LIMIT = 20 # if we don't specify a limit, we will return at most 20 results
10 | 
11 | 
def return_duckdb_connection():
    """
    Open an in-memory duckdb database, load the httpfs extension and set the
    S3 region and session token from the environment. Return the connection.
    """
    connection = duckdb.connect(database=':memory:')
    # both values are read from the environment of the running process
    region = os.environ['AWS_REGION']
    session_token = os.environ['AWS_SESSION_TOKEN']
    setup_statements = f"""
        LOAD httpfs;
        SET s3_region='{region}';
        SET s3_session_token='{session_token}';
    """
    connection.execute(setup_statements)

    return connection
25 | 
26 | 
def handler(event, context):
    """
    Lambda entry point: run the SQL query found in the event against an
    in-memory duckdb, and return the records together with some metadata.

    The event is expected to carry the query under 'q' and, optionally, the
    max number of rows to return under 'limit'.
    """
    global con
    # run a timer for info
    started_at = time.time()

    # the connection is global: if it is already there, this container was
    # used before, and we tell the caller that the lambda is warm
    is_warm = True if con else False
    if not is_warm:
        con = return_duckdb_connection()

    # get the query to be executed from the payload
    event_query = event.get('q', None)
    limit = int(event.get('limit', DEFAULT_LIMIT))

    if not event_query:
        print("No query provided, will return empty results")
        return wrap_response(started_at, event_query, [], is_warm)

    # execute the query and get a pandas dataframe, keeping rows only up to
    # the limit, to avoid crashing the lambda by returning too many results
    _df = con.execute(event_query).df().head(limit)

    # return response to the client with metadata
    return wrap_response(started_at, event_query, convert_records_to_json(_df), is_warm)
59 | 
60 | 
def convert_records_to_json(_df):
    """
    Convert a dataframe to a list of records (one dict per row, with the
    column names as keys) that can be serialized to JSON.

    Datetime columns of any flavor (naive or timezone-aware, whatever the
    resolution) are converted to strings, since timestamps cannot be
    serialized as they are. All the other columns are left untouched.
    """
    if len(_df) > 0:
        # convert timestamp to string to avoid serialization issues
        cols = [
            col for col, dtype in _df.dtypes.items()
            if pd.api.types.is_datetime64_any_dtype(dtype)
        ]
        if cols:
            _df = _df.astype({col: str for col in cols})

    return _df.to_dict('records')
68 | 
69 | 
def wrap_response(start, event_query, results, is_warm):
    """
    Build the response for the client: the records under "data", and under
    "metadata" the elapsed ms since start, the current epoch in ms, a fresh
    random event id, the echo of the query and the warm flag.
    """
    metadata = {}
    metadata["timeMs"] = int((time.time() - start) * 1000.0)
    metadata["epochMs"] = int(time.time() * 1000)
    metadata["eventId"] = str(uuid.uuid4())
    metadata["query"] = event_query
    metadata["warm"] = is_warm

    return {"metadata": metadata, "data": {"records": results}}
86 | 


--------------------------------------------------------------------------------