├── scripts ├── __init__.py ├── polars_query.py ├── duckdb_query.py ├── pandas_query.py ├── dask_query.py ├── plot_benchmark_results.py ├── duck_db_query_with_persistent_database.py └── create_data.py ├── .python-version ├── README.md ├── presentation ├── images │ ├── background_marp.jpg │ ├── benchmark_memory_total.jpg │ └── benchmark_times_total.jpg └── 20250915_Local-Big-Data-Processing-with-Python.md ├── execute_benchmark.sh ├── pyproject.toml ├── .pre-commit-config.yaml ├── utils └── __init__.py └── .gitignore /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Local-Big-Data-Processing 2 | -------------------------------------------------------------------------------- /presentation/images/background_marp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/background_marp.jpg -------------------------------------------------------------------------------- /presentation/images/benchmark_memory_total.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/benchmark_memory_total.jpg -------------------------------------------------------------------------------- /presentation/images/benchmark_times_total.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/benchmark_times_total.jpg -------------------------------------------------------------------------------- /execute_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Array of customer counts 4 | 5 | CUSTOMER_COUNTS=(1000 10000 100000 1000000 10000000) 6 | 7 | for COUNT in "${CUSTOMER_COUNTS[@]}"; do 8 | echo "Running benchmark for number_of_customers=$COUNT" 9 | python scripts/create_data.py --n-customers "$COUNT" 10 | python scripts/pandas_query.py 11 | python scripts/duckdb_query.py 12 | python scripts/duck_db_query_with_persistent_database.py 13 | python scripts/dask_query.py 14 | python scripts/polars_query.py 15 | done -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "local-big-data-processing" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [ 8 | "dask[complete]>=2025.9.0", 9 | "duckdb>=1.3.2", 10 | "faker>=37.6.0", 11 | "ipdb>=0.13.13", 12 | "matplotlib>=3.10.6", 13 | "numpy>=2.3.2", 14 | "pandas>=2.3.2", 15 | "polars>=1.33.1", 16 | "psutil>=7.0.0", 17 | "pyarrow>=21.0.0", 18 | "rich>=14.1.0", 19 | ] 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build.targets.wheel] 26 | packages = ["utils", "scripts"] 27 | 
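The benchmark loop in `execute_benchmark.sh` above writes its timings into the SQLite database managed by `utils.save_benchmark` (see `utils/__init__.py` further down). A minimal sketch for inspecting the raw results with pandas, using the same DB path and `benchmarks` table defined there:

```python
import sqlite3

import pandas as pd

# Same DB file and table that utils.save_benchmark writes to
conn = sqlite3.connect("/tmp/local-big-data-processing-benchmark.db")
df = pd.read_sql_query(
    "SELECT * FROM benchmarks ORDER BY number_of_customers, tool_name", conn
)
conn.close()
print(df)
```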
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # Ruff: lint + fix + format 3 | - repo: https://github.com/astral-sh/ruff-pre-commit 4 | rev: v0.12.10 # Use latest release 5 | hooks: 6 | - id: ruff 7 | args: [--fix] # auto-fix lint issues, remove unused imports, sort imports 8 | - id: ruff-format # black-compatible formatting 9 | 10 | # Pre-commit official hooks 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v6.0.0 13 | hooks: 14 | - id: trailing-whitespace # removes trailing spaces 15 | - id: end-of-file-fixer # ensures newline at EOF 16 | - id: mixed-line-ending # normalizes line endings 17 | - id: check-yaml # validates YAML 18 | - id: check-toml # validates TOML 19 | - id: check-json # validates JSON 20 | - id: check-added-large-files # prevents huge files from being committedpre 21 | - repo: local 22 | hooks: 23 | - id: ty 24 | name: Run Typechecker ty 25 | entry: uvx ty check scripts utils 26 | language: system 27 | -------------------------------------------------------------------------------- /scripts/polars_query.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from rich.console import Console 3 | 4 | from utils import precise_timer, save_benchmark, print_df_as_table 5 | 6 | console = Console() 7 | 8 | customers_file_pattern = "data_csv/customers-*.csv" 9 | purchases_file_pattern = "data_csv/purchases-*.csv" 10 | 11 | with save_benchmark(tool_name="Polars", benchmark_case="Total"): 12 | # ------------------------- 13 | # Lazy load data 14 | # ------------------------- 15 | customers = pl.scan_csv(customers_file_pattern, has_header=True) 16 | purchases = pl.scan_csv(purchases_file_pattern, has_header=True) 17 | 18 | # ------------------------- 19 | # Base Query: Top customers by number of purchases 20 | # ------------------------- 21 | with save_benchmark(tool_name="Polars", benchmark_case="Base Query"): 22 | df_base_query = ( 23 | purchases.group_by("customer_id") 24 | .agg(n_purchases=pl.len()) 25 | .sort(["n_purchases", "customer_id"], descending=[True, False]) 26 | .limit(10) 27 | .collect() 28 | ) 29 | 30 | print_df_as_table(df_base_query.to_pandas()) 31 | 32 | # ------------------------- 33 | # Complex Query: Pivot 34 | # ------------------------- 35 | with save_benchmark(tool_name="Polars", benchmark_case="Complex Query"): 36 | # Extract year lazily 37 | purchases_with_year = purchases.with_columns( 38 | pl.col("purchase_date").str.strptime(pl.Datetime).dt.year().alias("year") 39 | ) 40 | 41 | # Join lazily 42 | base = customers.join(purchases_with_year, on="customer_id") 43 | 44 | # Group by city, payment_method, year and sum amounts (still lazy) 45 | grouped = base.group_by(["city", "payment_method", "year"]).agg( 46 | total_spent=pl.sum("amount") 47 | ) 48 | 49 | # Collect only the aggregated result (much smaller than full dataset) 50 | grouped_df = grouped.collect(streaming=True) 51 | 52 | pivot_df = grouped_df.pivot( 53 | on="year", 54 | index=["city", "payment_method"], 55 | values="total_spent", 56 | aggregate_function="sum", 57 | sort_columns=True, 58 | ).sort(["city", "payment_method"]) 59 | 60 | print_df_as_table(pivot_df.to_pandas()) 61 | -------------------------------------------------------------------------------- /scripts/duckdb_query.py: -------------------------------------------------------------------------------- 1 
| import duckdb 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | from utils import ( 6 | precise_timer, 7 | save_benchmark, 8 | print_df_as_table, 9 | ) 10 | 11 | console = Console() 12 | 13 | 14 | # Connect to DuckDB (in memory DB): 15 | con = duckdb.connect() 16 | 17 | customers_file_pattern = "data_csv/customers-*.csv" 18 | purchases_file_pattern = "data_csv/purchases-*.csv" 19 | 20 | with save_benchmark(tool_name="DuckDB", benchmark_case="Total"): 21 | # Create views 22 | con.execute( 23 | f"CREATE VIEW customers AS SELECT * FROM read_csv_auto({customers_file_pattern!r}, HEADER=TRUE)" 24 | ) 25 | con.execute( 26 | f"CREATE VIEW purchases AS SELECT * FROM read_csv_auto({purchases_file_pattern!r}, HEADER=TRUE)" 27 | ) 28 | 29 | with save_benchmark(tool_name="DuckDB", benchmark_case="Base Query"): 30 | # Example query: top 5 cities by total spent 31 | query = """ 32 | SELECT 33 | customer_id, 34 | COUNT(*) AS n_purchases, 35 | FROM purchases 36 | GROUP BY customer_id 37 | ORDER BY n_purchases DESC, customer_id ASC 38 | LIMIT 10 39 | """ 40 | 41 | result = con.execute(query).fetchall() 42 | 43 | # Create a Rich Table 44 | table = Table(title="Top 10 Most Acitve Customers") 45 | 46 | # Add columns (DuckDB returns tuples) 47 | table.add_column("Customer ID", style="cyan", justify="left") 48 | table.add_column("Number of Purchases", style="magenta", justify="right") 49 | 50 | # Add rows 51 | for row in result: 52 | customer_id, n_purchases = row 53 | table.add_row( 54 | str(customer_id), 55 | str(n_purchases), 56 | ) 57 | 58 | # Display the table 59 | console.print(table) 60 | 61 | with save_benchmark(tool_name="DuckDB", benchmark_case="Complex Query"): 62 | ### More complex query: 63 | query = """ 64 | WITH base_query AS (SELECT 65 | c.city, 66 | p.payment_method, 67 | p.amount, 68 | EXTRACT(year FROM p.purchase_date) AS year 69 | FROM customers c 70 | JOIN purchases p ON c.customer_id = p.customer_id) 71 | 72 | PIVOT base_query 73 | on year 74 | USING SUM(AMOUNT) 75 | ORDER BY city, payment_method 76 | """ 77 | 78 | result = con.execute(query).fetchdf() 79 | print_df_as_table(result) 80 | -------------------------------------------------------------------------------- /scripts/pandas_query.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | 4 | import pandas as pd 5 | from rich.console import Console 6 | 7 | from utils import ( 8 | get_number_of_customers, 9 | precise_timer, 10 | save_benchmark, 11 | print_df_as_table, 12 | ) 13 | 14 | console = Console() 15 | 16 | customers_file_pattern = "data_csv/customers-*.csv" 17 | purchases_file_pattern = "data_csv/purchases-*.csv" 18 | 19 | if get_number_of_customers() > 1_000_000: 20 | # Skip since otherwise out of memory errors. 
21 | sys.exit(0) 22 | 23 | 24 | # ---------------------------- 25 | # Load CSVs into Pandas 26 | # ---------------------------- 27 | def load_csvs(pattern: str) -> pd.DataFrame: 28 | files = glob.glob(pattern) 29 | dfs = [pd.read_csv(f) for f in files] 30 | return pd.concat(dfs, ignore_index=True) 31 | 32 | 33 | with save_benchmark(tool_name="Pandas", benchmark_case="Total"): 34 | with precise_timer("Load DataFrames"): 35 | customers = load_csvs(customers_file_pattern) 36 | purchases = load_csvs(purchases_file_pattern) 37 | 38 | # Make sure dates are parsed properly 39 | customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce") 40 | purchases["purchase_date"] = pd.to_datetime( 41 | purchases["purchase_date"], errors="coerce" 42 | ) 43 | 44 | # ---------------------------- 45 | # Query 1: Top cities by total spent 46 | # ---------------------------- 47 | with precise_timer("Base Query"): 48 | merged = purchases.merge(customers, on="customer_id") 49 | 50 | df_base_query = ( 51 | merged.groupby("city") 52 | .agg( 53 | n_purchases=("purchase_id", "count"), 54 | total_spent=("amount", "sum"), 55 | avg_number_of_items=("number_of_items", "mean"), 56 | ) 57 | .reset_index() 58 | .sort_values("total_spent", ascending=False) 59 | ) 60 | 61 | print_df_as_table(df_base_query) 62 | 63 | # ---------------------------- 64 | # Query 2: Pivot by year and payment method 65 | # ---------------------------- 66 | with precise_timer("Complex Query"): 67 | purchases["year"] = purchases["purchase_date"].dt.year 68 | 69 | base = purchases.merge(customers, on="customer_id") 70 | 71 | pivot = ( 72 | base.pivot_table( 73 | index=["city", "payment_method"], 74 | columns="year", 75 | values="amount", 76 | aggfunc="sum", 77 | fill_value=0, 78 | ) 79 | .reset_index() 80 | .sort_values(by=["city", "payment_method"]) 81 | ) 82 | 83 | print_df_as_table(pivot) 84 | -------------------------------------------------------------------------------- /scripts/dask_query.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask.dataframe as dd 4 | from utils import precise_timer, save_benchmark, print_df_as_table 5 | from dask.distributed import Client, LocalCluster 6 | 7 | 8 | def main(): 9 | customers_file_pattern = "data_csv/customers-*.csv" 10 | purchases_file_pattern = "data_csv/purchases-*.csv" 11 | 12 | with save_benchmark(tool_name="Dask", benchmark_case="Total"): 13 | # ------------------------- 14 | # Load CSVs lazily 15 | # ------------------------- 16 | customers = dd.read_csv( 17 | customers_file_pattern, assume_missing=True, blocksize=None 18 | ) 19 | purchases = dd.read_csv( 20 | purchases_file_pattern, assume_missing=True, blocksize=None 21 | ) 22 | 23 | # Ensure dates are parsed 24 | customers["signup_date"] = dd.to_datetime( 25 | customers["signup_date"], errors="coerce" 26 | ) 27 | purchases["purchase_date"] = dd.to_datetime( 28 | purchases["purchase_date"], errors="coerce" 29 | ) 30 | 31 | # ------------------------- 32 | # Base Query: Top customers by number of purchases 33 | # ------------------------- 34 | with save_benchmark(tool_name="Dask", benchmark_case="Base Query"): 35 | df_base_query = ( 36 | purchases.groupby("customer_id") 37 | .agg(n_purchases=("purchase_id", "count")) 38 | .reset_index() 39 | .compute() 40 | .sort_values(["n_purchases", "customer_id"], ascending=[False, True]) 41 | .head(10) 42 | ) 43 | 44 | print_df_as_table(df_base_query) 45 | 46 | # ------------------------- 47 | # Complex Query: Pivot by 
year 48 | # ------------------------- 49 | with save_benchmark(tool_name="Dask", benchmark_case="Complex Query"): 50 | purchases["year"] = purchases["purchase_date"].dt.year 51 | 52 | base = customers.merge(purchases, on="customer_id") 53 | 54 | # Group by city, payment_method, year, sum amounts 55 | grouped = ( 56 | base.groupby(["city", "payment_method", "year"]) 57 | .agg({"amount": "sum"}) 58 | .reset_index() 59 | ) 60 | grouped = grouped.rename(columns={"amount": "total_spent"}) 61 | 62 | # Compute before pivot (Dask pivot not fully lazy) 63 | grouped_df = grouped.compute() 64 | 65 | # Pivot using pandas (Dask doesn’t fully support pivot for multi-index) 66 | pivot_df = grouped_df.pivot_table( 67 | index=["city", "payment_method"], 68 | columns="year", 69 | values="total_spent", 70 | aggfunc="sum", 71 | fill_value=0, 72 | ).sort_index() 73 | 74 | # Display pivot with Rich 75 | print_df_as_table( 76 | pivot_df.reset_index(), 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | n_cores = os.cpu_count() 82 | 83 | # Create a local cluster using 1 process per core 84 | cluster = LocalCluster(n_workers=n_cores, threads_per_worker=1, memory_limit="auto") 85 | client = Client(cluster) 86 | print(f"Dashboard link: {client.dashboard_link}") 87 | main() 88 | -------------------------------------------------------------------------------- /scripts/plot_benchmark_results.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | DB_FILE = "/tmp/local-big-data-processing-benchmark.db" 7 | 8 | 9 | def fetch_benchmark_data(): 10 | conn = sqlite3.connect(DB_FILE) 11 | df = pd.read_sql_query("SELECT * FROM benchmarks", conn) 12 | conn.close() 13 | return df 14 | 15 | 16 | def plot_single_case(df, case, filename): 17 | # Filter for the specific benchmark case 18 | df_case = df[df["benchmark_case"] == case] 19 | pivot = df_case.pivot_table( 20 | index="number_of_customers", 21 | columns="tool_name", 22 | values="duration_seconds", 23 | ).sort_index() 24 | 25 | ax = pivot.plot(kind="bar", logy=True, figsize=(12, 7), width=0.8) 26 | ax.set_xlabel("Number of Customers") 27 | ax.set_ylabel("Execution Time (seconds, log scale)") 28 | ax.set_title(f"Benchmark: Execution Time for {case} (log scale)") 29 | ax.set_xticklabels([f"{int(idx):,}" for idx in pivot.index], rotation=45) 30 | plt.tight_layout() 31 | 32 | # Add duration labels on bars 33 | for container in ax.containers: 34 | for bar in container: 35 | height = bar.get_height() 36 | if height > 0: 37 | ax.annotate( 38 | f"{height:.2f}", 39 | xy=(bar.get_x() + bar.get_width() / 2, height), 40 | xytext=(0, 3), # 3 points vertical offset 41 | textcoords="offset points", 42 | ha="center", 43 | va="bottom", 44 | fontsize=8, 45 | rotation=90, 46 | ) 47 | 48 | plt.savefig(filename, dpi=200) 49 | plt.close() 50 | 51 | 52 | def plot_memory_usage(df, filename): 53 | # Filter for the "Total" benchmark case 54 | df_case = df[df["benchmark_case"] == "Total"] 55 | pivot = df_case.pivot_table( 56 | index="number_of_customers", 57 | columns="tool_name", 58 | values="peak_memory_mb", 59 | ).sort_index() 60 | 61 | ax = pivot.plot(kind="bar", figsize=(12, 7), width=0.8) 62 | ax.set_xlabel("Number of Customers") 63 | ax.set_ylabel("Peak Memory Usage (MB)") 64 | ax.set_title("Benchmark: Peak Memory Usage for Total (MB)") 65 | ax.set_xticklabels([f"{int(idx):,}" for idx in pivot.index], rotation=45) 66 | plt.tight_layout() 67 | 68 | # Add 
memory labels on bars 69 | for container in ax.containers: 70 | for bar in container: 71 | height = bar.get_height() 72 | if height > 0: 73 | ax.annotate( 74 | f"{height:.1f}", 75 | xy=(bar.get_x() + bar.get_width() / 2, height), 76 | xytext=(0, 3), 77 | textcoords="offset points", 78 | ha="center", 79 | va="bottom", 80 | fontsize=8, 81 | rotation=90, 82 | ) 83 | 84 | plt.savefig(filename, dpi=200) 85 | plt.close() 86 | 87 | 88 | def plot_benchmark_queries(df): 89 | plot_single_case(df, "Base Query", "benchmark_times_base_query.jpg") 90 | plot_single_case(df, "Complex Query", "benchmark_times_complex_query.jpg") 91 | plot_single_case(df, "Total", "benchmark_times_total.jpg") 92 | plot_memory_usage(df, "benchmark_memory_total.jpg") 93 | 94 | 95 | if __name__ == "__main__": 96 | df = fetch_benchmark_data() 97 | plot_benchmark_queries(df) 98 | -------------------------------------------------------------------------------- /scripts/duck_db_query_with_persistent_database.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import duckdb 4 | from rich.console import Console 5 | from rich.table import Table 6 | from rich import print 7 | 8 | from utils import save_benchmark, precise_timer 9 | 10 | console = Console() 11 | 12 | # Connect to DuckDB (persistent DB): 13 | DUCK_DB_FILE = "/tmp/test.duckdb" 14 | if os.path.exists(DUCK_DB_FILE): 15 | os.remove(DUCK_DB_FILE) 16 | 17 | 18 | def load_tables(): 19 | con = duckdb.connect(DUCK_DB_FILE) 20 | 21 | customers_file_pattern = "data_csv/customers-*.csv" 22 | purchases_file_pattern = "data_csv/purchases-*.csv" 23 | 24 | # Create views 25 | with precise_timer("Create and load to Tables"): 26 | con.execute( 27 | f"CREATE TABLE customers AS SELECT * FROM read_csv_auto({customers_file_pattern!r}, HEADER=TRUE)" 28 | ) 29 | con.execute( 30 | f"CREATE TABLE purchases AS SELECT * FROM read_csv_auto({purchases_file_pattern!r}, HEADER=TRUE)" 31 | ) 32 | 33 | 34 | def execute_queries(): 35 | con = duckdb.connect(DUCK_DB_FILE) 36 | with save_benchmark( 37 | tool_name="DuckDB (persistent storage)", benchmark_case="Total" 38 | ): 39 | with save_benchmark( 40 | tool_name="DuckDB (persistent storage)", benchmark_case="Base Query" 41 | ): 42 | # Example query: top 5 cities by total spent 43 | query = """ 44 | SELECT 45 | c.city, 46 | COUNT(*) AS n_purchases, 47 | SUM(p.amount) AS total_spent, 48 | AVG(p.number_of_items) as avg_number_of_items 49 | FROM customers c 50 | JOIN purchases p ON c.customer_id = p.customer_id 51 | GROUP BY c.city 52 | ORDER BY total_spent DESC 53 | """ 54 | 55 | result = con.execute(query).fetchall() 56 | 57 | # Create a Rich Table 58 | table = Table(title="Cities by Total Spent") 59 | 60 | # Add columns (DuckDB returns tuples) 61 | table.add_column("City", style="cyan", justify="left") 62 | table.add_column("Number of Purchases", style="magenta", justify="right") 63 | table.add_column("Total Spent", style="green", justify="right") 64 | table.add_column( 65 | "Average number of items per purchase", style="red", justify="right" 66 | ) 67 | 68 | # Add rows 69 | for row in result: 70 | city, n_purchases, total_spent, avg_items = row 71 | table.add_row( 72 | city, 73 | f"{n_purchases:,}", 74 | f"${total_spent:,.2f}", 75 | f"{avg_items:,.2f}", 76 | ) 77 | 78 | # Display the table 79 | console.print(table) 80 | 81 | with save_benchmark( 82 | tool_name="DuckDB (persistent storage)", benchmark_case="Complex Query" 83 | ): 84 | ### More complex query: 85 | query = """ 86 | WITH base_query AS 
(SELECT 87 | c.city, 88 | p.payment_method, 89 | p.amount, 90 | EXTRACT(year FROM p.purchase_date) AS year 91 | FROM customers c 92 | JOIN purchases p ON c.customer_id = p.customer_id) 93 | 94 | PIVOT base_query 95 | on year 96 | USING SUM(AMOUNT) 97 | ORDER BY city, payment_method 98 | """ 99 | 100 | result = con.execute(query).fetchdf() 101 | print(result) 102 | 103 | 104 | if __name__ == "__main__": 105 | load_tables() 106 | execute_queries() 107 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import json 3 | from pathlib import Path 4 | import sqlite3 5 | import time 6 | import os 7 | import psutil 8 | import threading 9 | from contextlib import contextmanager 10 | 11 | from rich import print 12 | 13 | from rich.table import Table 14 | from rich.console import Console 15 | from typing import Optional 16 | 17 | console = Console() 18 | 19 | DB_FILE = "/tmp/local-big-data-processing-benchmark.db" 20 | DATA_DIR = Path(__file__).parent.parent / "data_csv" 21 | db_conn = sqlite3.connect(DB_FILE) 22 | 23 | 24 | def get_number_of_customers() -> int: 25 | with open(DATA_DIR / "config.json") as f: 26 | return json.load(f)["number_of_customers"] 27 | 28 | 29 | def create_benchmark_db(): 30 | cur = db_conn.cursor() 31 | cur.execute( 32 | """ 33 | CREATE TABLE IF NOT EXISTS benchmarks ( 34 | tool_name TEXT NOT NULL, 35 | benchmark_case TEXT NOT NULL, 36 | number_of_customers INTEGER NOT NULL, 37 | duration_seconds REAL NOT NULL, 38 | peak_memory_mb REAL, 39 | UNIQUE(tool_name, benchmark_case, number_of_customers) 40 | ) 41 | """ 42 | ) 43 | db_conn.commit() 44 | 45 | 46 | def add_or_update_benchmark_entry( 47 | tool_name: str, 48 | benchmark_case: str, 49 | number_of_customers: int, 50 | duration_seconds: float, 51 | peak_memory_mb: float = None, 52 | ) -> None: 53 | create_benchmark_db() 54 | cur = db_conn.cursor() 55 | cur.execute( 56 | """ 57 | INSERT INTO benchmarks (tool_name, benchmark_case, number_of_customers, duration_seconds, peak_memory_mb) 58 | VALUES (?, ?, ?, ?, ?) 
59 | ON CONFLICT(tool_name, benchmark_case, number_of_customers) DO UPDATE 60 | SET duration_seconds = excluded.duration_seconds, 61 | peak_memory_mb = excluded.peak_memory_mb 62 | """, 63 | ( 64 | tool_name, 65 | benchmark_case, 66 | number_of_customers, 67 | duration_seconds, 68 | peak_memory_mb, 69 | ), 70 | ) 71 | db_conn.commit() 72 | 73 | 74 | @contextmanager 75 | def precise_timer(task_name: str): 76 | start = time.perf_counter() 77 | yield 78 | end = time.perf_counter() 79 | duration = end - start 80 | print(f"[bold]Elapsed time ({task_name}):[/bold] {duration:.6f} seconds") 81 | 82 | 83 | @contextmanager 84 | def save_benchmark(tool_name: str, benchmark_case: str): 85 | process = psutil.Process(os.getpid()) 86 | peak_rss = 0 87 | running = True 88 | 89 | def sampler(): 90 | nonlocal peak_rss 91 | while running: 92 | rss = process.memory_info().rss 93 | peak_rss = max(peak_rss, rss) 94 | time.sleep(0.01) 95 | 96 | thread = threading.Thread(target=sampler, daemon=True) 97 | thread.start() 98 | start = time.perf_counter() 99 | try: 100 | yield 101 | finally: 102 | running = False 103 | thread.join() 104 | end = time.perf_counter() 105 | duration = end - start 106 | peak_memory_mb = peak_rss / 1024**2 107 | print(f"Elapsed time ({tool_name} - {benchmark_case}): {duration:.6f} seconds") 108 | print(f"Peak Memory: {peak_memory_mb:.2f} MB") 109 | add_or_update_benchmark_entry( 110 | tool_name, 111 | benchmark_case, 112 | get_number_of_customers(), 113 | duration, 114 | peak_memory_mb, 115 | ) 116 | 117 | 118 | def print_df_as_table(df, title: Optional[str] = None): 119 | """ 120 | Convert a Pandas DataFrame to a Rich table and print it. 121 | 122 | Args: 123 | df (pd.DataFrame): The DataFrame to display. 124 | title (str, optional): Title of the table. 125 | """ 126 | table = Table(title=title) 127 | 128 | # Add columns 129 | for col in df.columns: 130 | table.add_column( 131 | str(col), justify="right" if df[col].dtype.kind in "if" else "left" 132 | ) 133 | 134 | # Add rows with formatting 135 | for _, row in df.iterrows(): 136 | cells = [str(row[col]) for col in df.columns] 137 | table.add_row(*cells) 138 | 139 | console.print(table) 140 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | 209 | .idea/ 210 | data/ 211 | data_csv/ 212 | *.jpg 213 | !presentation/images/* 214 | *.html -------------------------------------------------------------------------------- /scripts/create_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import csv 4 | import shutil 5 | import argparse 6 | 7 | import numpy as np 8 | from uuid import uuid4 9 | from datetime import datetime, timedelta 10 | from dataclasses import dataclass, fields 11 | from multiprocessing import Pool, cpu_count 12 | 13 | from utils import DATA_DIR 14 | 15 | # ---------------------------- 16 | # Adjustable global variables 17 | # ---------------------------- 18 | AVG_PURCHASES = 20 # average purchases per customer (Poisson mean) 19 | CUSTOMERS_PER_FILE = 500_000 # customer rows per customers-.csv 20 | PURCHASES_PER_FILE = 2_000_000 # target purchase rows per purchases-.csv 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser( 25 | description="Generate synthetic customer and purchase data." 
26 | ) 27 | parser.add_argument( 28 | "--n-customers", 29 | type=int, 30 | required=True, 31 | help="Total number of customers to generate.", 32 | ) 33 | return parser.parse_args() 34 | 35 | 36 | # ---------------------------- 37 | # Dataclasses (for headers) 38 | # ---------------------------- 39 | @dataclass 40 | class Customer: 41 | customer_id: int 42 | city: str 43 | age: int 44 | signup_date: str 45 | 46 | 47 | @dataclass 48 | class Purchase: 49 | purchase_id: str 50 | customer_id: int 51 | amount: float 52 | number_of_items: int 53 | purchase_date: str 54 | payment_method: str 55 | 56 | 57 | # ---------------------------- 58 | # Helpers 59 | # ---------------------------- 60 | def generate_customers_chunk(start_id: int, n_customers: int): 61 | cities = np.array( 62 | ["New York", "Berlin", "Tokyo", "San Francisco", "Paris", "London", "Sydney"] 63 | ) 64 | city_probs = np.array([0.2, 0.15, 0.1, 0.1, 0.15, 0.2, 0.1]) 65 | 66 | ids = np.arange(start_id, start_id + n_customers, dtype=np.int64) 67 | city_choices = np.random.choice(cities, size=n_customers, p=city_probs) 68 | ages = np.clip(np.random.normal(40, 12, size=n_customers).astype(int), 18, 90) 69 | 70 | days = np.random.randint(0, 5 * 365, size=n_customers) 71 | signup_dates = [ 72 | (datetime.today() - timedelta(days=int(d))).date().isoformat() for d in days 73 | ] 74 | 75 | return ids.tolist(), city_choices.tolist(), ages.tolist(), signup_dates 76 | 77 | 78 | def generate_purchases_from_counts(counts_slice, start_customer_id): 79 | counts = np.asarray(counts_slice, dtype=np.int64) 80 | n_customers = len(counts) 81 | total_purchases = int(counts.sum()) 82 | if total_purchases == 0: 83 | return [], [], [], [], [], [] 84 | 85 | cust_id_array = np.repeat( 86 | np.arange(start_customer_id, start_customer_id + n_customers, dtype=np.int64), 87 | counts, 88 | ) 89 | 90 | amounts = np.round( 91 | np.random.exponential(scale=50.0, size=total_purchases), 2 92 | ).tolist() 93 | number_of_items = np.maximum( 94 | 1, np.random.poisson(lam=3, size=total_purchases) 95 | ).tolist() 96 | 97 | payment_methods = np.random.choice( 98 | ["credit_card", "paypal", "bank_transfer", "apple_pay"], 99 | size=total_purchases, 100 | p=[0.6, 0.2, 0.15, 0.05], 101 | ).tolist() 102 | 103 | days = np.random.randint(0, 5 * 365, size=total_purchases) 104 | purchase_dates = [ 105 | (datetime.today() - timedelta(days=int(d))).date().isoformat() for d in days 106 | ] 107 | 108 | purchase_ids = [str(uuid4()) for _ in range(total_purchases)] 109 | 110 | return ( 111 | purchase_ids, 112 | cust_id_array.tolist(), 113 | amounts, 114 | number_of_items, 115 | purchase_dates, 116 | payment_methods, 117 | ) 118 | 119 | 120 | # ---------------------------- 121 | # Writers 122 | # ---------------------------- 123 | def write_customers_file(args): 124 | file_index, start_id, n_customers = args 125 | filename = os.path.join(DATA_DIR, f"customers-{file_index}.csv") 126 | ids, cities, ages, signup_dates = generate_customers_chunk(start_id, n_customers) 127 | headers = [f.name for f in fields(Customer)] 128 | with open(filename, "w", newline="") as f: 129 | writer = csv.writer(f) 130 | writer.writerow(headers) 131 | writer.writerows(zip(ids, cities, ages, signup_dates)) 132 | print(f"✅ Wrote {n_customers:,} customers to {filename}") 133 | 134 | 135 | def write_purchases_file(args): 136 | file_index, start_customer_id, counts_slice = args 137 | filename = os.path.join(DATA_DIR, f"purchases-{file_index}.csv") 138 | (pids, cust_ids, amounts, items, pdates, methods) = 
generate_purchases_from_counts( 139 | counts_slice, start_customer_id 140 | ) 141 | 142 | if len(pids) == 0: 143 | print(f"ℹ️ Skipping {filename} (0 purchases)") 144 | return 145 | 146 | headers = [f.name for f in fields(Purchase)] 147 | with open(filename, "w", newline="") as f: 148 | writer = csv.writer(f) 149 | writer.writerow(headers) 150 | writer.writerows(zip(pids, cust_ids, amounts, items, pdates, methods)) 151 | 152 | print(f"✅ Wrote purchases chunk to {filename} ({len(pids):,} rows)") 153 | 154 | 155 | # ---------------------------- 156 | # Partitioning for purchases 157 | # ---------------------------- 158 | def compute_purchase_partitions(purchases_per_customer, target_rows_per_file): 159 | partitions = [] 160 | N = len(purchases_per_customer) 161 | i = 0 162 | while i < N: 163 | cum = 0 164 | j = i 165 | while j < N and cum < target_rows_per_file: 166 | cum += int(purchases_per_customer[j]) 167 | j += 1 168 | if j == i: # ensure progress 169 | j = i + 1 170 | partitions.append((i, j)) 171 | i = j 172 | return partitions 173 | 174 | 175 | # ---------------------------- 176 | # Main 177 | # ---------------------------- 178 | def main(): 179 | args = parse_args() 180 | number_of_customers = args.n_customers 181 | 182 | if os.path.exists(DATA_DIR): 183 | shutil.rmtree(DATA_DIR) 184 | os.makedirs(DATA_DIR) 185 | 186 | np.random.seed(42) 187 | 188 | with open(DATA_DIR / "config.json", "w") as f: 189 | json.dump({"number_of_customers": number_of_customers}, f) 190 | 191 | purchases_per_customer = np.random.poisson(AVG_PURCHASES, size=number_of_customers) 192 | total_purchases = int(purchases_per_customer.sum()) 193 | print( 194 | f"Total customers: {number_of_customers:,}, total (simulated) purchases: {total_purchases:,}" 195 | ) 196 | 197 | partitions = compute_purchase_partitions(purchases_per_customer, PURCHASES_PER_FILE) 198 | print( 199 | f"Will create {len(partitions)} purchase files (target ~{PURCHASES_PER_FILE:,} rows each)" 200 | ) 201 | 202 | customer_tasks = [] 203 | n_customer_files = ( 204 | number_of_customers + CUSTOMERS_PER_FILE - 1 205 | ) // CUSTOMERS_PER_FILE 206 | for i in range(n_customer_files): 207 | start_id = i * CUSTOMERS_PER_FILE + 1 208 | n_rows = min(CUSTOMERS_PER_FILE, number_of_customers - (i * CUSTOMERS_PER_FILE)) 209 | customer_tasks.append((i + 1, start_id, n_rows)) 210 | 211 | purchase_tasks = [] 212 | for idx, (start_idx, end_idx) in enumerate(partitions, start=1): 213 | start_customer_id = start_idx + 1 214 | counts_slice = purchases_per_customer[start_idx:end_idx].tolist() 215 | purchase_tasks.append((idx, start_customer_id, counts_slice)) 216 | 217 | with Pool(processes=cpu_count()) as pool: 218 | pool.map(write_customers_file, customer_tasks) 219 | pool.map(write_purchases_file, purchase_tasks) 220 | 221 | print("✅ All CSV files generated (with multiprocessing pool).") 222 | 223 | 224 | if __name__ == "__main__": 225 | main() 226 | -------------------------------------------------------------------------------- /presentation/20250915_Local-Big-Data-Processing-with-Python.md: -------------------------------------------------------------------------------- 1 | --- 2 | paginate: true 3 | marp: true 4 | theme: uncover 5 | style: | 6 | .small { 7 | font-size: 0.2rem; 8 | } 9 | 10 | .red-text { 11 | color: red; 12 | } 13 | 14 | .blue-text { 15 | color: blue; 16 | } 17 | 18 | .mx-2 { 19 | margin-left: 1em; 20 | margin-right: 1em; 21 | } 22 | 23 | .bold { 24 | font-weight: bold; 25 | } 26 | 27 | .green-text { 28 | color: green; 29 | } 30 | 31 | h1 { 32 
| font-size: 40px; 33 | } 34 | 35 | h2 { 36 | font-size: 35px; 37 | } 38 | 39 | h3 { 40 | font-size: 30px; 41 | } 42 | 43 | h4 { 44 | font-size: 28px; 45 | } 46 | 47 | h5,h6,p,li,code,table { 48 | font-size: 25px; 49 | } 50 | 51 | .container{ 52 | display: flex; 53 | gap: 1rem; 54 | } 55 | 56 | .col{ 57 | flex: 1; 58 | } 59 | headingDivider: 1 60 | math: mathjax 61 | backgroundImage: url('images/background_marp.jpg') 62 | --- 63 | 64 | # Local Big Data Processing with Python 65 | 66 | 15.09.2025 67 | 68 | # Outline 69 | 70 | 1. Motivation 71 | 72 | 2. Baseline Benchmark (Pandas) 73 | 74 | 3. DuckDB 75 | 76 | 4. Polars 77 | 78 | 5. Dask 79 | 80 | 6. Other tools 81 | 82 | # Motivation 83 | 84 | Modern Computers/Servers deliver enough performance to process larger amounts of data on a single machine using **Vertical Scaling (Multicore Processing)** without the need to scale **Horizontally (Distributed Computing on multiple Servers)**. 85 | 86 | --- 87 | 88 | | Aspect | Horizontal (scale-out) | Vertical (scale-up) | 89 | | ------------- | ----------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------: | 90 | | Pros | Virtually unlimited capacity; better fault tolerance; elastic | Simpler programming; lower coordination overhead; no network latency for local work | 91 | | Cons | More complex (coordination, networking); network/IO can bottleneck; higher ops cost | Finite limits (hardware), can be costly, single point of failure | 92 | | Typical tools | Dask, Ray, PySpark, Hadoop/Spark | DuckDB, Polars, Vaex, RAPIDS/cuDF | 93 | 94 | 95 | # Benchmarking Example 96 | 97 | #### Customers 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 |
| customer_id | city | age | signup_date |
|---|---|---|---|
| 1 | San Francisco | 29 | 2025-03-16 |
| 2 | San Francisco | 60 | 2020-11-19 |
| ... | ... | ... | ... |

#### Purchases

| purchase_id | customer_id | amount | number_of_items | purchase_date | payment_method |
|---|---|---|---|---|---|
| f8822c42... | 1 | 31.49 | 3 | 2022-05-15 | apple_pay |
| 98ef3adc... | 1 | 36.31 | 5 | 2021-02-06 | credit_card |
| ... | ... | ... | ... | ... | ... |
171 | 172 | 173 | # CSV Structure 174 | 175 | ``` 176 | data_csv 177 | ├── customers-1.csv 178 | ├── customers-2.csv 179 | ├── customers-3.csv 180 | ├── customers-...csv 181 | ├── purchases-1.csv 182 | ├── purchases-2.csv 183 | ├── purchases-3.csv 184 | ├── purchases-...csv 185 | ``` 186 | 187 | 188 | # Queries 189 | 190 |
Base Query
193 | 194 | ```sql 195 | SELECT 196 | customer_id, 197 | COUNT(*) AS n_purchases, 198 | FROM purchases 199 | GROUP BY customer_id 200 | ORDER BY n_purchases DESC, customer_id ASC 201 | LIMIT 10 202 | ``` 203 | 204 | 205 | -> Simple Map/Reduce without join 206 | 207 |
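For comparison, the same base query in the Polars LazyFrame API (as in `scripts/polars_query.py` below); only the 10-row result is materialized:

```python
import polars as pl

top_customers = (
    pl.scan_csv("data_csv/purchases-*.csv")   # lazy scan over all purchase files
    .group_by("customer_id")
    .agg(n_purchases=pl.len())
    .sort(["n_purchases", "customer_id"], descending=[True, False])
    .limit(10)
    .collect()                                # execute and materialize only 10 rows
)
```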
Complex Query
211 | 212 | ```sql 213 | WITH base_query AS ( 214 | SELECT 215 | c.city, 216 | p.payment_method, 217 | p.amount, 218 | EXTRACT(year FROM p.purchase_date) AS year 219 | FROM customers c 220 | JOIN purchases p ON c.customer_id = p.customer_id) 221 | 222 | PIVOT base_query 223 | on year 224 | USING SUM(AMOUNT) 225 | ORDER BY city, payment_method 226 | ``` 227 | 228 | -> JOIN + more complex Pivot aggregation 229 |
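The equivalent pivot in pandas (mirroring `scripts/pandas_query.py` in this repo), eager and fully in memory:

```python
import glob

import pandas as pd

customers = pd.concat(map(pd.read_csv, glob.glob("data_csv/customers-*.csv")), ignore_index=True)
purchases = pd.concat(map(pd.read_csv, glob.glob("data_csv/purchases-*.csv")), ignore_index=True)
purchases["year"] = pd.to_datetime(purchases["purchase_date"], errors="coerce").dt.year

pivot = (
    purchases.merge(customers, on="customer_id")
    .pivot_table(
        index=["city", "payment_method"],  # one row per (city, payment_method)
        columns="year",                    # one column per purchase year
        values="amount",
        aggfunc="sum",
        fill_value=0,
    )
    .sort_index()
)
```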
231 | 232 | # Pandas (Baseline) 233 | 234 | 235 |
Pros ✅
241 | 242 | - Weapon of choice for interactive data analysis of small, in-memory datasets 243 | - Rich API and documentation 244 | - Fast for iterative analysis since the data is loaded into memory only once 245 | - Wide IO support (CSV/JSON/Excel/HDF5) and good tooling for prototyping 246 | - Great integration with other tools (ML libs, visualization, ...) 247 | 248 |
Cons ❌
254 | 255 | - Memory-bound: requires the dataset (or working set) to fit in RAM (a chunked-reading workaround is sketched below) 256 | - No built-in parallelization (single-threaded execution) 257 | - No built-in lazy execution or query optimizer (hard to optimize complex pipelines) 258 | 259 | 260 |
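A minimal sketch (not part of this repo's benchmark) of how chunked reading can work around the memory limit, for aggregations that reduce the data such as the base query:

```python
import glob

import pandas as pd

# Accumulate per-customer purchase counts one chunk at a time,
# so only a single chunk is ever held in memory.
totals = None
for path in glob.glob("data_csv/purchases-*.csv"):
    for chunk in pd.read_csv(path, usecols=["customer_id"], chunksize=1_000_000):
        counts = chunk.groupby("customer_id").size()
        totals = counts if totals is None else totals.add(counts, fill_value=0)

top10 = totals.sort_values(ascending=False).head(10)
```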
262 | 263 | 264 | # Dask (Distributed) 265 | 266 |
Pros ✅
272 | 273 | - Scales from single machine to large clusters with a familiar pandas-like API 274 | - Lazy execution and task scheduling (reduces unnecessary & enables out-of-memory computation) 275 | - Integrates with dask.distributed for monitoring, retries, and resource management (dashboard) 276 | - Rich ecosystem (dask.array, dask-ml) and integrations with other tooling 277 | 278 |
Cons ❌
284 | 285 | - Joins and wide shuffles can be expensive — partitioning strategy matters 286 | - Need to tune partition sizes, persist intermediates, and manage memory to avoid recomputation 287 | - More operational complexity for clusters (scheduling, workers, networking) 288 | - Debugging and predictable performance harder than single-node tools 289 |
291 | 292 | 293 | # Polars 294 | 295 |
Pros ✅
301 | 302 | - Lightning-fast DataFrame library implemented in Rust 303 | - multithreaded by default (fast CPU utilization) 304 | - LazyFrame API with query optimization available 305 | - Great for single-node vertical scaling and building performant pipelines 306 | - Easy conversion from and to Pandas Dataframes 307 | 308 |
Cons ❌
314 | 315 | - API differs from pandas — some learning curve and fewer 1:1 convenience helpers 316 | - Smaller ecosystem and fewer direct integrations than pandas (ML/visualization adapters) 317 | - Not natively distributed (use with Ray/Dask for scale-out) 318 | - Some operations like Pivot require materialization (collect) 319 | 320 |
322 | 323 | # DuckDB 324 | 325 |
Pros ✅
331 | 332 | - Fast analytical SQL engine optimized for single-node, columnar workloads (Vectorized execution & Query Planner) 333 | - Can query Parquet/CSV/Arrow in-place 334 | - Excellent Python & notebook integration (e.g. conversion to/from Pandas/Polars `dfs`) 335 | - In-Memory DB and file-based DB (like `sqlite`) 336 | 337 |
Cons ❌
343 | 344 | - Single-node only (scale-up); not a distributed cluster engine 345 | - Not intended for high-frequency transactions or streaming workloads 346 | - Primary interface is SQL — less direct Pandas-like API (convert results to pandas/polars) 347 | - Very large workloads still limited by disk I/O and single-machine resources 348 | 349 |
350 |
351 | 352 | 353 | # Benchmarking 354 | --- 355 | 356 | ![bg 80%](images/benchmark_times_total.jpg) 357 | 358 | --- 359 | ![bg 80%](images/benchmark_memory_total.jpg) 360 | 361 | # Summary 362 | | Tool | Good for | Bad for | 363 | |---|---|---| 364 | | Pandas | Interactive analysis of small to medium dataset | Datasets > Memory| 365 | | Dask | Memory-restricted hardware with Pandas-like API / Potential horizontal scaling| High networking overhead when running on single node | 366 | | Polars | Fast multithreaded single‑node analytics| No horizontal scaling / Uses a lot of memory| 367 | DuckDB | Fast single‑node SQL; in‑place CSV/Parquet/Arrow queries; optional persistent DB for fast querying | No horizontal scaling 368 | 369 | # Other Tools 370 | 371 | - Pyspark 372 | - Ray / Ray Data 373 | - Apache Arrow 374 | - ... 375 | 376 | 377 | --------------------------------------------------------------------------------