├── scripts ├── __init__.py ├── polars_query.py ├── duckdb_query.py ├── pandas_query.py ├── dask_query.py ├── plot_benchmark_results.py ├── duck_db_query_with_persistent_database.py └── create_data.py ├── .python-version ├── README.md ├── presentation ├── images │ ├── background_marp.jpg │ ├── benchmark_memory_total.jpg │ └── benchmark_times_total.jpg └── 20250915_Local-Big-Data-Processing-with-Python.md ├── execute_benchmark.sh ├── pyproject.toml ├── .pre-commit-config.yaml ├── utils └── __init__.py └── .gitignore /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.11 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Local-Big-Data-Processing 2 | -------------------------------------------------------------------------------- /presentation/images/background_marp.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/background_marp.jpg -------------------------------------------------------------------------------- /presentation/images/benchmark_memory_total.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/benchmark_memory_total.jpg -------------------------------------------------------------------------------- /presentation/images/benchmark_times_total.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrikHlobil/Local-Big-Data-Processing/main/presentation/images/benchmark_times_total.jpg -------------------------------------------------------------------------------- /execute_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Array of customer counts 4 | 5 | CUSTOMER_COUNTS=(1000 10000 100000 1000000 10000000) 6 | 7 | for COUNT in "${CUSTOMER_COUNTS[@]}"; do 8 | echo "Running benchmark for number_of_customers=$COUNT" 9 | python scripts/create_data.py --n-customers "$COUNT" 10 | python scripts/pandas_query.py 11 | python scripts/duckdb_query.py 12 | python scripts/duck_db_query_with_persistent_database.py 13 | python scripts/dask_query.py 14 | python scripts/polars_query.py 15 | done -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "local-big-data-processing" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.11" 7 | dependencies = [ 8 | "dask[complete]>=2025.9.0", 9 | "duckdb>=1.3.2", 10 | "faker>=37.6.0", 11 | "ipdb>=0.13.13", 12 | "matplotlib>=3.10.6", 13 | "numpy>=2.3.2", 14 | "pandas>=2.3.2", 15 | "polars>=1.33.1", 16 | "psutil>=7.0.0", 17 | "pyarrow>=21.0.0", 18 | "rich>=14.1.0", 19 | ] 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build.targets.wheel] 26 | packages = ["utils", "scripts"] 27 | 
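The benchmark loop in `execute_benchmark.sh` above writes its timings into the SQLite database managed by `utils.save_benchmark` (see `utils/__init__.py` further down). A minimal sketch for inspecting the raw results with pandas, using the same DB path and `benchmarks` table defined there:

```python
import sqlite3

import pandas as pd

# Same DB file and table that utils.save_benchmark writes to
conn = sqlite3.connect("/tmp/local-big-data-processing-benchmark.db")
df = pd.read_sql_query(
    "SELECT * FROM benchmarks ORDER BY number_of_customers, tool_name", conn
)
conn.close()
print(df)
```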
-------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | # Ruff: lint + fix + format 3 | - repo: https://github.com/astral-sh/ruff-pre-commit 4 | rev: v0.12.10 # Use latest release 5 | hooks: 6 | - id: ruff 7 | args: [--fix] # auto-fix lint issues, remove unused imports, sort imports 8 | - id: ruff-format # black-compatible formatting 9 | 10 | # Pre-commit official hooks 11 | - repo: https://github.com/pre-commit/pre-commit-hooks 12 | rev: v6.0.0 13 | hooks: 14 | - id: trailing-whitespace # removes trailing spaces 15 | - id: end-of-file-fixer # ensures newline at EOF 16 | - id: mixed-line-ending # normalizes line endings 17 | - id: check-yaml # validates YAML 18 | - id: check-toml # validates TOML 19 | - id: check-json # validates JSON 20 | - id: check-added-large-files # prevents huge files from being committedpre 21 | - repo: local 22 | hooks: 23 | - id: ty 24 | name: Run Typechecker ty 25 | entry: uvx ty check scripts utils 26 | language: system 27 | -------------------------------------------------------------------------------- /scripts/polars_query.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from rich.console import Console 3 | 4 | from utils import precise_timer, save_benchmark, print_df_as_table 5 | 6 | console = Console() 7 | 8 | customers_file_pattern = "data_csv/customers-*.csv" 9 | purchases_file_pattern = "data_csv/purchases-*.csv" 10 | 11 | with save_benchmark(tool_name="Polars", benchmark_case="Total"): 12 | # ------------------------- 13 | # Lazy load data 14 | # ------------------------- 15 | customers = pl.scan_csv(customers_file_pattern, has_header=True) 16 | purchases = pl.scan_csv(purchases_file_pattern, has_header=True) 17 | 18 | # ------------------------- 19 | # Base Query: Top customers by number of purchases 20 | # ------------------------- 21 | with save_benchmark(tool_name="Polars", benchmark_case="Base Query"): 22 | df_base_query = ( 23 | purchases.group_by("customer_id") 24 | .agg(n_purchases=pl.len()) 25 | .sort(["n_purchases", "customer_id"], descending=[True, False]) 26 | .limit(10) 27 | .collect() 28 | ) 29 | 30 | print_df_as_table(df_base_query.to_pandas()) 31 | 32 | # ------------------------- 33 | # Complex Query: Pivot 34 | # ------------------------- 35 | with save_benchmark(tool_name="Polars", benchmark_case="Complex Query"): 36 | # Extract year lazily 37 | purchases_with_year = purchases.with_columns( 38 | pl.col("purchase_date").str.strptime(pl.Datetime).dt.year().alias("year") 39 | ) 40 | 41 | # Join lazily 42 | base = customers.join(purchases_with_year, on="customer_id") 43 | 44 | # Group by city, payment_method, year and sum amounts (still lazy) 45 | grouped = base.group_by(["city", "payment_method", "year"]).agg( 46 | total_spent=pl.sum("amount") 47 | ) 48 | 49 | # Collect only the aggregated result (much smaller than full dataset) 50 | grouped_df = grouped.collect(streaming=True) 51 | 52 | pivot_df = grouped_df.pivot( 53 | on="year", 54 | index=["city", "payment_method"], 55 | values="total_spent", 56 | aggregate_function="sum", 57 | sort_columns=True, 58 | ).sort(["city", "payment_method"]) 59 | 60 | print_df_as_table(pivot_df.to_pandas()) 61 | -------------------------------------------------------------------------------- /scripts/duckdb_query.py: -------------------------------------------------------------------------------- 1 
| import duckdb 2 | from rich.console import Console 3 | from rich.table import Table 4 | 5 | from utils import ( 6 | precise_timer, 7 | save_benchmark, 8 | print_df_as_table, 9 | ) 10 | 11 | console = Console() 12 | 13 | 14 | # Connect to DuckDB (in memory DB): 15 | con = duckdb.connect() 16 | 17 | customers_file_pattern = "data_csv/customers-*.csv" 18 | purchases_file_pattern = "data_csv/purchases-*.csv" 19 | 20 | with save_benchmark(tool_name="DuckDB", benchmark_case="Total"): 21 | # Create views 22 | con.execute( 23 | f"CREATE VIEW customers AS SELECT * FROM read_csv_auto({customers_file_pattern!r}, HEADER=TRUE)" 24 | ) 25 | con.execute( 26 | f"CREATE VIEW purchases AS SELECT * FROM read_csv_auto({purchases_file_pattern!r}, HEADER=TRUE)" 27 | ) 28 | 29 | with save_benchmark(tool_name="DuckDB", benchmark_case="Base Query"): 30 | # Example query: top 5 cities by total spent 31 | query = """ 32 | SELECT 33 | customer_id, 34 | COUNT(*) AS n_purchases, 35 | FROM purchases 36 | GROUP BY customer_id 37 | ORDER BY n_purchases DESC, customer_id ASC 38 | LIMIT 10 39 | """ 40 | 41 | result = con.execute(query).fetchall() 42 | 43 | # Create a Rich Table 44 | table = Table(title="Top 10 Most Acitve Customers") 45 | 46 | # Add columns (DuckDB returns tuples) 47 | table.add_column("Customer ID", style="cyan", justify="left") 48 | table.add_column("Number of Purchases", style="magenta", justify="right") 49 | 50 | # Add rows 51 | for row in result: 52 | customer_id, n_purchases = row 53 | table.add_row( 54 | str(customer_id), 55 | str(n_purchases), 56 | ) 57 | 58 | # Display the table 59 | console.print(table) 60 | 61 | with save_benchmark(tool_name="DuckDB", benchmark_case="Complex Query"): 62 | ### More complex query: 63 | query = """ 64 | WITH base_query AS (SELECT 65 | c.city, 66 | p.payment_method, 67 | p.amount, 68 | EXTRACT(year FROM p.purchase_date) AS year 69 | FROM customers c 70 | JOIN purchases p ON c.customer_id = p.customer_id) 71 | 72 | PIVOT base_query 73 | on year 74 | USING SUM(AMOUNT) 75 | ORDER BY city, payment_method 76 | """ 77 | 78 | result = con.execute(query).fetchdf() 79 | print_df_as_table(result) 80 | -------------------------------------------------------------------------------- /scripts/pandas_query.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | 4 | import pandas as pd 5 | from rich.console import Console 6 | 7 | from utils import ( 8 | get_number_of_customers, 9 | precise_timer, 10 | save_benchmark, 11 | print_df_as_table, 12 | ) 13 | 14 | console = Console() 15 | 16 | customers_file_pattern = "data_csv/customers-*.csv" 17 | purchases_file_pattern = "data_csv/purchases-*.csv" 18 | 19 | if get_number_of_customers() > 1_000_000: 20 | # Skip since otherwise out of memory errors. 
21 | sys.exit(0) 22 | 23 | 24 | # ---------------------------- 25 | # Load CSVs into Pandas 26 | # ---------------------------- 27 | def load_csvs(pattern: str) -> pd.DataFrame: 28 | files = glob.glob(pattern) 29 | dfs = [pd.read_csv(f) for f in files] 30 | return pd.concat(dfs, ignore_index=True) 31 | 32 | 33 | with save_benchmark(tool_name="Pandas", benchmark_case="Total"): 34 | with precise_timer("Load DataFrames"): 35 | customers = load_csvs(customers_file_pattern) 36 | purchases = load_csvs(purchases_file_pattern) 37 | 38 | # Make sure dates are parsed properly 39 | customers["signup_date"] = pd.to_datetime(customers["signup_date"], errors="coerce") 40 | purchases["purchase_date"] = pd.to_datetime( 41 | purchases["purchase_date"], errors="coerce" 42 | ) 43 | 44 | # ---------------------------- 45 | # Query 1: Top cities by total spent 46 | # ---------------------------- 47 | with precise_timer("Base Query"): 48 | merged = purchases.merge(customers, on="customer_id") 49 | 50 | df_base_query = ( 51 | merged.groupby("city") 52 | .agg( 53 | n_purchases=("purchase_id", "count"), 54 | total_spent=("amount", "sum"), 55 | avg_number_of_items=("number_of_items", "mean"), 56 | ) 57 | .reset_index() 58 | .sort_values("total_spent", ascending=False) 59 | ) 60 | 61 | print_df_as_table(df_base_query) 62 | 63 | # ---------------------------- 64 | # Query 2: Pivot by year and payment method 65 | # ---------------------------- 66 | with precise_timer("Complex Query"): 67 | purchases["year"] = purchases["purchase_date"].dt.year 68 | 69 | base = purchases.merge(customers, on="customer_id") 70 | 71 | pivot = ( 72 | base.pivot_table( 73 | index=["city", "payment_method"], 74 | columns="year", 75 | values="amount", 76 | aggfunc="sum", 77 | fill_value=0, 78 | ) 79 | .reset_index() 80 | .sort_values(by=["city", "payment_method"]) 81 | ) 82 | 83 | print_df_as_table(pivot) 84 | -------------------------------------------------------------------------------- /scripts/dask_query.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import dask.dataframe as dd 4 | from utils import precise_timer, save_benchmark, print_df_as_table 5 | from dask.distributed import Client, LocalCluster 6 | 7 | 8 | def main(): 9 | customers_file_pattern = "data_csv/customers-*.csv" 10 | purchases_file_pattern = "data_csv/purchases-*.csv" 11 | 12 | with save_benchmark(tool_name="Dask", benchmark_case="Total"): 13 | # ------------------------- 14 | # Load CSVs lazily 15 | # ------------------------- 16 | customers = dd.read_csv( 17 | customers_file_pattern, assume_missing=True, blocksize=None 18 | ) 19 | purchases = dd.read_csv( 20 | purchases_file_pattern, assume_missing=True, blocksize=None 21 | ) 22 | 23 | # Ensure dates are parsed 24 | customers["signup_date"] = dd.to_datetime( 25 | customers["signup_date"], errors="coerce" 26 | ) 27 | purchases["purchase_date"] = dd.to_datetime( 28 | purchases["purchase_date"], errors="coerce" 29 | ) 30 | 31 | # ------------------------- 32 | # Base Query: Top customers by number of purchases 33 | # ------------------------- 34 | with save_benchmark(tool_name="Dask", benchmark_case="Base Query"): 35 | df_base_query = ( 36 | purchases.groupby("customer_id") 37 | .agg(n_purchases=("purchase_id", "count")) 38 | .reset_index() 39 | .compute() 40 | .sort_values(["n_purchases", "customer_id"], ascending=[False, True]) 41 | .head(10) 42 | ) 43 | 44 | print_df_as_table(df_base_query) 45 | 46 | # ------------------------- 47 | # Complex Query: Pivot by 
year 48 | # ------------------------- 49 | with save_benchmark(tool_name="Dask", benchmark_case="Complex Query"): 50 | purchases["year"] = purchases["purchase_date"].dt.year 51 | 52 | base = customers.merge(purchases, on="customer_id") 53 | 54 | # Group by city, payment_method, year, sum amounts 55 | grouped = ( 56 | base.groupby(["city", "payment_method", "year"]) 57 | .agg({"amount": "sum"}) 58 | .reset_index() 59 | ) 60 | grouped = grouped.rename(columns={"amount": "total_spent"}) 61 | 62 | # Compute before pivot (Dask pivot not fully lazy) 63 | grouped_df = grouped.compute() 64 | 65 | # Pivot using pandas (Dask doesn’t fully support pivot for multi-index) 66 | pivot_df = grouped_df.pivot_table( 67 | index=["city", "payment_method"], 68 | columns="year", 69 | values="total_spent", 70 | aggfunc="sum", 71 | fill_value=0, 72 | ).sort_index() 73 | 74 | # Display pivot with Rich 75 | print_df_as_table( 76 | pivot_df.reset_index(), 77 | ) 78 | 79 | 80 | if __name__ == "__main__": 81 | n_cores = os.cpu_count() 82 | 83 | # Create a local cluster using 1 process per core 84 | cluster = LocalCluster(n_workers=n_cores, threads_per_worker=1, memory_limit="auto") 85 | client = Client(cluster) 86 | print(f"Dashboard link: {client.dashboard_link}") 87 | main() 88 | -------------------------------------------------------------------------------- /scripts/plot_benchmark_results.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | DB_FILE = "/tmp/local-big-data-processing-benchmark.db" 7 | 8 | 9 | def fetch_benchmark_data(): 10 | conn = sqlite3.connect(DB_FILE) 11 | df = pd.read_sql_query("SELECT * FROM benchmarks", conn) 12 | conn.close() 13 | return df 14 | 15 | 16 | def plot_single_case(df, case, filename): 17 | # Filter for the specific benchmark case 18 | df_case = df[df["benchmark_case"] == case] 19 | pivot = df_case.pivot_table( 20 | index="number_of_customers", 21 | columns="tool_name", 22 | values="duration_seconds", 23 | ).sort_index() 24 | 25 | ax = pivot.plot(kind="bar", logy=True, figsize=(12, 7), width=0.8) 26 | ax.set_xlabel("Number of Customers") 27 | ax.set_ylabel("Execution Time (seconds, log scale)") 28 | ax.set_title(f"Benchmark: Execution Time for {case} (log scale)") 29 | ax.set_xticklabels([f"{int(idx):,}" for idx in pivot.index], rotation=45) 30 | plt.tight_layout() 31 | 32 | # Add duration labels on bars 33 | for container in ax.containers: 34 | for bar in container: 35 | height = bar.get_height() 36 | if height > 0: 37 | ax.annotate( 38 | f"{height:.2f}", 39 | xy=(bar.get_x() + bar.get_width() / 2, height), 40 | xytext=(0, 3), # 3 points vertical offset 41 | textcoords="offset points", 42 | ha="center", 43 | va="bottom", 44 | fontsize=8, 45 | rotation=90, 46 | ) 47 | 48 | plt.savefig(filename, dpi=200) 49 | plt.close() 50 | 51 | 52 | def plot_memory_usage(df, filename): 53 | # Filter for the "Total" benchmark case 54 | df_case = df[df["benchmark_case"] == "Total"] 55 | pivot = df_case.pivot_table( 56 | index="number_of_customers", 57 | columns="tool_name", 58 | values="peak_memory_mb", 59 | ).sort_index() 60 | 61 | ax = pivot.plot(kind="bar", figsize=(12, 7), width=0.8) 62 | ax.set_xlabel("Number of Customers") 63 | ax.set_ylabel("Peak Memory Usage (MB)") 64 | ax.set_title("Benchmark: Peak Memory Usage for Total (MB)") 65 | ax.set_xticklabels([f"{int(idx):,}" for idx in pivot.index], rotation=45) 66 | plt.tight_layout() 67 | 68 | # Add 
memory labels on bars 69 | for container in ax.containers: 70 | for bar in container: 71 | height = bar.get_height() 72 | if height > 0: 73 | ax.annotate( 74 | f"{height:.1f}", 75 | xy=(bar.get_x() + bar.get_width() / 2, height), 76 | xytext=(0, 3), 77 | textcoords="offset points", 78 | ha="center", 79 | va="bottom", 80 | fontsize=8, 81 | rotation=90, 82 | ) 83 | 84 | plt.savefig(filename, dpi=200) 85 | plt.close() 86 | 87 | 88 | def plot_benchmark_queries(df): 89 | plot_single_case(df, "Base Query", "benchmark_times_base_query.jpg") 90 | plot_single_case(df, "Complex Query", "benchmark_times_complex_query.jpg") 91 | plot_single_case(df, "Total", "benchmark_times_total.jpg") 92 | plot_memory_usage(df, "benchmark_memory_total.jpg") 93 | 94 | 95 | if __name__ == "__main__": 96 | df = fetch_benchmark_data() 97 | plot_benchmark_queries(df) 98 | -------------------------------------------------------------------------------- /scripts/duck_db_query_with_persistent_database.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import duckdb 4 | from rich.console import Console 5 | from rich.table import Table 6 | from rich import print 7 | 8 | from utils import save_benchmark, precise_timer 9 | 10 | console = Console() 11 | 12 | # Connect to DuckDB (persistent DB): 13 | DUCK_DB_FILE = "/tmp/test.duckdb" 14 | if os.path.exists(DUCK_DB_FILE): 15 | os.remove(DUCK_DB_FILE) 16 | 17 | 18 | def load_tables(): 19 | con = duckdb.connect(DUCK_DB_FILE) 20 | 21 | customers_file_pattern = "data_csv/customers-*.csv" 22 | purchases_file_pattern = "data_csv/purchases-*.csv" 23 | 24 | # Create views 25 | with precise_timer("Create and load to Tables"): 26 | con.execute( 27 | f"CREATE TABLE customers AS SELECT * FROM read_csv_auto({customers_file_pattern!r}, HEADER=TRUE)" 28 | ) 29 | con.execute( 30 | f"CREATE TABLE purchases AS SELECT * FROM read_csv_auto({purchases_file_pattern!r}, HEADER=TRUE)" 31 | ) 32 | 33 | 34 | def execute_queries(): 35 | con = duckdb.connect(DUCK_DB_FILE) 36 | with save_benchmark( 37 | tool_name="DuckDB (persistent storage)", benchmark_case="Total" 38 | ): 39 | with save_benchmark( 40 | tool_name="DuckDB (persistent storage)", benchmark_case="Base Query" 41 | ): 42 | # Example query: top 5 cities by total spent 43 | query = """ 44 | SELECT 45 | c.city, 46 | COUNT(*) AS n_purchases, 47 | SUM(p.amount) AS total_spent, 48 | AVG(p.number_of_items) as avg_number_of_items 49 | FROM customers c 50 | JOIN purchases p ON c.customer_id = p.customer_id 51 | GROUP BY c.city 52 | ORDER BY total_spent DESC 53 | """ 54 | 55 | result = con.execute(query).fetchall() 56 | 57 | # Create a Rich Table 58 | table = Table(title="Cities by Total Spent") 59 | 60 | # Add columns (DuckDB returns tuples) 61 | table.add_column("City", style="cyan", justify="left") 62 | table.add_column("Number of Purchases", style="magenta", justify="right") 63 | table.add_column("Total Spent", style="green", justify="right") 64 | table.add_column( 65 | "Average number of items per purchase", style="red", justify="right" 66 | ) 67 | 68 | # Add rows 69 | for row in result: 70 | city, n_purchases, total_spent, avg_items = row 71 | table.add_row( 72 | city, 73 | f"{n_purchases:,}", 74 | f"${total_spent:,.2f}", 75 | f"{avg_items:,.2f}", 76 | ) 77 | 78 | # Display the table 79 | console.print(table) 80 | 81 | with save_benchmark( 82 | tool_name="DuckDB (persistent storage)", benchmark_case="Complex Query" 83 | ): 84 | ### More complex query: 85 | query = """ 86 | WITH base_query AS 
(SELECT 87 | c.city, 88 | p.payment_method, 89 | p.amount, 90 | EXTRACT(year FROM p.purchase_date) AS year 91 | FROM customers c 92 | JOIN purchases p ON c.customer_id = p.customer_id) 93 | 94 | PIVOT base_query 95 | on year 96 | USING SUM(AMOUNT) 97 | ORDER BY city, payment_method 98 | """ 99 | 100 | result = con.execute(query).fetchdf() 101 | print(result) 102 | 103 | 104 | if __name__ == "__main__": 105 | load_tables() 106 | execute_queries() 107 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import json 3 | from pathlib import Path 4 | import sqlite3 5 | import time 6 | import os 7 | import psutil 8 | import threading 9 | from contextlib import contextmanager 10 | 11 | from rich import print 12 | 13 | from rich.table import Table 14 | from rich.console import Console 15 | from typing import Optional 16 | 17 | console = Console() 18 | 19 | DB_FILE = "/tmp/local-big-data-processing-benchmark.db" 20 | DATA_DIR = Path(__file__).parent.parent / "data_csv" 21 | db_conn = sqlite3.connect(DB_FILE) 22 | 23 | 24 | def get_number_of_customers() -> int: 25 | with open(DATA_DIR / "config.json") as f: 26 | return json.load(f)["number_of_customers"] 27 | 28 | 29 | def create_benchmark_db(): 30 | cur = db_conn.cursor() 31 | cur.execute( 32 | """ 33 | CREATE TABLE IF NOT EXISTS benchmarks ( 34 | tool_name TEXT NOT NULL, 35 | benchmark_case TEXT NOT NULL, 36 | number_of_customers INTEGER NOT NULL, 37 | duration_seconds REAL NOT NULL, 38 | peak_memory_mb REAL, 39 | UNIQUE(tool_name, benchmark_case, number_of_customers) 40 | ) 41 | """ 42 | ) 43 | db_conn.commit() 44 | 45 | 46 | def add_or_update_benchmark_entry( 47 | tool_name: str, 48 | benchmark_case: str, 49 | number_of_customers: int, 50 | duration_seconds: float, 51 | peak_memory_mb: float = None, 52 | ) -> None: 53 | create_benchmark_db() 54 | cur = db_conn.cursor() 55 | cur.execute( 56 | """ 57 | INSERT INTO benchmarks (tool_name, benchmark_case, number_of_customers, duration_seconds, peak_memory_mb) 58 | VALUES (?, ?, ?, ?, ?) 
59 | ON CONFLICT(tool_name, benchmark_case, number_of_customers) DO UPDATE 60 | SET duration_seconds = excluded.duration_seconds, 61 | peak_memory_mb = excluded.peak_memory_mb 62 | """, 63 | ( 64 | tool_name, 65 | benchmark_case, 66 | number_of_customers, 67 | duration_seconds, 68 | peak_memory_mb, 69 | ), 70 | ) 71 | db_conn.commit() 72 | 73 | 74 | @contextmanager 75 | def precise_timer(task_name: str): 76 | start = time.perf_counter() 77 | yield 78 | end = time.perf_counter() 79 | duration = end - start 80 | print(f"[bold]Elapsed time ({task_name}):[/bold] {duration:.6f} seconds") 81 | 82 | 83 | @contextmanager 84 | def save_benchmark(tool_name: str, benchmark_case: str): 85 | process = psutil.Process(os.getpid()) 86 | peak_rss = 0 87 | running = True 88 | 89 | def sampler(): 90 | nonlocal peak_rss 91 | while running: 92 | rss = process.memory_info().rss 93 | peak_rss = max(peak_rss, rss) 94 | time.sleep(0.01) 95 | 96 | thread = threading.Thread(target=sampler, daemon=True) 97 | thread.start() 98 | start = time.perf_counter() 99 | try: 100 | yield 101 | finally: 102 | running = False 103 | thread.join() 104 | end = time.perf_counter() 105 | duration = end - start 106 | peak_memory_mb = peak_rss / 1024**2 107 | print(f"Elapsed time ({tool_name} - {benchmark_case}): {duration:.6f} seconds") 108 | print(f"Peak Memory: {peak_memory_mb:.2f} MB") 109 | add_or_update_benchmark_entry( 110 | tool_name, 111 | benchmark_case, 112 | get_number_of_customers(), 113 | duration, 114 | peak_memory_mb, 115 | ) 116 | 117 | 118 | def print_df_as_table(df, title: Optional[str] = None): 119 | """ 120 | Convert a Pandas DataFrame to a Rich table and print it. 121 | 122 | Args: 123 | df (pd.DataFrame): The DataFrame to display. 124 | title (str, optional): Title of the table. 125 | """ 126 | table = Table(title=title) 127 | 128 | # Add columns 129 | for col in df.columns: 130 | table.add_column( 131 | str(col), justify="right" if df[col].dtype.kind in "if" else "left" 132 | ) 133 | 134 | # Add rows with formatting 135 | for _, row in df.iterrows(): 136 | cells = [str(row[col]) for col in df.columns] 137 | table.add_row(*cells) 138 | 139 | console.print(table) 140 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | 209 | .idea/ 210 | data/ 211 | data_csv/ 212 | *.jpg 213 | !presentation/images/* 214 | *.html -------------------------------------------------------------------------------- /scripts/create_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import csv 4 | import shutil 5 | import argparse 6 | 7 | import numpy as np 8 | from uuid import uuid4 9 | from datetime import datetime, timedelta 10 | from dataclasses import dataclass, fields 11 | from multiprocessing import Pool, cpu_count 12 | 13 | from utils import DATA_DIR 14 | 15 | # ---------------------------- 16 | # Adjustable global variables 17 | # ---------------------------- 18 | AVG_PURCHASES = 20 # average purchases per customer (Poisson mean) 19 | CUSTOMERS_PER_FILE = 500_000 # customer rows per customers-.csv 20 | PURCHASES_PER_FILE = 2_000_000 # target purchase rows per purchases-.csv 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser( 25 | description="Generate synthetic customer and purchase data." 
26 | ) 27 | parser.add_argument( 28 | "--n-customers", 29 | type=int, 30 | required=True, 31 | help="Total number of customers to generate.", 32 | ) 33 | return parser.parse_args() 34 | 35 | 36 | # ---------------------------- 37 | # Dataclasses (for headers) 38 | # ---------------------------- 39 | @dataclass 40 | class Customer: 41 | customer_id: int 42 | city: str 43 | age: int 44 | signup_date: str 45 | 46 | 47 | @dataclass 48 | class Purchase: 49 | purchase_id: str 50 | customer_id: int 51 | amount: float 52 | number_of_items: int 53 | purchase_date: str 54 | payment_method: str 55 | 56 | 57 | # ---------------------------- 58 | # Helpers 59 | # ---------------------------- 60 | def generate_customers_chunk(start_id: int, n_customers: int): 61 | cities = np.array( 62 | ["New York", "Berlin", "Tokyo", "San Francisco", "Paris", "London", "Sydney"] 63 | ) 64 | city_probs = np.array([0.2, 0.15, 0.1, 0.1, 0.15, 0.2, 0.1]) 65 | 66 | ids = np.arange(start_id, start_id + n_customers, dtype=np.int64) 67 | city_choices = np.random.choice(cities, size=n_customers, p=city_probs) 68 | ages = np.clip(np.random.normal(40, 12, size=n_customers).astype(int), 18, 90) 69 | 70 | days = np.random.randint(0, 5 * 365, size=n_customers) 71 | signup_dates = [ 72 | (datetime.today() - timedelta(days=int(d))).date().isoformat() for d in days 73 | ] 74 | 75 | return ids.tolist(), city_choices.tolist(), ages.tolist(), signup_dates 76 | 77 | 78 | def generate_purchases_from_counts(counts_slice, start_customer_id): 79 | counts = np.asarray(counts_slice, dtype=np.int64) 80 | n_customers = len(counts) 81 | total_purchases = int(counts.sum()) 82 | if total_purchases == 0: 83 | return [], [], [], [], [], [] 84 | 85 | cust_id_array = np.repeat( 86 | np.arange(start_customer_id, start_customer_id + n_customers, dtype=np.int64), 87 | counts, 88 | ) 89 | 90 | amounts = np.round( 91 | np.random.exponential(scale=50.0, size=total_purchases), 2 92 | ).tolist() 93 | number_of_items = np.maximum( 94 | 1, np.random.poisson(lam=3, size=total_purchases) 95 | ).tolist() 96 | 97 | payment_methods = np.random.choice( 98 | ["credit_card", "paypal", "bank_transfer", "apple_pay"], 99 | size=total_purchases, 100 | p=[0.6, 0.2, 0.15, 0.05], 101 | ).tolist() 102 | 103 | days = np.random.randint(0, 5 * 365, size=total_purchases) 104 | purchase_dates = [ 105 | (datetime.today() - timedelta(days=int(d))).date().isoformat() for d in days 106 | ] 107 | 108 | purchase_ids = [str(uuid4()) for _ in range(total_purchases)] 109 | 110 | return ( 111 | purchase_ids, 112 | cust_id_array.tolist(), 113 | amounts, 114 | number_of_items, 115 | purchase_dates, 116 | payment_methods, 117 | ) 118 | 119 | 120 | # ---------------------------- 121 | # Writers 122 | # ---------------------------- 123 | def write_customers_file(args): 124 | file_index, start_id, n_customers = args 125 | filename = os.path.join(DATA_DIR, f"customers-{file_index}.csv") 126 | ids, cities, ages, signup_dates = generate_customers_chunk(start_id, n_customers) 127 | headers = [f.name for f in fields(Customer)] 128 | with open(filename, "w", newline="") as f: 129 | writer = csv.writer(f) 130 | writer.writerow(headers) 131 | writer.writerows(zip(ids, cities, ages, signup_dates)) 132 | print(f"✅ Wrote {n_customers:,} customers to {filename}") 133 | 134 | 135 | def write_purchases_file(args): 136 | file_index, start_customer_id, counts_slice = args 137 | filename = os.path.join(DATA_DIR, f"purchases-{file_index}.csv") 138 | (pids, cust_ids, amounts, items, pdates, methods) = 
generate_purchases_from_counts( 139 | counts_slice, start_customer_id 140 | ) 141 | 142 | if len(pids) == 0: 143 | print(f"ℹ️ Skipping {filename} (0 purchases)") 144 | return 145 | 146 | headers = [f.name for f in fields(Purchase)] 147 | with open(filename, "w", newline="") as f: 148 | writer = csv.writer(f) 149 | writer.writerow(headers) 150 | writer.writerows(zip(pids, cust_ids, amounts, items, pdates, methods)) 151 | 152 | print(f"✅ Wrote purchases chunk to {filename} ({len(pids):,} rows)") 153 | 154 | 155 | # ---------------------------- 156 | # Partitioning for purchases 157 | # ---------------------------- 158 | def compute_purchase_partitions(purchases_per_customer, target_rows_per_file): 159 | partitions = [] 160 | N = len(purchases_per_customer) 161 | i = 0 162 | while i < N: 163 | cum = 0 164 | j = i 165 | while j < N and cum < target_rows_per_file: 166 | cum += int(purchases_per_customer[j]) 167 | j += 1 168 | if j == i: # ensure progress 169 | j = i + 1 170 | partitions.append((i, j)) 171 | i = j 172 | return partitions 173 | 174 | 175 | # ---------------------------- 176 | # Main 177 | # ---------------------------- 178 | def main(): 179 | args = parse_args() 180 | number_of_customers = args.n_customers 181 | 182 | if os.path.exists(DATA_DIR): 183 | shutil.rmtree(DATA_DIR) 184 | os.makedirs(DATA_DIR) 185 | 186 | np.random.seed(42) 187 | 188 | with open(DATA_DIR / "config.json", "w") as f: 189 | json.dump({"number_of_customers": number_of_customers}, f) 190 | 191 | purchases_per_customer = np.random.poisson(AVG_PURCHASES, size=number_of_customers) 192 | total_purchases = int(purchases_per_customer.sum()) 193 | print( 194 | f"Total customers: {number_of_customers:,}, total (simulated) purchases: {total_purchases:,}" 195 | ) 196 | 197 | partitions = compute_purchase_partitions(purchases_per_customer, PURCHASES_PER_FILE) 198 | print( 199 | f"Will create {len(partitions)} purchase files (target ~{PURCHASES_PER_FILE:,} rows each)" 200 | ) 201 | 202 | customer_tasks = [] 203 | n_customer_files = ( 204 | number_of_customers + CUSTOMERS_PER_FILE - 1 205 | ) // CUSTOMERS_PER_FILE 206 | for i in range(n_customer_files): 207 | start_id = i * CUSTOMERS_PER_FILE + 1 208 | n_rows = min(CUSTOMERS_PER_FILE, number_of_customers - (i * CUSTOMERS_PER_FILE)) 209 | customer_tasks.append((i + 1, start_id, n_rows)) 210 | 211 | purchase_tasks = [] 212 | for idx, (start_idx, end_idx) in enumerate(partitions, start=1): 213 | start_customer_id = start_idx + 1 214 | counts_slice = purchases_per_customer[start_idx:end_idx].tolist() 215 | purchase_tasks.append((idx, start_customer_id, counts_slice)) 216 | 217 | with Pool(processes=cpu_count()) as pool: 218 | pool.map(write_customers_file, customer_tasks) 219 | pool.map(write_purchases_file, purchase_tasks) 220 | 221 | print("✅ All CSV files generated (with multiprocessing pool).") 222 | 223 | 224 | if __name__ == "__main__": 225 | main() 226 | -------------------------------------------------------------------------------- /presentation/20250915_Local-Big-Data-Processing-with-Python.md: -------------------------------------------------------------------------------- 1 | --- 2 | paginate: true 3 | marp: true 4 | theme: uncover 5 | style: | 6 | .small { 7 | font-size: 0.2rem; 8 | } 9 | 10 | .red-text { 11 | color: red; 12 | } 13 | 14 | .blue-text { 15 | color: blue; 16 | } 17 | 18 | .mx-2 { 19 | margin-left: 1em; 20 | margin-right: 1em; 21 | } 22 | 23 | .bold { 24 | font-weight: bold; 25 | } 26 | 27 | .green-text { 28 | color: green; 29 | } 30 | 31 | h1 { 32 
| font-size: 40px; 33 | } 34 | 35 | h2 { 36 | font-size: 35px; 37 | } 38 | 39 | h3 { 40 | font-size: 30px; 41 | } 42 | 43 | h4 { 44 | font-size: 28px; 45 | } 46 | 47 | h5,h6,p,li,code,table { 48 | font-size: 25px; 49 | } 50 | 51 | .container{ 52 | display: flex; 53 | gap: 1rem; 54 | } 55 | 56 | .col{ 57 | flex: 1; 58 | } 59 | headingDivider: 1 60 | math: mathjax 61 | backgroundImage: url('images/background_marp.jpg') 62 | --- 63 | 64 | # Local Big Data Processing with Python 65 | 66 | 15.09.2025 67 | 68 | # Outline 69 | 70 | 1. Motivation 71 | 72 | 2. Baseline Benchmark (Pandas) 73 | 74 | 3. DuckDB 75 | 76 | 4. Polars 77 | 78 | 5. Dask 79 | 80 | 6. Other tools 81 | 82 | # Motivation 83 | 84 | Modern Computers/Servers deliver enough performance to process larger amounts of data on a single machine using **Vertical Scaling (Multicore Processing)** without the need to scale **Horizontally (Distributed Computing on multiple Servers)**. 85 | 86 | --- 87 | 88 | | Aspect | Horizontal (scale-out) | Vertical (scale-up) | 89 | | ------------- | ----------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------: | 90 | | Pros | Virtually unlimited capacity; better fault tolerance; elastic | Simpler programming; lower coordination overhead; no network latency for local work | 91 | | Cons | More complex (coordination, networking); network/IO can bottleneck; higher ops cost | Finite limits (hardware), can be costly, single point of failure | 92 | | Typical tools | Dask, Ray, PySpark, Hadoop/Spark | DuckDB, Polars, Vaex, RAPIDS/cuDF | 93 | 94 | 95 | # Benchmarking Example 96 | 97 | #### Customers 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 |
| customer_id | city | age | signup_date |
|---|---|---|---|
| 1 | San Francisco | 29 | 2025-03-16 |
| 2 | San Francisco | 60 | 2020-11-19 |
| ... | ... | ... | ... |

#### Purchases

| purchase_id | customer_id | amount | number_of_items | purchase_date | payment_method |
|---|---|---|---|---|---|
| f8822c42... | 1 | 31.49 | 3 | 2022-05-15 | apple_pay |
| 98ef3adc... | 1 | 36.31 | 5 | 2021-02-06 | credit_card |
| ... | ... | ... | ... | ... | ... |
171 | 172 | 173 | # CSV Structure 174 | 175 | ``` 176 | data_csv 177 | ├── customers-1.csv 178 | ├── customers-2.csv 179 | ├── customers-3.csv 180 | ├── customers-...csv 181 | ├── purchases-1.csv 182 | ├── purchases-2.csv 183 | ├── purchases-3.csv 184 | ├── purchases-...csv 185 | ``` 186 | 187 | 188 | # Queries 189 | 190 |
Base Query
193 | 194 | ```sql 195 | SELECT 196 | customer_id, 197 | COUNT(*) AS n_purchases, 198 | FROM purchases 199 | GROUP BY customer_id 200 | ORDER BY n_purchases DESC, customer_id ASC 201 | LIMIT 10 202 | ``` 203 | 204 | 205 | -> Simple Map/Reduce without join 206 | 207 |
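For comparison, the same base query in the Polars LazyFrame API (as in `scripts/polars_query.py` below); only the 10-row result is materialized:

```python
import polars as pl

top_customers = (
    pl.scan_csv("data_csv/purchases-*.csv")   # lazy scan over all purchase files
    .group_by("customer_id")
    .agg(n_purchases=pl.len())
    .sort(["n_purchases", "customer_id"], descending=[True, False])
    .limit(10)
    .collect()                                # execute and materialize only 10 rows
)
```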
Complex Query
211 | 212 | ```sql 213 | WITH base_query AS ( 214 | SELECT 215 | c.city, 216 | p.payment_method, 217 | p.amount, 218 | EXTRACT(year FROM p.purchase_date) AS year 219 | FROM customers c 220 | JOIN purchases p ON c.customer_id = p.customer_id) 221 | 222 | PIVOT base_query 223 | on year 224 | USING SUM(AMOUNT) 225 | ORDER BY city, payment_method 226 | ``` 227 | 228 | -> JOIN + more complex Pivot aggregation 229 |
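The equivalent pivot in pandas (mirroring `scripts/pandas_query.py` in this repo), eager and fully in memory:

```python
import glob

import pandas as pd

customers = pd.concat(map(pd.read_csv, glob.glob("data_csv/customers-*.csv")), ignore_index=True)
purchases = pd.concat(map(pd.read_csv, glob.glob("data_csv/purchases-*.csv")), ignore_index=True)
purchases["year"] = pd.to_datetime(purchases["purchase_date"], errors="coerce").dt.year

pivot = (
    purchases.merge(customers, on="customer_id")
    .pivot_table(
        index=["city", "payment_method"],  # one row per (city, payment_method)
        columns="year",                    # one column per purchase year
        values="amount",
        aggfunc="sum",
        fill_value=0,
    )
    .sort_index()
)
```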
231 | 232 | # Pandas (Baseline) 233 | 234 | 235 |
Pros ✅
241 | 242 | - Weapon of choice for interactive data analysis of small, in-memory datasets 243 | - Rich API and documentation 244 | - Fast for iterative analysis since the data is loaded into memory only once 245 | - Wide IO support (CSV/JSON/Excel/HDF5) and good tooling for prototyping 246 | - Great integration with other tools (ML libs, visualization, ...) 247 | 248 |
Cons ❌
254 | 255 | - Memory-bound: requires the dataset (or working set) to fit in RAM (a chunked-reading workaround is sketched below) 256 | - No built-in parallelization (single-threaded execution) 257 | - No built-in lazy execution or query optimizer (hard to optimize complex pipelines) 258 | 259 | 260 |
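A minimal sketch (not part of this repo's benchmark) of how chunked reading can work around the memory limit, for aggregations that reduce the data such as the base query:

```python
import glob

import pandas as pd

# Accumulate per-customer purchase counts one chunk at a time,
# so only a single chunk is ever held in memory.
totals = None
for path in glob.glob("data_csv/purchases-*.csv"):
    for chunk in pd.read_csv(path, usecols=["customer_id"], chunksize=1_000_000):
        counts = chunk.groupby("customer_id").size()
        totals = counts if totals is None else totals.add(counts, fill_value=0)

top10 = totals.sort_values(ascending=False).head(10)
```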
262 | 263 | 264 | # Dask (Distributed) 265 | 266 |
Pros ✅
272 | 273 | - Scales from single machine to large clusters with a familiar pandas-like API 274 | - Lazy execution and task scheduling (reduces unnecessary & enables out-of-memory computation) 275 | - Integrates with dask.distributed for monitoring, retries, and resource management (dashboard) 276 | - Rich ecosystem (dask.array, dask-ml) and integrations with other tooling 277 | 278 |
Cons ❌
284 | 285 | - Joins and wide shuffles can be expensive — partitioning strategy matters 286 | - Need to tune partition sizes, persist intermediates, and manage memory to avoid recomputation 287 | - More operational complexity for clusters (scheduling, workers, networking) 288 | - Debugging and predictable performance harder than single-node tools 289 |
291 | 292 | 293 | # Polars 294 | 295 |
Pros ✅
301 | 302 | - Lightning-fast DataFrame library implemented in Rust 303 | - multithreaded by default (fast CPU utilization) 304 | - LazyFrame API with query optimization available 305 | - Great for single-node vertical scaling and building performant pipelines 306 | - Easy conversion from and to Pandas Dataframes 307 | 308 |
Cons ❌
314 | 315 | - API differs from pandas — some learning curve and fewer 1:1 convenience helpers 316 | - Smaller ecosystem and fewer direct integrations than pandas (ML/visualization adapters) 317 | - Not natively distributed (use with Ray/Dask for scale-out) 318 | - Some operations like Pivot require materialization (collect) 319 | 320 |
322 | 323 | # DuckDB 324 | 325 |
Pros ✅
331 | 332 | - Fast analytical SQL engine optimized for single-node, columnar workloads (Vectorized execution & Query Planner) 333 | - Can query Parquet/CSV/Arrow in-place 334 | - Excellent Python & notebook integration (e.g. conversion to/from Pandas/Polars `dfs`) 335 | - In-Memory DB and file-based DB (like `sqlite`) 336 | 337 |
Cons ❌
343 | 344 | - Single-node only (scale-up); not a distributed cluster engine 345 | - Not intended for high-frequency transactions or streaming workloads 346 | - Primary interface is SQL — less direct Pandas-like API (convert results to pandas/polars) 347 | - Very large workloads still limited by disk I/O and single-machine resources 348 | 349 |
350 |
351 | 352 | 353 | # Benchmarking 354 | --- 355 | 356 | ![bg 80%](images/benchmark_times_total.jpg) 357 | 358 | --- 359 | ![bg 80%](images/benchmark_memory_total.jpg) 360 | 361 | # Summary 362 | | Tool | Good for | Bad for | 363 | |---|---|---| 364 | | Pandas | Interactive analysis of small to medium dataset | Datasets > Memory| 365 | | Dask | Memory-restricted hardware with Pandas-like API / Potential horizontal scaling| High networking overhead when running on single node | 366 | | Polars | Fast multithreaded single‑node analytics| No horizontal scaling / Uses a lot of memory| 367 | DuckDB | Fast single‑node SQL; in‑place CSV/Parquet/Arrow queries; optional persistent DB for fast querying | No horizontal scaling 368 | 369 | # Other Tools 370 | 371 | - Pyspark 372 | - Ray / Ray Data 373 | - Apache Arrow 374 | - ... 375 | 376 | 377 | --------------------------------------------------------------------------------