├── demo └── delay-prediction │ ├── conf │ ├── local │ │ └── .gitkeep │ ├── base │ │ ├── parameters.yml │ │ ├── parameters_data_processing.yml │ │ ├── parameters_evaluation.yml │ │ ├── parameters_training.yml │ │ └── catalog.yml │ ├── README.md │ └── logging.yml │ ├── data │ ├── 01_raw │ │ ├── .gitkeep │ │ └── shuttles.xlsx │ ├── 03_primary │ │ └── .gitkeep │ ├── 04_feature │ │ └── .gitkeep │ ├── 06_models │ │ └── .gitkeep │ ├── 08_reporting │ │ └── .gitkeep │ ├── 02_intermediate │ │ └── .gitkeep │ ├── 05_model_input │ │ └── .gitkeep │ └── 07_model_output │ │ └── .gitkeep │ ├── notebooks │ └── .gitkeep │ ├── tests │ ├── __init__.py │ ├── pipelines │ │ ├── __init__.py │ │ └── data_science │ │ │ └── test_pipeline.py │ └── test_run.py │ ├── src │ └── delay_prediction │ │ ├── pipelines │ │ ├── __init__.py │ │ ├── model_evaluation │ │ │ ├── __init__.py │ │ │ ├── pipeline.py │ │ │ └── nodes.py │ │ ├── model_training │ │ │ ├── __init__.py │ │ │ ├── pipeline.py │ │ │ └── nodes.py │ │ └── data_processing │ │ │ ├── __init__.py │ │ │ ├── pipeline.py │ │ │ └── nodes.py │ │ ├── __init__.py │ │ ├── pipeline_registry.py │ │ ├── __main__.py │ │ └── settings.py │ ├── requirements.txt │ ├── docs │ └── source │ │ ├── index.rst │ │ └── conf.py │ ├── pyproject.toml │ ├── .gitignore │ └── README.md ├── static ├── qr.png ├── deepyaman.jpg ├── juanluis.png ├── codespaces.png ├── kedro-final-pipeline.png └── kedro-horizontal-color-on-light.png ├── requirements.txt ├── .gitignore ├── sql ├── verify_nycflights13.sql ├── load_nycflights13.sql └── create_nycflights13.sql ├── solutions ├── nb01_ex01_mutate_drop.py ├── nb01_ex03.py ├── nb03_ex01_catalog.yml ├── nb01_ex02.py ├── nb01_ex01_select.py └── nb03_ex02.py ├── .devcontainer ├── Dockerfile ├── compose.yaml └── devcontainer.json ├── README.md ├── 02 - Switching Backends.ipynb ├── 00 - Welcome.ipynb ├── codespace_requirements.txt ├── 03 - First Steps with Kedro.ipynb └── 01 - Getting Started with Ibis.ipynb /demo/delay-prediction/conf/local/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/01_raw/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/base/parameters.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/03_primary/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/04_feature/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/06_models/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/08_reporting/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/02_intermediate/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/05_model_input/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/data/07_model_output/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/tests/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/base/parameters_data_processing.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /static/qr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/qr.png -------------------------------------------------------------------------------- /static/deepyaman.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/deepyaman.jpg -------------------------------------------------------------------------------- /static/juanluis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/juanluis.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r demo/delay-prediction/requirements.txt 2 | 3 | certifi>=2024.07.04 4 | jupyterlab==4.1.8 5 | -------------------------------------------------------------------------------- /static/codespaces.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/codespaces.png -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/__init__.py: -------------------------------------------------------------------------------- 1 | """Delay Prediction 2 | """ 3 | 4 | __version__ = "0.1" 5 | -------------------------------------------------------------------------------- /static/kedro-final-pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/kedro-final-pipeline.png
--------------------------------------------------------------------------------
/static/kedro-horizontal-color-on-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/kedro-horizontal-color-on-light.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.ipynb_checkpoints
2 | *.ddb
3 | *.parquet
4 | *.wal
5 | __pycache__
6 | *.html
7 | *_files/
8 | # pixi environments
9 | .pixi
10 | 
11 | 
--------------------------------------------------------------------------------
/demo/delay-prediction/data/01_raw/shuttles.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/demo/delay-prediction/data/01_raw/shuttles.xlsx
--------------------------------------------------------------------------------
/sql/verify_nycflights13.sql:
--------------------------------------------------------------------------------
1 | SELECT 'flights' table_name, COUNT(*) FROM flights UNION
2 | SELECT 'weather' table_name, COUNT(*) FROM weather
3 | ORDER BY table_name;
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline import create_pipeline
2 | 
3 | __all__ = ["create_pipeline"]
4 | 
5 | __version__ = "0.1"
--------------------------------------------------------------------------------
/sql/load_nycflights13.sql:
--------------------------------------------------------------------------------
1 | ATTACH 'dbname=postgres user=postgres' AS postgres_db (TYPE POSTGRES);
2 | INSERT INTO postgres_db.flights FROM flights;
3 | INSERT INTO postgres_db.weather FROM weather;
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_training/__init__.py:
--------------------------------------------------------------------------------
1 | """Complete model training pipeline for the delay prediction tutorial"""
2 | 
3 | from .pipeline import create_pipeline  # NOQA
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/data_processing/__init__.py:
--------------------------------------------------------------------------------
1 | """Complete data processing pipeline for the delay prediction tutorial"""
2 | 
3 | from .pipeline import create_pipeline  # NOQA
--------------------------------------------------------------------------------
/demo/delay-prediction/requirements.txt:
--------------------------------------------------------------------------------
1 | ibis-framework[duckdb,polars,examples]~=9.1
2 | ibis-ml
3 | kedro~=0.19.6
4 | kedro-datasets[ibis-duckdb,ibis-postgres]~=3.0.1
5 | kedro-telemetry>=0.3.1
6 | kedro-viz~=9.1.0
7 | scikit-learn~=1.0
--------------------------------------------------------------------------------
/solutions/nb01_ex01_mutate_drop.py:
--------------------------------------------------------------------------------
1 | # Convert the imperial units to metric, and drop the
imperial columns. 2 | flights_metric_mutate_drop = ( 3 | flights.mutate(distance_km=flights.distance * 1.609).drop("distance") 4 | ) 5 | 6 | flights_metric_mutate_drop 7 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/base/parameters_evaluation.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'evaluation' 2 | # using Kedro 0.19.6. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://docs.kedro.org/en/0.19.6/configuration/parameters.html 6 | -------------------------------------------------------------------------------- /solutions/nb01_ex03.py: -------------------------------------------------------------------------------- 1 | # Which NYC airport has the lowest percentage of outbound flights 2 | # arriving 30 or more minutes late? 3 | sol3 = ( 4 | flights.group_by("origin") 5 | .agg((flights.arr_delay.try_cast(int) >= 30).mean()) 6 | .order_by("Mean(GreaterEqual(TryCast(arr_delay, Int64), 30))") 7 | ) 8 | 9 | sol3 10 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/base/parameters_training.yml: -------------------------------------------------------------------------------- 1 | # This is a boilerplate parameters config generated for pipeline 'training' 2 | # using Kedro 0.19.6. 3 | # 4 | # Documentation for this file format can be found in "Parameters" 5 | # Link: https://docs.kedro.org/en/0.19.6/configuration/parameters.html 6 | 7 | model_options: 8 | random_state: 222 9 | -------------------------------------------------------------------------------- /solutions/nb03_ex01_catalog.yml: -------------------------------------------------------------------------------- 1 | _root_folder: /workspaces/kedro-ibis-tutorial 2 | 3 | _connection: 4 | backend: duckdb 5 | database: "${_root_folder}/nycflights13.ddb" 6 | 7 | flights: 8 | type: ibis.TableDataset 9 | table_name: flights 10 | connection: ${_connection} 11 | 12 | weather: 13 | type: ibis.TableDataset 14 | table_name: weather 15 | connection: ${_connection} 16 | -------------------------------------------------------------------------------- /solutions/nb01_ex02.py: -------------------------------------------------------------------------------- 1 | # Which airlines had the longest average arrival delays in June 2013? 
2 | sol2 = ( 3 | flights.filter( 4 | [ 5 | flights.month == 6, 6 | flights.year == 2013, 7 | ] 8 | ) 9 | .group_by("carrier") 10 | .agg(average_arr_delay=flights.arr_delay.try_cast(int).mean()) 11 | .order_by(ibis.desc("average_arr_delay")) 12 | ) 13 | 14 | sol2 15 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/pipeline.py: -------------------------------------------------------------------------------- 1 | from kedro.pipeline import Pipeline, node, pipeline 2 | 3 | from .nodes import evaluate_model 4 | 5 | 6 | def create_pipeline(**kwargs) -> Pipeline: 7 | return pipeline([ 8 | node( 9 | func=evaluate_model, 10 | inputs=["classifier", "X_test", "y_test"], 11 | outputs=None, 12 | ), 13 | ]) 14 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM mcr.microsoft.com/vscode/devcontainers/python:3.11 2 | 3 | RUN apt-get update -y && \ 4 | DEBIAN_FRONTEND=noninteractive \ 5 | apt-get install -y --no-install-recommends graphviz postgresql-client curl ca-certificates && \ 6 | rm -rf /var/lib/apt/lists/* 7 | 8 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh 9 | 10 | COPY codespace_requirements.txt / 11 | 12 | RUN /root/.cargo/bin/uv pip install --system -r codespace_requirements.txt 13 | -------------------------------------------------------------------------------- /demo/delay-prediction/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. delay_prediction documentation master file, created by sphinx-quickstart. 2 | You can adapt this file completely to your liking, but it should at least 3 | contain the root `toctree` directive. 4 | 5 | Welcome to project delay_prediction's API docs! 6 | ============================================= 7 | 8 | .. toctree:: 9 | :maxdepth: 4 10 | 11 | modules 12 | 13 | 14 | Indices and tables 15 | ================== 16 | 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/pipeline_registry.py: -------------------------------------------------------------------------------- 1 | """Project pipelines.""" 2 | from typing import Dict 3 | 4 | from kedro.framework.project import find_pipelines 5 | from kedro.pipeline import Pipeline 6 | 7 | 8 | def register_pipelines() -> Dict[str, Pipeline]: 9 | """Register the project's pipelines. 10 | 11 | Returns: 12 | A mapping from pipeline names to ``Pipeline`` objects. 13 | """ 14 | pipelines = find_pipelines() 15 | pipelines["__default__"] = sum(pipelines.values()) 16 | return pipelines 17 | -------------------------------------------------------------------------------- /solutions/nb01_ex01_select.py: -------------------------------------------------------------------------------- 1 | # Convert the imperial units to metric, and drop the imperial columns. 
2 | flights_metric_select = flights.select(
3 |     "year",
4 |     "month",
5 |     "day",
6 |     "dep_time",
7 |     "sched_dep_time",
8 |     "dep_delay",
9 |     "arr_time",
10 |     "sched_arr_time",
11 |     "arr_delay",
12 |     "carrier",
13 |     "flight",
14 |     "tailnum",
15 |     "origin",
16 |     "dest",
17 |     "air_time",
18 |     "hour",
19 |     "minute",
20 |     "time_hour",
21 |     distance_km=flights.distance * 1.609,
22 | )
23 | 
24 | flights_metric_select
25 | 
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/data_processing/pipeline.py:
--------------------------------------------------------------------------------
1 | from kedro.pipeline import Pipeline, node, pipeline
2 | 
3 | from .nodes import create_model_input_table, preprocess_flights
4 | 
5 | 
6 | def create_pipeline(**kwargs) -> Pipeline:
7 |     return pipeline(
8 |         [
9 |             node(
10 |                 func=preprocess_flights,
11 |                 inputs="flights",
12 |                 outputs="preprocessed_flights",
13 |             ),
14 |             node(
15 |                 func=create_model_input_table,
16 |                 inputs=["preprocessed_flights", "weather"],
17 |                 outputs="model_input_table",
18 |             ),
19 |         ]
20 |     )
21 | 
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_training/pipeline.py:
--------------------------------------------------------------------------------
1 | from kedro.pipeline import Pipeline, node, pipeline
2 | 
3 | from .nodes import split_data, train_model
4 | 
5 | 
6 | def create_pipeline(**kwargs) -> Pipeline:
7 |     return pipeline(
8 |         [
9 |             node(
10 |                 func=split_data,
11 |                 inputs=["model_input_table", "params:model_options.random_state"],
12 |                 outputs=["X_train", "X_test", "y_train", "y_test"],
13 |             ),
14 |             node(
15 |                 func=train_model,
16 |                 inputs=["X_train", "y_train"],
17 |                 outputs="classifier",
18 |             ),
19 |         ]
20 |     )
21 | 
--------------------------------------------------------------------------------
/solutions/nb03_ex02.py:
--------------------------------------------------------------------------------
1 | def create_model_input_table(flights, weather):
2 |     return (
3 |         flights.mutate(
4 |             arr_delay=flights.arr_delay >= 30,
5 |             date=flights.time_hour.date(),
6 |         )
7 |         .inner_join(weather, ["origin", "time_hour"])
8 |         .select(
9 |             "dep_time",
10 |             "flight",
11 |             "origin",
12 |             "dest",
13 |             "air_time",
14 |             "distance",
15 |             "carrier",
16 |             "date",
17 |             "arr_delay",
18 |             "time_hour",
19 |         )
20 |         .dropna()
21 |     )
22 | 
23 | pipe = pipeline([
24 |     n0,
25 |     node(
26 |         func=create_model_input_table,
27 |         inputs=["preprocessed_flights", "weather"],
28 |         outputs="model_input_table",
29 |     ),
30 | ])
31 | 
--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/nodes.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | import logging
4 | from typing import TYPE_CHECKING
5 | 
6 | from sklearn.metrics import accuracy_score
7 | from sklearn.pipeline import Pipeline
8 | 
9 | if TYPE_CHECKING:
10 |     import ibis.expr.types as ir
11 | 
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def evaluate_model(
17 |     pipe: Pipeline, X_test: ir.Table, y_test: ir.Column
18 | ):
19 |     """Calculates and logs the accuracy of the model on the test data.
20 | 
21 |     Args:
22 |         pipe: Trained model.
23 |         X_test: Testing data of independent features.
24 |         y_test: Testing data for whether the flight arrived 30 or more minutes late.
25 | """ 26 | y_pred = pipe.predict(X_test) 27 | score = accuracy_score(y_test, y_pred) 28 | logger.info("Model has an accuracy of %.3f on test data.", score) 29 | -------------------------------------------------------------------------------- /.devcontainer/compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | kedro-ibis-tutorial: 3 | build: 4 | context: .. 5 | dockerfile: .devcontainer/Dockerfile 6 | volumes: 7 | - ../..:/workspaces:cached 8 | command: sleep infinity 9 | network_mode: service:postgres 10 | environment: 11 | # elephant-shaped turtles all the way down 12 | PGPASSWORD: postgres 13 | PGHOST: postgres 14 | PGUSER: postgres 15 | PGDATABASE: postgres 16 | postgres: 17 | restart: unless-stopped 18 | environment: 19 | POSTGRES_PASSWORD: postgres 20 | POSTGRES_DB: postgres 21 | POSTGRES_USER: postgres 22 | image: postgres:15 23 | healthcheck: 24 | interval: 1s 25 | retries: 20 26 | test: 27 | - CMD 28 | - pg_isready 29 | volumes: 30 | - postgres:/var/lib/postgresql/data 31 | 32 | volumes: 33 | postgres: 34 | -------------------------------------------------------------------------------- /sql/create_nycflights13.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS flights CASCADE; 2 | CREATE TABLE flights ( 3 | "year" BIGINT, 4 | "month" BIGINT, 5 | "day" BIGINT, 6 | "dep_time" VARCHAR, 7 | "sched_dep_time" BIGINT, 8 | "dep_delay" VARCHAR, 9 | "arr_time" VARCHAR, 10 | "sched_arr_time" BIGINT, 11 | "arr_delay" VARCHAR, 12 | "carrier" VARCHAR, 13 | "flight" BIGINT, 14 | "tailnum" VARCHAR, 15 | "origin" VARCHAR, 16 | "dest" VARCHAR, 17 | "air_time" VARCHAR, 18 | "distance" BIGINT, 19 | "hour" BIGINT, 20 | "minute" BIGINT, 21 | "time_hour" TIMESTAMP(6) 22 | ); 23 | 24 | DROP TABLE IF EXISTS weather CASCADE; 25 | CREATE TABLE weather ( 26 | "origin" VARCHAR, 27 | "year" BIGINT, 28 | "month" BIGINT, 29 | "day" BIGINT, 30 | "hour" BIGINT, 31 | "temp" VARCHAR, 32 | "dewp" VARCHAR, 33 | "humid" VARCHAR, 34 | "wind_dir" VARCHAR, 35 | "wind_speed" VARCHAR, 36 | "wind_gust" VARCHAR, 37 | "precip" DOUBLE PRECISION, 38 | "pressure" VARCHAR, 39 | "visib" DOUBLE PRECISION, 40 | "time_hour" TIMESTAMP(6) 41 | ); 42 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/README.md: -------------------------------------------------------------------------------- 1 | # What is this for? 2 | 3 | This folder should be used to store configuration files used by Kedro or by separate tools. 4 | 5 | This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the section titled **Instructions**. 6 | 7 | ## Local configuration 8 | 9 | The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys). 10 | 11 | > *Note:* Please do not check in any local configuration to version control. 12 | 13 | ## Base configuration 14 | 15 | The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members. 16 | 17 | WARNING: Please do not put access credentials in the base configuration folder. 
18 | 19 | ## Find out more 20 | You can find out more about configuration from the [user guide documentation](https://docs.kedro.org/en/stable/configuration/configuration_basics.html). 21 | -------------------------------------------------------------------------------- /demo/delay-prediction/tests/test_run.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains an example test. 3 | 4 | Tests should be placed in ``src/tests``, in modules that mirror your 5 | project's structure, and in files named test_*.py. They are simply functions 6 | named ``test_*`` which test a unit of logic. 7 | """ 8 | from pathlib import Path 9 | 10 | import pytest 11 | from kedro.config import OmegaConfigLoader 12 | from kedro.framework.context import KedroContext 13 | from kedro.framework.hooks import _create_hook_manager 14 | 15 | 16 | @pytest.fixture 17 | def config_loader(): 18 | return OmegaConfigLoader(conf_source=str(Path.cwd())) 19 | 20 | 21 | @pytest.fixture 22 | def project_context(config_loader): 23 | return KedroContext( 24 | package_name="delay_prediction", 25 | project_path=Path.cwd(), 26 | config_loader=config_loader, 27 | hook_manager=_create_hook_manager(), 28 | ) 29 | 30 | 31 | # The tests below are here for the demonstration purpose 32 | # and should be replaced with the ones testing the project 33 | # functionality 34 | class TestProjectContext: 35 | def test_project_path(self, project_context): 36 | assert project_context.project_path == Path.cwd() 37 | -------------------------------------------------------------------------------- /demo/delay-prediction/conf/logging.yml: -------------------------------------------------------------------------------- 1 | # To enable this custom logging configuration, set KEDRO_LOGGING_CONFIG to the path of this file. 2 | # More information available at https://docs.kedro.org/en/stable/logging/logging.html 3 | version: 1 4 | 5 | disable_existing_loggers: False 6 | 7 | formatters: 8 | simple: 9 | format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 10 | 11 | handlers: 12 | console: 13 | class: logging.StreamHandler 14 | level: INFO 15 | formatter: simple 16 | stream: ext://sys.stdout 17 | 18 | info_file_handler: 19 | class: logging.handlers.RotatingFileHandler 20 | level: INFO 21 | formatter: simple 22 | filename: info.log 23 | maxBytes: 10485760 # 10MB 24 | backupCount: 20 25 | encoding: utf8 26 | delay: True 27 | 28 | rich: 29 | class: kedro.logging.RichHandler 30 | rich_tracebacks: True 31 | # Advance options for customisation. 32 | # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration 33 | # tracebacks_show_locals: False 34 | 35 | loggers: 36 | kedro: 37 | level: INFO 38 | 39 | delay_prediction: 40 | level: INFO 41 | 42 | root: 43 | handlers: [rich, info_file_handler] 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kedro-Ibis Tutorial 2 | 3 | This tutorial is designed to be run via GitHub codespaces. If you do not have a GitHub account, please create one before the tutorial (https://github.com/join). Other than that, just bring yourself and a laptop with a web browser, and everything else should be good to go! 4 | 5 | ## Codespace setup 6 | 7 | First, create a codespace for the repository. Click the **<> Code** button, then click the **Codespaces** tab. Select **Create codespace on main**. 
Once created, select **Open in browser**. 8 | 9 | ## References 10 | 11 | Kedro is an open-source Python framework for creating reproducible, maintainable, and modular data science and engineering code. It is an incubation-stage project of the LF AI & Data Foundation. To learn more about Kedro, visit the [Kedro website](https://kedro.org/) and [join our community on Slack](https://slack.kedro.org/). 12 | 13 | Ibis is an open-source Python dataframe library that works with any data system. Visit the [Ibis project website](https://ibis-project.org/) to learn more, or [join our community on Zulip](https://ibis-project.zulipchat.com/). 14 | 15 | This tutorial also uses IbisML, a new library for building scalable ML pipelines using Ibis. More information about IbisML can be found on the [IbisML website](https://ibis-project.github.io/ibis-ml/). There is a [dedicated IbisML stream on Zulip](https://ibis-project.zulipchat.com/#narrow/stream/426262-ibis-ml). 16 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerComposeFile": "compose.yaml", 3 | "service": "kedro-ibis-tutorial", 4 | "runServices": ["postgres"], 5 | "forwardPorts": ["postgres:5432"], 6 | "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}", 7 | "customizations": { 8 | "codespaces": { 9 | "openFiles": ["00 - Welcome.ipynb"] 10 | }, 11 | "vscode": { 12 | "settings": { 13 | "extensions.ignoreRecommendations": true, 14 | "notifications.hideList": true, 15 | "notifications.hideToasts": true, 16 | "notifications.doNotDisturbMode": true, 17 | "update.showReleaseNotes": false, 18 | "vsicons.dontShowNewVersionMessage": true, 19 | "workbench.welcomePage.walkthroughs.openOnInstall": false, 20 | "python.defaultInterpreterPath": "/usr/local/bin/python", 21 | "jupyter.kernels.excludePythonEnvironments": [ 22 | "/usr/bin/python3", 23 | "/bin/python3" 24 | ] 25 | }, 26 | "extensions": [ 27 | "ms-toolsai.jupyter", 28 | "ms-python.python", 29 | "quarto.quarto", 30 | "donjayamanne.vscode-default-python-kernel" 31 | ] 32 | } 33 | }, 34 | "features": { 35 | "ghcr.io/eitsupi/devcontainer-features/duckdb-cli:1": { 36 | "extensions": "httpfs,sqlite,postgres,parquet,json,arrow", 37 | "version": "0.10.2" 38 | }, 39 | "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": { 40 | "version": "1.5.13" 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/__main__.py: -------------------------------------------------------------------------------- 1 | """Delay Prediction file for ensuring the package is executable 2 | as `delay-prediction` and `python -m delay_prediction` 3 | """ 4 | import importlib 5 | from pathlib import Path 6 | 7 | from kedro.framework.cli.utils import KedroCliError, load_entry_points 8 | from kedro.framework.project import configure_project 9 | 10 | 11 | def _find_run_command(package_name): 12 | try: 13 | project_cli = importlib.import_module(f"{package_name}.cli") 14 | # fail gracefully if cli.py does not exist 15 | except ModuleNotFoundError as exc: 16 | if f"{package_name}.cli" not in str(exc): 17 | raise 18 | plugins = load_entry_points("project") 19 | run = _find_run_command_in_plugins(plugins) if plugins else None 20 | if run: 21 | # use run command from installed plugin if it exists 22 | return run 23 | # use run command from the framework project 24 | from 
kedro.framework.cli.project import run 25 | 26 | return run 27 | # fail badly if cli.py exists, but has no `cli` in it 28 | if not hasattr(project_cli, "cli"): 29 | raise KedroCliError(f"Cannot load commands from {package_name}.cli") 30 | return project_cli.run 31 | 32 | 33 | def _find_run_command_in_plugins(plugins): 34 | for group in plugins: 35 | if "run" in group.commands: 36 | return group.commands["run"] 37 | 38 | 39 | def main(*args, **kwargs): 40 | package_name = Path(__file__).parent.name 41 | configure_project(package_name) 42 | run = _find_run_command(package_name) 43 | run(*args, **kwargs) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /demo/delay-prediction/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "delay_prediction" 7 | readme = "README.md" 8 | dynamic = ["dependencies", "version"] 9 | 10 | [project.scripts] 11 | delay-prediction = "delay_prediction.__main__:main" 12 | 13 | [project.entry-points."kedro.hooks"] 14 | 15 | [project.optional-dependencies] 16 | docs = [ 17 | "docutils<0.18.0", 18 | "sphinx~=3.4.3", 19 | "sphinx_rtd_theme==0.5.1", 20 | "nbsphinx==0.8.1", 21 | "sphinx-autodoc-typehints==1.11.1", 22 | "sphinx_copybutton==0.3.1", 23 | "ipykernel>=5.3, <7.0", 24 | "Jinja2<3.1.0", 25 | "myst-parser~=0.17.2", 26 | ] 27 | 28 | [tool.setuptools.dynamic] 29 | dependencies = {file = "requirements.txt"} 30 | version = {attr = "delay_prediction.__version__"} 31 | 32 | [tool.setuptools.packages.find] 33 | where = ["src"] 34 | namespaces = false 35 | 36 | [tool.kedro] 37 | package_name = "delay_prediction" 38 | project_name = "Delay Prediction" 39 | kedro_init_version = "0.19.6" 40 | tools = ['None'] 41 | example_pipeline = "False" 42 | source_dir = "src" 43 | 44 | [tool.pytest.ini_options] 45 | addopts = """ 46 | --cov-report term-missing \ 47 | --cov src/delay_prediction -ra""" 48 | 49 | [tool.coverage.report] 50 | fail_under = 0 51 | show_missing = true 52 | exclude_lines = ["pragma: no cover", "raise NotImplementedError"] 53 | 54 | [tool.ruff] 55 | line-length = 88 56 | show-fixes = true 57 | 58 | [tool.ruff.format] 59 | docstring-code-format = true 60 | 61 | [tool.ruff.lint] 62 | select = [ 63 | "F", # Pyflakes 64 | "W", # pycodestyle 65 | "E", # pycodestyle 66 | "I", # isort 67 | "UP", # pyupgrade 68 | "PL", # Pylint 69 | "T201", # Print Statement 70 | ] 71 | ignore = ["E501"] # Ruff format takes care of line-too-long 72 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/settings.py: -------------------------------------------------------------------------------- 1 | """Project settings. There is no need to edit this file unless you want to change values 2 | from the Kedro defaults. For further information, including these default values, see 3 | https://docs.kedro.org/en/stable/kedro_project_setup/settings.html.""" 4 | 5 | # Instantiated project hooks. 6 | # For example, after creating a hooks.py and defining a ProjectHooks class there, do 7 | # from delay_prediction.hooks import ProjectHooks 8 | 9 | # Hooks are executed in a Last-In-First-Out (LIFO) order. 10 | # HOOKS = (ProjectHooks(),) 11 | 12 | # Installed plugins for which to disable hook auto-registration. 
13 | # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",) 14 | 15 | # Class that manages storing KedroSession data. 16 | # from kedro.framework.session.store import BaseSessionStore 17 | # SESSION_STORE_CLASS = BaseSessionStore 18 | # Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor. 19 | # SESSION_STORE_ARGS = { 20 | # "path": "./sessions" 21 | # } 22 | 23 | # Directory that holds configuration. 24 | # CONF_SOURCE = "conf" 25 | 26 | # Class that manages how configuration is loaded. 27 | from kedro.config import OmegaConfigLoader # noqa: E402 28 | 29 | CONFIG_LOADER_CLASS = OmegaConfigLoader 30 | # Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor. 31 | CONFIG_LOADER_ARGS = { 32 | "base_env": "base", 33 | "default_run_env": "local", 34 | # "config_patterns": { 35 | # "spark" : ["spark*/"], 36 | # "parameters": ["parameters*", "parameters*/**", "**/parameters*"], 37 | # } 38 | } 39 | 40 | # Class that manages Kedro's library components. 41 | # from kedro.framework.context import KedroContext 42 | # CONTEXT_CLASS = KedroContext 43 | 44 | # Class that manages the Data Catalog. 45 | # from kedro.io import DataCatalog 46 | # DATA_CATALOG_CLASS = DataCatalog 47 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/pipelines/data_processing/nodes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import ibis.selectors as s 6 | from ibis import _ 7 | 8 | if TYPE_CHECKING: 9 | import ibis.expr.types as ir 10 | 11 | 12 | def _replace_na_values(t: ir.Table) -> ir.Table: 13 | return t.mutate(s.across(s.of_type("string"), _.nullif("NA"))) 14 | 15 | 16 | def preprocess_flights(flights: ir.Table) -> ir.Table: 17 | """Preprocesses the data for flights. 18 | 19 | Args: 20 | flights: Raw data. 21 | Returns: 22 | Preprocessed data, with `dep_time` converted to a time and 23 | `arr_delay` and `air_time` converted to integers. 24 | """ 25 | return _replace_na_values(flights).mutate( 26 | dep_time=( 27 | _.dep_time.lpad(4, "0").substr(0, 2) 28 | + ":" 29 | + _.dep_time.substr(-2, 2) 30 | + ":00" 31 | ).cast("time"), 32 | arr_delay=_.arr_delay.cast(int), 33 | air_time=_.air_time.cast(int), 34 | ) 35 | 36 | 37 | def create_model_input_table(flights: ir.Table, weather: ir.Table) -> ir.Table: 38 | """Combines all data to create a model input table. 39 | 40 | Args: 41 | flights: Preprocessed data for flights. 42 | weather: Raw data for weather. 43 | Returns: 44 | Model input table. 
45 | """ 46 | return ( 47 | flights.mutate( 48 | # Convert the arrival delay to a factor 49 | arr_delay=flights.arr_delay >= 30, 50 | # We will use the date (not date-time) in the recipe below 51 | date=flights.time_hour.date(), 52 | ) 53 | # Include the weather data 54 | .inner_join(weather, ["origin", "time_hour"]) 55 | # Only retain the specific columns we will use 56 | .select( 57 | "dep_time", 58 | "flight", 59 | "origin", 60 | "dest", 61 | "air_time", 62 | "distance", 63 | "carrier", 64 | "date", 65 | "arr_delay", 66 | "time_hour", 67 | ) 68 | # Exclude missing data 69 | .dropna() 70 | ) 71 | -------------------------------------------------------------------------------- /demo/delay-prediction/tests/pipelines/data_science/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | import pytest 4 | from kedro.io import DataCatalog 5 | from kedro.runner import SequentialRunner 6 | from delay_prediction.pipelines.data_science import create_pipeline as create_ds_pipeline 7 | from delay_prediction.pipelines.data_science.nodes import split_data 8 | 9 | @pytest.fixture 10 | def dummy_data(): 11 | return pd.DataFrame( 12 | { 13 | "engines": [1, 2, 3], 14 | "crew": [4, 5, 6], 15 | "passenger_capacity": [5, 6, 7], 16 | "price": [120, 290, 30], 17 | } 18 | ) 19 | 20 | @pytest.fixture 21 | def dummy_parameters(): 22 | parameters = { 23 | "model_options": { 24 | "test_size": 0.2, 25 | "random_state": 3, 26 | "features": ["engines", "passenger_capacity", "crew"], 27 | } 28 | } 29 | return parameters 30 | 31 | 32 | def test_split_data(dummy_data, dummy_parameters): 33 | X_train, X_test, y_train, y_test = split_data( 34 | dummy_data, dummy_parameters["model_options"] 35 | ) 36 | assert len(X_train) == 2 37 | assert len(y_train) == 2 38 | assert len(X_test) == 1 39 | assert len(y_test) == 1 40 | 41 | def test_split_data_missing_price(dummy_data, dummy_parameters): 42 | dummy_data_missing_price = dummy_data.drop(columns="price") 43 | with pytest.raises(KeyError) as e_info: 44 | X_train, X_test, y_train, y_test = split_data(dummy_data_missing_price, dummy_parameters["model_options"]) 45 | 46 | assert "price" in str(e_info.value) 47 | 48 | def test_data_science_pipeline(caplog, dummy_data, dummy_parameters): 49 | pipeline = ( 50 | create_ds_pipeline() 51 | .from_nodes("split_data_node") 52 | .to_nodes("evaluate_model_node") 53 | ) 54 | catalog = DataCatalog() 55 | catalog.add_feed_dict( 56 | { 57 | "model_input_table" : dummy_data, 58 | "params:model_options": dummy_parameters["model_options"], 59 | } 60 | ) 61 | 62 | caplog.set_level(logging.DEBUG, logger="kedro") 63 | successful_run_msg = "Pipeline execution completed successfully." 64 | 65 | SequentialRunner().run(pipeline, catalog) 66 | 67 | assert successful_run_msg in caplog.text -------------------------------------------------------------------------------- /demo/delay-prediction/conf/base/catalog.yml: -------------------------------------------------------------------------------- 1 | # Here you can define all your data sets by using simple YAML syntax. 
2 | # 3 | # Documentation for this file format can be found in "The Data Catalog" 4 | # Link: https://docs.kedro.org/en/stable/data/data_catalog.html 5 | # 6 | # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS 7 | # 8 | # An example data set definition can look as follows: 9 | # 10 | #bikes: 11 | # type: pandas.CSVDataset 12 | # filepath: "data/01_raw/bikes.csv" 13 | # 14 | #weather: 15 | # type: spark.SparkDataset 16 | # filepath: s3a://your_bucket/data/01_raw/weather* 17 | # file_format: csv 18 | # credentials: dev_s3 19 | # load_args: 20 | # header: True 21 | # inferSchema: True 22 | # save_args: 23 | # sep: '|' 24 | # header: True 25 | # 26 | #scooters: 27 | # type: pandas.SQLTableDataset 28 | # credentials: scooters_credentials 29 | # table_name: scooters 30 | # load_args: 31 | # index_col: ['name'] 32 | # columns: ['name', 'gear'] 33 | # save_args: 34 | # if_exists: 'replace' 35 | # # if_exists: 'fail' 36 | # # if_exists: 'append' 37 | # 38 | # The Data Catalog supports being able to reference the same file using two different Dataset implementations 39 | # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: 40 | # https://docs.kedro.org/en/stable/data/data_catalog.html 41 | 42 | _root_folder: /workspaces/kedro-ibis-tutorial 43 | 44 | _connection: 45 | backend: duckdb 46 | database: "${_root_folder}/nycflights13.ddb" 47 | 48 | flights: 49 | type: ibis.TableDataset 50 | table_name: flights 51 | connection: ${_connection} 52 | 53 | weather: 54 | type: ibis.TableDataset 55 | table_name: weather 56 | connection: ${_connection} 57 | 58 | preprocessed_flights: 59 | type: ibis.TableDataset 60 | table_name: preprocessed_flights 61 | connection: ${_connection} 62 | 63 | model_input_table: 64 | type: ibis.TableDataset 65 | table_name: model_input_table 66 | connection: ${_connection} 67 | save_args: 68 | materialized: table 69 | 70 | classifier: 71 | type: pickle.PickleDataset 72 | filepath: data/06_models/classifier.pickle 73 | versioned: true 74 | 75 | X_test: 76 | type: ibis.TableDataset 77 | table_name: X_test 78 | connection: ${_connection} 79 | 80 | y_test: 81 | type: ibis.TableDataset 82 | table_name: y_test 83 | connection: ${_connection} 84 | -------------------------------------------------------------------------------- /demo/delay-prediction/.gitignore: -------------------------------------------------------------------------------- 1 | ########################## 2 | # KEDRO PROJECT 3 | 4 | # ignore all local configuration 5 | conf/local/** 6 | !conf/local/.gitkeep 7 | 8 | # ignore potentially sensitive credentials files 9 | conf/**/*credentials* 10 | 11 | # ignore everything in the following folders 12 | data/** 13 | 14 | # except their sub-folders 15 | !data/**/ 16 | 17 | # also keep all .gitkeep files 18 | !.gitkeep 19 | 20 | # keep also the example dataset 21 | !data/01_raw/* 22 | 23 | 24 | ########################## 25 | # Common files 26 | 27 | # IntelliJ 28 | .idea/ 29 | *.iml 30 | out/ 31 | .idea_modules/ 32 | 33 | ### macOS 34 | *.DS_Store 35 | .AppleDouble 36 | .LSOverride 37 | .Trashes 38 | 39 | # Vim 40 | *~ 41 | .*.swo 42 | .*.swp 43 | 44 | # emacs 45 | *~ 46 | \#*\# 47 | /.emacs.desktop 48 | /.emacs.desktop.lock 49 | *.elc 50 | 51 | # JIRA plugin 52 | atlassian-ide-plugin.xml 53 | 54 | # C extensions 55 | *.so 56 | 57 | ### Python template 58 | # Byte-compiled / optimized / DLL files 59 | __pycache__/ 60 | *.py[cod] 61 | *$py.class 62 | 63 | # Distribution / 
packaging 64 | .Python 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | *.egg-info/ 78 | .installed.cfg 79 | *.egg 80 | MANIFEST 81 | 82 | # PyInstaller 83 | # Usually these files are written by a python script from a template 84 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 85 | *.manifest 86 | *.spec 87 | 88 | # Installer logs 89 | pip-log.txt 90 | pip-delete-this-directory.txt 91 | 92 | # Unit test / coverage reports 93 | htmlcov/ 94 | .tox/ 95 | .coverage 96 | .coverage.* 97 | .cache 98 | nosetests.xml 99 | coverage.xml 100 | *.cover 101 | .hypothesis/ 102 | 103 | # Translations 104 | *.mo 105 | *.pot 106 | 107 | # Django stuff: 108 | *.log 109 | .static_storage/ 110 | .media/ 111 | local_settings.py 112 | 113 | # Flask stuff: 114 | instance/ 115 | .webassets-cache 116 | 117 | # Scrapy stuff: 118 | .scrapy 119 | 120 | # Sphinx documentation 121 | docs/_build/ 122 | 123 | # PyBuilder 124 | target/ 125 | 126 | # Jupyter Notebook 127 | .ipynb_checkpoints 128 | 129 | # pyenv 130 | .python-version 131 | 132 | # celery beat schedule file 133 | celerybeat-schedule 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | -------------------------------------------------------------------------------- /demo/delay-prediction/src/delay_prediction/pipelines/model_training/nodes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | from typing import TYPE_CHECKING 5 | 6 | import ibis 7 | import ibis_ml as ml 8 | from sklearn.linear_model import LogisticRegression 9 | from sklearn.pipeline import Pipeline 10 | 11 | if TYPE_CHECKING: 12 | import ibis.expr.types as ir 13 | 14 | 15 | def split_data( 16 | flight_data: ir.Table, 17 | random_state: int = 42, 18 | ) -> tuple[ir.Table, ir.Column, ir.Table, ir.Column]: 19 | """Splits data into training and test sets. 20 | 21 | Args: 22 | data: Data containing features and target. 23 | Returns: 24 | Split data. 25 | """ 26 | flight_data_with_unique_key = flight_data.mutate( 27 | unique_key=ibis.literal(",").join( 28 | [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)] 29 | ) 30 | ) 31 | 32 | # Fix the random numbers by setting the seed 33 | # This enables the analysis to be reproducible when random numbers are used 34 | random.seed(random_state) 35 | 36 | # Put 3/4 of the data into the training set 37 | random_key = str(random.getrandbits(256)) 38 | data_split = flight_data_with_unique_key.mutate( 39 | train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3 40 | ) 41 | 42 | # Create data frames for the two sets: 43 | train_data = data_split[data_split.train].drop("unique_key", "train") 44 | test_data = data_split[~data_split.train].drop("unique_key", "train") 45 | 46 | X_train = train_data.drop("arr_delay") 47 | X_test = test_data.drop("arr_delay") 48 | y_train = train_data.arr_delay 49 | y_test = test_data.arr_delay 50 | return X_train, X_test, y_train, y_test 51 | 52 | 53 | def train_model(X_train: ir.Table, y_train: ir.Column) -> Pipeline: 54 | """Trains the logistic regression model. 55 | 56 | Args: 57 | X_train: Training data of independent features. 
58 |         y_train: Training data for whether a plane arrived more than 30
59 |             minutes late.
60 | 
61 |     Returns:
62 |         Trained model.
63 |     """
64 |     flights_rec = ml.Recipe(
65 |         ml.ExpandDate("date", components=["dow", "month"]),
66 |         ml.Drop("date"),
67 |         ml.TargetEncode(ml.nominal()),
68 |         ml.DropZeroVariance(ml.everything()),
69 |         ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()),
70 |         ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),
71 |         # By default, PyTorch requires that the type of `X` is `np.float32`.
72 |         # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2
73 |         ml.Cast(ml.numeric(), "float32"),
74 |     )
75 |     pipe = Pipeline([("flights_rec", flights_rec), ("lr_mod", LogisticRegression())])
76 |     pipe.fit(X_train, y_train)
77 |     return pipe
78 | 
--------------------------------------------------------------------------------
/demo/delay-prediction/README.md:
--------------------------------------------------------------------------------
1 | # Delay Prediction
2 | 
3 | ## Overview
4 | 
5 | This is your new Kedro project, which was generated using `kedro 0.19.6`.
6 | 
7 | Take a look at the [Kedro documentation](https://docs.kedro.org) to get started.
8 | 
9 | ## Rules and guidelines
10 | 
11 | In order to get the best out of the template:
12 | 
13 | * Don't remove any lines from the `.gitignore` file we provide
14 | * Make sure your results can be reproduced by following a [data engineering convention](https://docs.kedro.org/en/stable/faq/faq.html#what-is-data-engineering-convention)
15 | * Don't commit data to your repository
16 | * Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/`
17 | 
18 | ## How to install dependencies
19 | 
20 | Declare any dependencies in `requirements.txt` for `pip` installation.
21 | 
22 | To install them, run:
23 | 
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | ## How to run your Kedro pipeline
29 | 
30 | You can run your Kedro project with:
31 | 
32 | ```
33 | kedro run
34 | ```
35 | 
36 | ## How to test your Kedro project
37 | 
38 | Have a look at the files `tests/test_run.py` and `tests/pipelines/data_science/test_pipeline.py` for instructions on how to write your tests. Run the tests as follows:
39 | 
40 | ```
41 | pytest
42 | ```
43 | 
44 | To configure the coverage threshold, see the `[tool.coverage.report]` section of `pyproject.toml`.
45 | 
46 | ## Project dependencies
47 | 
48 | To see and update the dependency requirements for your project use `requirements.txt`. You can install the project requirements with `pip install -r requirements.txt`.
49 | 
50 | [Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)
51 | 
52 | ## How to work with Kedro and notebooks
53 | 
54 | > Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `catalog`, `context`, `pipelines` and `session`. An example session is sketched below.
55 | >
56 | > Jupyter, JupyterLab, and IPython are not included in this project's requirements by default, so install the one you want to use (see the following sections) before launching it.
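
For instance, inside a `kedro jupyter notebook` or `kedro ipython` session for this project, you might explore things as follows. This is a minimal sketch rather than part of the generated project: the dataset name comes from `conf/base/catalog.yml`, and the pipeline names assume the pipelines registered under `src/delay_prediction/pipelines/`.

```python
# These variables are injected by Kedro at startup; no imports are required.
flights = catalog.load("flights")              # ibis table defined in conf/base/catalog.yml
print(context.project_path)                    # root directory of this Kedro project
print(sorted(pipelines))                       # e.g. ['__default__', 'data_processing', ...]
session.run(pipeline_name="data_processing")   # run a single pipeline interactively
```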
57 | 58 | ### Jupyter 59 | To use Jupyter notebooks in your Kedro project, you need to install Jupyter: 60 | 61 | ``` 62 | pip install jupyter 63 | ``` 64 | 65 | After installing Jupyter, you can start a local notebook server: 66 | 67 | ``` 68 | kedro jupyter notebook 69 | ``` 70 | 71 | ### JupyterLab 72 | To use JupyterLab, you need to install it: 73 | 74 | ``` 75 | pip install jupyterlab 76 | ``` 77 | 78 | You can also start JupyterLab: 79 | 80 | ``` 81 | kedro jupyter lab 82 | ``` 83 | 84 | ### IPython 85 | And if you want to run an IPython session: 86 | 87 | ``` 88 | kedro ipython 89 | ``` 90 | 91 | ### How to ignore notebook output cells in `git` 92 | To automatically strip out all output cell contents before committing to `git`, you can use tools like [`nbstripout`](https://github.com/kynan/nbstripout). For example, you can add a hook in `.git/config` with `nbstripout --install`. This will run `nbstripout` before anything is committed to `git`. 93 | 94 | > *Note:* Your output cells will be retained locally. 95 | 96 | ## Package your Kedro project 97 | 98 | [Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html) 99 | -------------------------------------------------------------------------------- /02 - Switching Backends.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Switching Backends\n", 8 | "\n", 9 | "One use case for Ibis's portable API is the ability to develop a query locally (using DuckDB, for example) on a subset of the data, then rerun that same query on the full dataset (using BigQuery, for example) without rewriting your code.\n", 10 | "\n", 11 | "In this notebook, we'll run some of the queries we developed in the previous notebook, using the Postgres database we populated in the Welcome notebook." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import ibis\n", 21 | "\n", 22 | "ibis.options.interactive = True" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Create a connection, just as we did with DuckDB..." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "con = ibis.postgres.connect()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "...grab a reference to the flights table..." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "flights = con.table(\"flights\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "...and copy-paste some queries!" 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "flights.order_by(ibis.desc(\"distance\")).select(\"carrier\", \"origin\", \"dest\", \"distance\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "flights.group_by([\"carrier\", \"origin\"]).agg(\n", 80 | " [flights.distance.mean().cast(\"float32\"), flights.air_time.min()]\n", 81 | ")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Later on in the tutorial, we'll see more practical examples of running the same Ibis code against multiple backends." 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3 (ipykernel)", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.11.9" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /demo/delay-prediction/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | # delay_prediction documentation build 6 | # configuration file, created by sphinx-quickstart. 7 | # 8 | # This file is execfile()d with the current directory set to its 9 | # containing dir. 10 | # 11 | # Note that not all possible configuration values are present in this 12 | # autogenerated file. 13 | # 14 | # All configuration values have a default; values that are commented out 15 | # serve to show the default. 16 | 17 | # If extensions (or modules to document with autodoc) are in another directory, 18 | # add these directories to sys.path here. If the directory is relative to the 19 | # documentation root, use os.path.abspath to make it absolute, like shown here. 20 | # 21 | import re 22 | 23 | from kedro.framework.cli.utils import find_stylesheets 24 | from delay_prediction import __version__ as release 25 | 26 | # -- Project information ----------------------------------------------------- 27 | 28 | project = "delay_prediction" 29 | author = "Kedro" 30 | 31 | # The short X.Y version. 32 | version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1) 33 | 34 | # -- General configuration --------------------------------------------------- 35 | 36 | # If your documentation needs a minimal Sphinx version, state it here. 37 | # 38 | # needs_sphinx = '1.0' 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 42 | # ones. 43 | extensions = [ 44 | "sphinx.ext.autodoc", 45 | "sphinx.ext.napoleon", 46 | "sphinx_autodoc_typehints", 47 | "sphinx.ext.doctest", 48 | "sphinx.ext.todo", 49 | "sphinx.ext.coverage", 50 | "sphinx.ext.ifconfig", 51 | "sphinx.ext.viewcode", 52 | "sphinx.ext.mathjax", 53 | "nbsphinx", 54 | "sphinx_copybutton", 55 | "myst_parser", 56 | ] 57 | 58 | # enable autosummary plugin (table of contents for modules/classes/class 59 | # methods) 60 | autosummary_generate = True 61 | 62 | # Add any paths that contain templates here, relative to this directory. 
63 | templates_path = ["_templates"] 64 | 65 | # The suffix(es) of source filenames. 66 | # You can specify multiple suffix as a list of string: 67 | # 68 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"} 69 | 70 | # The master toctree document. 71 | master_doc = "index" 72 | 73 | # The language for content autogenerated by Sphinx. Refer to documentation 74 | # for a list of supported languages. 75 | # 76 | # This is also used if you do content translation via gettext catalogs. 77 | # Usually you set "language" from the command line for these cases. 78 | language = None 79 | 80 | # List of patterns, relative to source directory, that match files and 81 | # directories to ignore when looking for source files. 82 | # This pattern also affects html_static_path and html_extra_path . 83 | exclude_patterns = ["_build", "**.ipynb_checkpoints"] 84 | 85 | # The name of the Pygments (syntax highlighting) style to use. 86 | pygments_style = "sphinx" 87 | 88 | # -- Options for HTML output ------------------------------------------------- 89 | 90 | # The theme to use for HTML and HTML Help pages. See the documentation for 91 | # a list of builtin themes. 92 | # 93 | html_theme = "sphinx_rtd_theme" 94 | 95 | # Theme options are theme-specific and customize the look and feel of a theme 96 | # further. For a list of options available for each theme, see the 97 | # documentation. 98 | # 99 | html_theme_options = {"collapse_navigation": False, "style_external_links": True} 100 | 101 | # Add any paths that contain custom static files (such as style sheets) here, 102 | # relative to this directory. They are copied after the builtin static files, 103 | # so a file named "default.css" will overwrite the builtin "default.css". 104 | html_static_path = ["_static"] 105 | 106 | # Custom sidebar templates, must be a dictionary that maps document names 107 | # to template names. 108 | # 109 | # The default sidebars (for documents that don't match any pattern) are 110 | # defined by theme itself. Builtin themes are using these templates by 111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 112 | # 'searchbox.html']``. 113 | # 114 | # html_sidebars = {} 115 | 116 | html_show_sourcelink = False 117 | 118 | # Removes, from all docs, the copyright footer. 119 | html_show_copyright = False 120 | 121 | # -- Options for HTMLHelp output --------------------------------------------- 122 | 123 | # Output file base name for HTML help builder. 124 | htmlhelp_basename = "delay_predictiondoc" 125 | 126 | # -- Options for LaTeX output ------------------------------------------------ 127 | 128 | latex_elements = { 129 | # The paper size ('letterpaper' or 'a4paper'). 130 | # 131 | # 'papersize': 'letterpaper', 132 | # 133 | # The font size ('10pt', '11pt' or '12pt'). 134 | # 135 | # 'pointsize': '10pt', 136 | # 137 | # Additional stuff for the LaTeX preamble. 138 | # 139 | # 'preamble': '', 140 | # 141 | # Latex figure (float) alignment 142 | # 143 | # 'figure_align': 'htbp', 144 | } 145 | 146 | # Grouping the document tree into LaTeX files. List of tuples 147 | # (source start file, target name, title, 148 | # author, documentclass [howto, manual, or own class]). 149 | latex_documents = [ 150 | ( 151 | master_doc, 152 | "delay_prediction.tex", 153 | "delay_prediction Documentation", 154 | "Kedro", 155 | "manual", 156 | ) 157 | ] 158 | 159 | # -- Options for manual page output ------------------------------------------ 160 | 161 | # One entry per manual page. 
List of tuples 162 | # (source start file, name, description, authors, manual section). 163 | man_pages = [ 164 | ( 165 | master_doc, 166 | "delay_prediction", 167 | "delay_prediction Documentation", 168 | [author], 169 | 1, 170 | ) 171 | ] 172 | 173 | # -- Options for Texinfo output ---------------------------------------------- 174 | 175 | # Grouping the document tree into Texinfo files. List of tuples 176 | # (source start file, target name, title, author, 177 | # dir menu entry, description, category) 178 | texinfo_documents = [ 179 | ( 180 | master_doc, 181 | "delay_prediction", 182 | "delay_prediction Documentation", 183 | author, 184 | "delay_prediction", 185 | "Project delay_prediction codebase.", 186 | "Data-Science", 187 | ) 188 | ] 189 | 190 | # -- Options for todo extension ---------------------------------------------- 191 | 192 | # If true, `todo` and `todoList` produce output, else they produce nothing. 193 | todo_include_todos = False 194 | 195 | # -- Extension configuration ------------------------------------------------- 196 | 197 | # nbsphinx_prolog = """ 198 | # see here for prolog/epilog details: 199 | # https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html 200 | # """ 201 | 202 | # -- NBconvert kernel config ------------------------------------------------- 203 | nbsphinx_kernel_name = "python3" 204 | 205 | 206 | def remove_arrows_in_examples(lines): 207 | for i, line in enumerate(lines): 208 | lines[i] = line.replace(">>>", "") 209 | 210 | 211 | def autodoc_process_docstring(app, what, name, obj, options, lines): 212 | remove_arrows_in_examples(lines) 213 | 214 | 215 | def skip(app, what, name, obj, skip, options): 216 | if name == "__init__": 217 | return False 218 | return skip 219 | 220 | 221 | def setup(app): 222 | app.connect("autodoc-process-docstring", autodoc_process_docstring) 223 | app.connect("autodoc-skip-member", skip) 224 | # add Kedro stylesheets 225 | for stylesheet in find_stylesheets(): 226 | app.add_css_file(stylesheet) 227 | # enable rendering RST tables in Markdown 228 | -------------------------------------------------------------------------------- /00 - Welcome.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true, 7 | "slideshow": { 8 | "slide_type": "slide" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# Welcome to the Kedro-Ibis tutorial!\n", 14 | "\n", 15 | "## Outline\n", 16 | "\n", 17 | "- Introduction\n", 18 | " - Who we are\n", 19 | " - Workshop material\n", 20 | " - Setup\n", 21 | " - Motivation\n", 22 | "- Expressive analytics at any scale: Introduction to Ibis\n", 23 | "- From prototype to production: Introduction to Kedro\n", 24 | "- Conclusion" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "editable": true, 31 | "slideshow": { 32 | "slide_type": "skip" 33 | }, 34 | "tags": [] 35 | }, 36 | "source": [ 37 | "These are the notebooks for the tutorial: 👇\n", 38 | "\n", 39 | "1. [Getting Started with Ibis](./01%20-%20Getting%20Started%20with%20Ibis.ipynb)\n", 40 | "2. [Switching Backends](./02%20-%20Switching%20Backends.ipynb)\n", 41 | "3. 
[First Steps with Kedro](./03%20-%20First%20Steps%20with%20Kedro.ipynb)" 42 | ] 43 | }, 44 | { 45 | "attachments": {}, 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "editable": true, 49 | "slideshow": { 50 | "slide_type": "slide" 51 | }, 52 | "tags": [] 53 | }, 54 | "source": [ 55 | "## Introduction\n", 56 | "\n", 57 | "### Who we are\n", 58 | "\n", 59 | "| | |\n", 60 | "|--------|------|\n", 61 | "| ![Deepyaman](static/deepyaman.jpg) | **Deepyaman Datta**

Deepyaman is a software engineer at Voltron Data. Before their acquisition by Voltron Data, he was a Founding Machine Learning Engineer at Claypot AI, working on their real-time feature engineering platform. Prior to that, he led data engineering teams and asset development across a range of industries at QuantumBlack, AI by McKinsey. |\n", 62 | "| ![Juan Luis](static/juanluis.png) | **Juan Luis Cano Rodríguez**

Juan Luis (he/him/él) is an Aerospace Engineer with a passion for tech communities, outreach, and sustainability. He works at QuantumBlack, AI by McKinsey, as Product Manager for Kedro, an opinionated Python framework for creating reproducible, maintainable and modular data science code. He has worked as Developer Advocate at Read the Docs, as software engineer in the space, consulting, and banking industries, and as a Python trainer for several private and public entities. |" 63 | ] 64 | }, 65 | { 66 | "attachments": {}, 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "editable": true, 70 | "slideshow": { 71 | "slide_type": "subslide" 72 | }, 73 | "tags": [] 74 | }, 75 | "source": [ 76 | "### Workshop material\n", 77 | "\n", 78 | "**https://github.com/ibis-project/kedro-ibis-tutorial**\n", 79 | "\n", 80 | "![QR Code](static/qr.png)\n", 81 | "\n", 82 | "_Note: This will be a lot of material for a 90-minute tutorial; we'll go fast and not go too much in depth, but will be more than happy to answer questions later!_" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": { 88 | "editable": true, 89 | "slideshow": { 90 | "slide_type": "subslide" 91 | }, 92 | "tags": [] 93 | }, 94 | "source": [ 95 | "1. Open URL above\n", 96 | "2. Hit 🟩 \"Create codespace on main\"\n", 97 | "3. Open `00 - Welcome.ipynb` notebook and follow instructions\n", 98 | "\n", 99 | "\"Codespaces\"" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "editable": true, 106 | "slideshow": { 107 | "slide_type": "subslide" 108 | }, 109 | "tags": [] 110 | }, 111 | "source": [ 112 | "## Setup\n", 113 | "\n", 114 | "Let's start by downloading the [nycflights13 data](https://github.com/hadley/nycflights13); we'll use this dataset throughout the tutorial." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "editable": true, 122 | "slideshow": { 123 | "slide_type": "fragment" 124 | }, 125 | "tags": [] 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "import ibis\n", 130 | "\n", 131 | "con = ibis.connect(\"duckdb://nycflights13.ddb\")\n", 132 | "con.create_table(\n", 133 | " \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n", 134 | ")\n", 135 | "con.create_table(\n", 136 | " \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n", 137 | ")\n", 138 | "con.disconnect()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "editable": true, 145 | "slideshow": { 146 | "slide_type": "subslide" 147 | }, 148 | "tags": [] 149 | }, 150 | "source": [ 151 | "Next, we'll load the data into a local PostgreSQL database using DuckDB—[yes, you can do that](https://duckdb.org/docs/extensions/postgres.html#writing-data-to-postgres)!" 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "editable": true, 159 | "slideshow": { 160 | "slide_type": "fragment" 161 | }, 162 | "tags": [] 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "!psql < sql/create_nycflights13.sql" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "editable": true, 174 | "slideshow": { 175 | "slide_type": "fragment" 176 | }, 177 | "tags": [] 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "!duckdb nycflights13.ddb < sql/load_nycflights13.sql" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": { 187 | "editable": true, 188 | "slideshow": { 189 | "slide_type": "subslide" 190 | }, 191 | "tags": [] 192 | }, 193 | "source": [ 194 | "We can confirm that our PostgreSQL database contains the tables we just populated." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "editable": true, 202 | "slideshow": { 203 | "slide_type": "fragment" 204 | }, 205 | "tags": [] 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "!psql < sql/verify_nycflights13.sql" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "editable": true, 216 | "slideshow": { 217 | "slide_type": "subslide" 218 | }, 219 | "tags": [] 220 | }, 221 | "source": [ 222 | "## Motivation\n", 223 | "\n", 224 | "In your experience doing data analytics/building data pipelines, have you ever..." 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "editable": true, 231 | "slideshow": { 232 | "slide_type": "fragment" 233 | }, 234 | "tags": [] 235 | }, 236 | "source": [ 237 | "- ...slurped up large amounts of data into memory, instead of pushing execution down to the source database/engine?" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "editable": true, 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | }, 247 | "tags": [] 248 | }, 249 | "source": [ 250 | "- ...prototyped code in pandas, and then rewritten it in PySpark/Snowpark/some other native dataframe API?" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": { 256 | "editable": true, 257 | "slideshow": { 258 | "slide_type": "fragment" 259 | }, 260 | "tags": [] 261 | }, 262 | "source": [ 263 | "- ...implemented a proof-of-concept solution on data extracts, and then struggled massively when you needed to move to running against the production databases and scale out?" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "editable": true, 270 | "slideshow": { 271 | "slide_type": "fragment" 272 | }, 273 | "tags": [] 274 | }, 275 | "source": [ 276 | "- ...insisted on using Python across the full data engineering/data science workflow for consistency (fair enough), although dbt would have been the much better fit for non-ML pipelines, because you essentially needed a SQL workflow?" 
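As a side note on the setup above: the same sanity check that `sql/verify_nycflights13.sql` performs can also be done from Python with Ibis. A minimal sketch; the connection parameters below are placeholders, since the actual PostgreSQL credentials are configured outside this notebook:

```python
import ibis

# Placeholder credentials: adjust these to match your local PostgreSQL setup.
con = ibis.postgres.connect(
    user="postgres",
    password="postgres",
    host="localhost",
    port=5432,
    database="nycflights13",
)
print(con.list_tables())  # should include 'flights' and 'weather'
con.disconnect()
```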
277 | ] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3 (ipykernel)", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.11.9" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 4 301 | } 302 | -------------------------------------------------------------------------------- /codespace_requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile requirements.txt -o codespace_requirements.txt 3 | aiofiles==23.2.1 4 | # via kedro-viz 5 | aiohttp==3.9.5 6 | # via gcsfs 7 | aiosignal==1.3.1 8 | # via aiohttp 9 | annotated-types==0.7.0 10 | # via pydantic 11 | antlr4-python3-runtime==4.9.3 12 | # via omegaconf 13 | anyio==3.7.1 14 | # via 15 | # httpx 16 | # jupyter-server 17 | # starlette 18 | # watchfiles 19 | # watchgod 20 | appdirs==1.4.4 21 | # via 22 | # kedro-telemetry 23 | # pins 24 | argon2-cffi==23.1.0 25 | # via jupyter-server 26 | argon2-cffi-bindings==21.2.0 27 | # via argon2-cffi 28 | arrow==1.3.0 29 | # via 30 | # cookiecutter 31 | # isoduration 32 | asttokens==2.4.1 33 | # via stack-data 34 | async-lru==2.0.4 35 | # via jupyterlab 36 | atpublic==4.1.0 37 | # via ibis-framework 38 | attrs==23.2.0 39 | # via 40 | # aiohttp 41 | # jsonschema 42 | # kedro 43 | # referencing 44 | babel==2.15.0 45 | # via jupyterlab-server 46 | beautifulsoup4==4.12.3 47 | # via nbconvert 48 | bidict==0.23.1 49 | # via ibis-framework 50 | binaryornot==0.4.4 51 | # via cookiecutter 52 | bleach==6.1.0 53 | # via nbconvert 54 | build==1.2.1 55 | # via kedro 56 | cachetools==5.3.3 57 | # via 58 | # google-auth 59 | # kedro 60 | certifi==2024.7.4 61 | # via 62 | # -r requirements.txt 63 | # httpcore 64 | # httpx 65 | # requests 66 | cffi==1.16.0 67 | # via argon2-cffi-bindings 68 | chardet==5.2.0 69 | # via binaryornot 70 | charset-normalizer==3.3.2 71 | # via requests 72 | click==8.1.7 73 | # via 74 | # click-default-group 75 | # cookiecutter 76 | # kedro 77 | # typer 78 | # uvicorn 79 | click-default-group==1.2.4 80 | # via kedro-viz 81 | comm==0.2.2 82 | # via ipykernel 83 | cookiecutter==2.6.0 84 | # via kedro 85 | debugpy==1.8.1 86 | # via ipykernel 87 | decorator==5.1.1 88 | # via 89 | # gcsfs 90 | # ipython 91 | defusedxml==0.7.1 92 | # via nbconvert 93 | dnspython==2.6.1 94 | # via email-validator 95 | duckdb==1.0.0 96 | # via ibis-framework 97 | dynaconf==3.2.5 98 | # via kedro 99 | email-validator==2.1.1 100 | # via fastapi 101 | executing==2.0.1 102 | # via stack-data 103 | fastapi==0.111.0 104 | # via kedro-viz 105 | fastapi-cli==0.0.4 106 | # via fastapi 107 | fastjsonschema==2.19.1 108 | # via nbformat 109 | fqdn==1.5.1 110 | # via jsonschema 111 | frozenlist==1.4.1 112 | # via 113 | # aiohttp 114 | # aiosignal 115 | fsspec==2024.2.0 116 | # via 117 | # gcsfs 118 | # ibis-framework 119 | # kedro 120 | # kedro-viz 121 | # pins 122 | gcsfs==2024.2.0 123 | # via pins 124 | gitdb==4.0.11 125 | # via gitpython 126 | gitpython==3.1.41 127 | # via kedro 128 | google-api-core==2.19.0 129 | # via 130 | # google-cloud-core 131 | # google-cloud-storage 132 | google-auth==2.29.0 133 | # via 134 | # gcsfs 135 | # google-api-core 136 | # 
google-auth-oauthlib 137 | # google-cloud-core 138 | # google-cloud-storage 139 | google-auth-oauthlib==1.2.0 140 | # via gcsfs 141 | google-cloud-core==2.4.1 142 | # via google-cloud-storage 143 | google-cloud-storage==2.16.0 144 | # via gcsfs 145 | google-crc32c==1.5.0 146 | # via 147 | # google-cloud-storage 148 | # google-resumable-media 149 | google-resumable-media==2.7.0 150 | # via google-cloud-storage 151 | googleapis-common-protos==1.63.0 152 | # via google-api-core 153 | graphql-core==3.2.3 154 | # via strawberry-graphql 155 | greenlet==3.0.3 156 | # via sqlalchemy 157 | h11==0.14.0 158 | # via 159 | # httpcore 160 | # uvicorn 161 | httpcore==1.0.5 162 | # via httpx 163 | httptools==0.6.1 164 | # via uvicorn 165 | httpx==0.27.0 166 | # via 167 | # fastapi 168 | # jupyterlab 169 | humanize==4.9.0 170 | # via pins 171 | ibis-framework==9.1.0 172 | # via 173 | # -r demo/delay-prediction/requirements.txt 174 | # ibis-ml 175 | # kedro-datasets 176 | ibis-ml==0.1.0 177 | # via -r demo/delay-prediction/requirements.txt 178 | idna==3.7 179 | # via 180 | # anyio 181 | # email-validator 182 | # httpx 183 | # jsonschema 184 | # requests 185 | # yarl 186 | importlib-metadata==7.1.0 187 | # via 188 | # kedro 189 | # pins 190 | importlib-resources==6.4.0 191 | # via 192 | # kedro 193 | # pins 194 | ipykernel==6.29.4 195 | # via jupyterlab 196 | ipython==8.24.0 197 | # via 198 | # ipykernel 199 | # kedro-viz 200 | isoduration==20.11.0 201 | # via jsonschema 202 | jedi==0.19.1 203 | # via ipython 204 | jinja2==3.1.4 205 | # via 206 | # cookiecutter 207 | # fastapi 208 | # jupyter-server 209 | # jupyterlab 210 | # jupyterlab-server 211 | # nbconvert 212 | # pins 213 | joblib==1.4.2 214 | # via 215 | # pins 216 | # scikit-learn 217 | json5==0.9.25 218 | # via jupyterlab-server 219 | jsonpointer==2.4 220 | # via jsonschema 221 | jsonschema==4.22.0 222 | # via 223 | # jupyter-events 224 | # jupyterlab-server 225 | # nbformat 226 | jsonschema-specifications==2023.12.1 227 | # via jsonschema 228 | jupyter-client==8.6.1 229 | # via 230 | # ipykernel 231 | # jupyter-server 232 | # nbclient 233 | jupyter-core==5.7.2 234 | # via 235 | # ipykernel 236 | # jupyter-client 237 | # jupyter-server 238 | # jupyterlab 239 | # nbclient 240 | # nbconvert 241 | # nbformat 242 | jupyter-events==0.10.0 243 | # via jupyter-server 244 | jupyter-lsp==2.2.5 245 | # via jupyterlab 246 | jupyter-server==2.14.0 247 | # via 248 | # jupyter-lsp 249 | # jupyterlab 250 | # jupyterlab-server 251 | # notebook-shim 252 | jupyter-server-terminals==0.5.3 253 | # via jupyter-server 254 | jupyterlab==4.1.8 255 | # via -r requirements.txt 256 | jupyterlab-pygments==0.3.0 257 | # via nbconvert 258 | jupyterlab-server==2.27.1 259 | # via jupyterlab 260 | kedro==0.19.6 261 | # via 262 | # -r demo/delay-prediction/requirements.txt 263 | # kedro-datasets 264 | # kedro-telemetry 265 | # kedro-viz 266 | kedro-datasets==3.0.1 267 | # via -r demo/delay-prediction/requirements.txt 268 | kedro-telemetry==0.4.0 269 | # via -r demo/delay-prediction/requirements.txt 270 | kedro-viz==9.1.0 271 | # via -r demo/delay-prediction/requirements.txt 272 | lazy-loader==0.4 273 | # via kedro-datasets 274 | markdown-it-py==3.0.0 275 | # via rich 276 | markupsafe==2.1.5 277 | # via 278 | # jinja2 279 | # nbconvert 280 | matplotlib-inline==0.1.7 281 | # via 282 | # ipykernel 283 | # ipython 284 | mdurl==0.1.2 285 | # via markdown-it-py 286 | mistune==3.0.2 287 | # via nbconvert 288 | more-itertools==10.2.0 289 | # via kedro 290 | multidict==6.0.5 291 | # via 292 
| # aiohttp 293 | # yarl 294 | nbclient==0.10.0 295 | # via nbconvert 296 | nbconvert==7.16.4 297 | # via jupyter-server 298 | nbformat==5.10.4 299 | # via 300 | # jupyter-server 301 | # nbclient 302 | # nbconvert 303 | nest-asyncio==1.6.0 304 | # via ipykernel 305 | networkx==3.3 306 | # via kedro-viz 307 | notebook-shim==0.2.4 308 | # via jupyterlab 309 | numpy==1.26.4 310 | # via 311 | # ibis-framework 312 | # pandas 313 | # pyarrow 314 | # scikit-learn 315 | # scipy 316 | oauthlib==3.2.2 317 | # via requests-oauthlib 318 | omegaconf==2.3.0 319 | # via kedro 320 | orjson==3.10.3 321 | # via 322 | # fastapi 323 | # kedro-viz 324 | overrides==7.7.0 325 | # via jupyter-server 326 | packaging==23.2 327 | # via 328 | # build 329 | # ibis-framework 330 | # ipykernel 331 | # jupyter-server 332 | # jupyterlab 333 | # jupyterlab-server 334 | # kedro-viz 335 | # lazy-loader 336 | # nbconvert 337 | # plotly 338 | # pytoolconfig 339 | pandas==2.2.2 340 | # via 341 | # ibis-framework 342 | # kedro-viz 343 | # pins 344 | pandocfilters==1.5.1 345 | # via nbconvert 346 | parse==1.20.1 347 | # via kedro 348 | parso==0.8.4 349 | # via jedi 350 | parsy==2.1 351 | # via ibis-framework 352 | pexpect==4.9.0 353 | # via ipython 354 | pins==0.8.6 355 | # via ibis-framework 356 | platformdirs==4.2.1 357 | # via 358 | # jupyter-core 359 | # pytoolconfig 360 | plotly==5.22.0 361 | # via kedro-viz 362 | pluggy==1.5.0 363 | # via kedro 364 | polars==0.20.23 365 | # via ibis-framework 366 | pre-commit-hooks==4.6.0 367 | # via kedro 368 | prometheus-client==0.20.0 369 | # via jupyter-server 370 | prompt-toolkit==3.0.43 371 | # via ipython 372 | proto-plus==1.23.0 373 | # via google-api-core 374 | protobuf==4.25.3 375 | # via 376 | # google-api-core 377 | # googleapis-common-protos 378 | # proto-plus 379 | psutil==5.9.8 380 | # via ipykernel 381 | psycopg2==2.9.9 382 | # via ibis-framework 383 | ptyprocess==0.7.0 384 | # via 385 | # pexpect 386 | # terminado 387 | pure-eval==0.2.2 388 | # via stack-data 389 | pyarrow==16.0.0 390 | # via ibis-framework 391 | pyarrow-hotfix==0.6 392 | # via ibis-framework 393 | pyasn1==0.6.0 394 | # via 395 | # pyasn1-modules 396 | # rsa 397 | pyasn1-modules==0.4.0 398 | # via google-auth 399 | pycparser==2.22 400 | # via cffi 401 | pydantic==2.7.3 402 | # via 403 | # fastapi 404 | # kedro-viz 405 | pydantic-core==2.18.4 406 | # via pydantic 407 | pygments==2.18.0 408 | # via 409 | # ipython 410 | # nbconvert 411 | # rich 412 | pyproject-hooks==1.1.0 413 | # via build 414 | python-dateutil==2.9.0.post0 415 | # via 416 | # arrow 417 | # ibis-framework 418 | # jupyter-client 419 | # pandas 420 | # strawberry-graphql 421 | python-dotenv==1.0.1 422 | # via uvicorn 423 | python-json-logger==2.0.7 424 | # via jupyter-events 425 | python-multipart==0.0.9 426 | # via fastapi 427 | python-slugify==8.0.4 428 | # via cookiecutter 429 | pytoolconfig==1.3.1 430 | # via rope 431 | pytz==2024.1 432 | # via 433 | # ibis-framework 434 | # pandas 435 | pyyaml==6.0.1 436 | # via 437 | # cookiecutter 438 | # jupyter-events 439 | # kedro 440 | # omegaconf 441 | # pins 442 | # uvicorn 443 | pyzmq==26.0.3 444 | # via 445 | # ipykernel 446 | # jupyter-client 447 | # jupyter-server 448 | referencing==0.35.1 449 | # via 450 | # jsonschema 451 | # jsonschema-specifications 452 | # jupyter-events 453 | requests==2.31.0 454 | # via 455 | # cookiecutter 456 | # gcsfs 457 | # google-api-core 458 | # google-cloud-storage 459 | # jupyterlab-server 460 | # kedro-telemetry 461 | # pins 462 | # requests-oauthlib 463 | 
requests-oauthlib==2.0.0 464 | # via google-auth-oauthlib 465 | rfc3339-validator==0.1.4 466 | # via 467 | # jsonschema 468 | # jupyter-events 469 | rfc3986-validator==0.1.1 470 | # via 471 | # jsonschema 472 | # jupyter-events 473 | rich==13.7.1 474 | # via 475 | # cookiecutter 476 | # ibis-framework 477 | # kedro 478 | # typer 479 | rope==1.13.0 480 | # via kedro 481 | rpds-py==0.18.1 482 | # via 483 | # jsonschema 484 | # referencing 485 | rsa==4.9 486 | # via google-auth 487 | ruamel-yaml==0.18.6 488 | # via pre-commit-hooks 489 | ruamel-yaml-clib==0.2.8 490 | # via ruamel-yaml 491 | scikit-learn==1.5.0 492 | # via -r demo/delay-prediction/requirements.txt 493 | scipy==1.13.0 494 | # via scikit-learn 495 | secure==0.3.0 496 | # via kedro-viz 497 | send2trash==1.8.3 498 | # via jupyter-server 499 | shellingham==1.5.4 500 | # via typer 501 | six==1.16.0 502 | # via 503 | # asttokens 504 | # bleach 505 | # python-dateutil 506 | # rfc3339-validator 507 | smmap==5.0.1 508 | # via gitdb 509 | sniffio==1.3.1 510 | # via 511 | # anyio 512 | # httpx 513 | soupsieve==2.5 514 | # via beautifulsoup4 515 | sqlalchemy==2.0.30 516 | # via kedro-viz 517 | sqlglot==23.12.2 518 | # via ibis-framework 519 | stack-data==0.6.3 520 | # via ipython 521 | starlette==0.37.2 522 | # via fastapi 523 | strawberry-graphql==0.234.2 524 | # via kedro-viz 525 | tenacity==8.2.3 526 | # via plotly 527 | terminado==0.18.1 528 | # via 529 | # jupyter-server 530 | # jupyter-server-terminals 531 | text-unidecode==1.3 532 | # via python-slugify 533 | threadpoolctl==3.5.0 534 | # via scikit-learn 535 | tinycss2==1.3.0 536 | # via nbconvert 537 | toml==0.10.2 538 | # via kedro 539 | toolz==0.12.1 540 | # via ibis-framework 541 | toposort==1.10 542 | # via kedro-viz 543 | tornado==6.4 544 | # via 545 | # ipykernel 546 | # jupyter-client 547 | # jupyter-server 548 | # jupyterlab 549 | # terminado 550 | traitlets==5.14.3 551 | # via 552 | # comm 553 | # ipykernel 554 | # ipython 555 | # jupyter-client 556 | # jupyter-core 557 | # jupyter-events 558 | # jupyter-server 559 | # jupyterlab 560 | # matplotlib-inline 561 | # nbclient 562 | # nbconvert 563 | # nbformat 564 | typer==0.12.3 565 | # via fastapi-cli 566 | types-python-dateutil==2.9.0.20240316 567 | # via arrow 568 | typing-extensions==4.11.0 569 | # via 570 | # fastapi 571 | # ibis-framework 572 | # ipython 573 | # pydantic 574 | # pydantic-core 575 | # sqlalchemy 576 | # strawberry-graphql 577 | # typer 578 | tzdata==2024.1 579 | # via pandas 580 | ujson==5.10.0 581 | # via fastapi 582 | uri-template==1.3.0 583 | # via jsonschema 584 | urllib3==2.2.1 585 | # via requests 586 | uvicorn==0.29.0 587 | # via 588 | # fastapi 589 | # kedro-viz 590 | uvloop==0.19.0 591 | # via uvicorn 592 | watchfiles==0.22.0 593 | # via uvicorn 594 | watchgod==0.8.2 595 | # via kedro-viz 596 | wcwidth==0.2.13 597 | # via prompt-toolkit 598 | webcolors==1.13 599 | # via jsonschema 600 | webencodings==0.5.1 601 | # via 602 | # bleach 603 | # tinycss2 604 | websocket-client==1.8.0 605 | # via jupyter-server 606 | websockets==12.0 607 | # via uvicorn 608 | xxhash==3.4.1 609 | # via pins 610 | yarl==1.9.4 611 | # via aiohttp 612 | zipp==3.18.1 613 | # via importlib-metadata 614 | -------------------------------------------------------------------------------- /03 - First Steps with Kedro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "editable": true, 7 | "slideshow": { 8 | 
"slide_type": "" 9 | }, 10 | "tags": [] 11 | }, 12 | "source": [ 13 | "# First Steps with Kedro\n", 14 | "\n", 15 | "\"Kedro\"\n", 16 | "\n", 17 | "**Goal**: Create a classifier that predicts whether a flight will be delayed or not, using the [nycflights13 data](https://github.com/hadley/nycflights13)." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "editable": true, 24 | "slideshow": { 25 | "slide_type": "notes" 26 | }, 27 | "tags": [] 28 | }, 29 | "source": [ 30 | "To see the end result,\n", 31 | "\n", 32 | "```\n", 33 | "$ cd demo/delay-prediction\n", 34 | "$ kedro viz run\n", 35 | "```\n", 36 | "\n", 37 | "\"Kedro" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import ibis\n", 47 | "\n", 48 | "ibis.options.interactive = True" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## The `DataCatalog`\n", 56 | "\n", 57 | "Kedro’s [Data Catalog](https://docs.kedro.org/en/latest/data/) is a registry of all data sources available for use by the project. It offers a separate place to declare details of the datasets your projects use. Kedro provides built-in datasets for different file types and file systems so you don’t have to write any of the logic for reading or writing data.\n", 58 | "\n", 59 | "Kedro offers a range of datasets, including CSV, Excel, Parquet, Feather, HDF5, JSON, Pickle, SQL Tables, SQL Queries, Spark DataFrames, and more. They are supported with the APIs of pandas, spark, networkx, matplotlib, yaml, and beyond. It relies on fsspec to read and save data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. You can pass arguments in to load and save operations, and use versioning and credentials for data access.\n", 60 | "\n", 61 | "To start using the Data Catalog, create an instance of the `DataCatalog` class with a dictionary configuration as follows:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from kedro.io import DataCatalog" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "catalog = DataCatalog.from_config(\n", 80 | " {\n", 81 | " \"flights\": {\n", 82 | " \"type\": \"ibis.TableDataset\",\n", 83 | " \"table_name\": \"flights\",\n", 84 | " \"connection\": {\n", 85 | " \"backend\": \"duckdb\",\n", 86 | " \"database\": \"nycflights13.ddb\",\n", 87 | " \"read_only\": True,\n", 88 | " },\n", 89 | " }\n", 90 | " }\n", 91 | ")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Each entry in the dictionary represents a **dataset**, and each dataset has a **type** as well as some extra properties. Datasets are Python classes that take care of all the I/O needs in Kedro. 
In this case, we're using `kedro_datasets.ibis.TableDataset`, you can read [its full documentation](https://docs.kedro.org/projects/kedro-datasets/en/kedro-datasets-3.0.1/api/kedro_datasets.ibis.TableDataset.html) online.\n", 99 | "\n", 100 | "After the catalog is created, `catalog.list()` will yield a list of the available dataset names, which you can load using the `catalog.load()` method:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "catalog.list()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "flights = catalog.load(\"flights\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "Notice that the resulting object is the exact same Ibis table we were using in the previous tutorial!" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "type(flights)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "flights" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## The `OmegaConfigLoader`\n", 151 | "\n", 152 | "Instead of creating the Data Catalog by hand like this, Kedro usually stores configuration in YAML files. To load them, Kedro offers a [configuration loader](https://docs.kedro.org/en/latest/configuration/configuration_basics.html) based on the [Omegaconf](https://omegaconf.readthedocs.io/) library called the `OmegaConfigLoader`. This adds several interesting features, such as\n", 153 | "\n", 154 | "- Consolidating different configuration files into one\n", 155 | "- Substitution, templating\n", 156 | "- [Resolvers](https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html)\n", 157 | "- And [much more](https://docs.kedro.org/en/latest/configuration/advanced_configuration.html)\n", 158 | "\n", 159 | "To start using it, first dump the catalog configuration to a `catalog.yml` file, and then use `OmegaConfigLoader` as follows:" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%writefile catalog.yml\n", 169 | "flights:\n", 170 | " type: ibis.TableDataset\n", 171 | " table_name: flights\n", 172 | " connection:\n", 173 | " backend: duckdb\n", 174 | " database: nycflights13.ddb\n", 175 | " read_only: true" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from kedro.config import OmegaConfigLoader\n", 185 | "\n", 186 | "config_loader = OmegaConfigLoader(\n", 187 | " conf_source=\".\", # Directory where configuration files are located\n", 188 | " config_patterns={\"catalog\": [\"catalog.yml\"]}, # For simplicity for this demo\n", 189 | ")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "catalog_config = config_loader.get(\"catalog\")\n", 199 | "catalog_config" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "As you can see, `config_loader.get(\"catalog\")` gets you the same dictionary we crafted by hand earlier.\n", 207 | 
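The loader isn't limited to the catalog, either: each key in `config_patterns` defines its own configuration group. A hypothetical sketch (this demo directory only contains `catalog.yml`, but a full project would also carry parameter files):

```python
# Hypothetical: register a pattern for parameter files alongside the catalog.
config_loader = OmegaConfigLoader(
    conf_source=".",
    config_patterns={
        "catalog": ["catalog.yml"],
        "parameters": ["parameters*.yml"],  # e.g. parameters.yml, parameters_training.yml
    },
)
# In a project that has such files (see demo/delay-prediction/conf/base/),
# config_loader.get("parameters") returns their merged contents as a single dictionary.
```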
"\n", 208 | "However, hardcoding the database path like that seems like an invitation to trouble. Let's declare a variable `_root` inside the YAML file using Omegaconf syntax and load the catalog config again:" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "%%writefile catalog.yml\n", 218 | "_root: /workspaces/kedro-ibis-tutorial\n", 219 | "\n", 220 | "flights:\n", 221 | " type: ibis.TableDataset\n", 222 | " table_name: flights\n", 223 | " connection:\n", 224 | " backend: duckdb\n", 225 | " database: \"${_root}/nycflights13.ddb\"\n", 226 | " read_only: true" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "catalog_config = config_loader.get(\"catalog\")\n", 236 | "catalog_config" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "catalog = DataCatalog.from_config(catalog_config)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "catalog.load(\"flights\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## Nodes and pipelines\n", 262 | "\n", 263 | "Now comes the interesting part. Kedro structures the computation on Directed Acyclic Graphs (DAGs), which are created by instantiating `Pipeline` objects with a list of `Node`s. By linking the inputs and outpus of each node, Kedro is then able to perform a topological sort and produce a graph.\n", 264 | "\n", 265 | "Let's start creating a trivial pipeline with 1 node. That 1 node will be a preprocessing function that will manipulate the `dep_time`, `arr_delay`, and `air_time` columns." 
266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "def preprocess_flights(table):\n", 275 | " return table.mutate(\n", 276 | " dep_time=(\n", 277 | " table.dep_time.lpad(4, \"0\").substr(0, 2)\n", 278 | " + \":\"\n", 279 | " + table.dep_time.substr(-2, 2)\n", 280 | " + \":00\"\n", 281 | " ).try_cast(\"time\"),\n", 282 | " arr_delay=table.arr_delay.try_cast(int),\n", 283 | " air_time=table.air_time.try_cast(int),\n", 284 | " )" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "flights.select(\"year\", \"month\", \"day\", \"dep_time\")" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "preprocess_flights(flights).select(\"year\", \"month\", \"day\", \"dep_time\")" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "Notice that this is a plain Python function, receiving an Ibis table and returning another Ibis table.\n", 310 | "\n", 311 | "Now, let's wrap it using the `node` convenience function from Kedro:" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "from kedro.pipeline import node\n", 321 | "\n", 322 | "n0 = node(func=preprocess_flights, inputs=\"flights\", outputs=\"preprocessed_flights\")\n", 323 | "n0" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "Conceptually, a `Node` is a wrapper around a Python function that defines a single step in a pipeline. It has inputs and outputs, which are the names of the Data Catalog datasets that the function will receive and return, respectively. Therefore, you could execute it as follows:\n", 331 | "\n", 332 | "```python\n", 333 | "n0.func(\n", 334 | " *[catalog.load(input_dataset) for input_dataset in n0.inputs],\n", 335 | ")\n", 336 | "```\n", 337 | "\n", 338 | "Let's not do that though; Kedro will take care of it.\n", 339 | "\n", 340 | "The next step is to assemble the pipeline. In this case, it will only have 1 node:" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "from kedro.pipeline import pipeline\n", 350 | "\n", 351 | "pipe = pipeline([n0])\n", 352 | "pipe" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "And finally, you can now execute the pipeline. For the purposes of this tutorial, you can use Kedro's `SequentialRunner` directly:" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "from kedro.runner import SequentialRunner\n", 369 | "\n", 370 | "outputs = SequentialRunner().run(pipe, catalog=catalog)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "The output of the `.run(...)` method will be \"Any node outputs that cannot be processed by the `DataCatalog`\". 
Since `preprocessed_flights` is not declared in the Data Catalog, it's right there in the dictionary:" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "outputs.keys()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "outputs[\"preprocessed_flights\"]" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Exercises\n", 403 | "\n", 404 | "### Exercise 1\n", 405 | "\n", 406 | "Complete the `catalog.yml` so that `weather` is included as well.\n", 407 | "\n", 408 | "_Extra points_ if you factor the connection details in a variable." 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "%load solutions/nb03_ex01_catalog.yml" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "### Exercise 2\n", 425 | "\n", 426 | "Complete the data processing pipeline by defining a `create_model_input_table` function that combines the preprocessed flights and weather data:\n", 427 | "\n", 428 | "```python\n", 429 | "def create_model_input_table(flights, weather) -> ir.Table:\n", 430 | " ...\n", 431 | "```\n", 432 | "\n", 433 | "_Hint_: See the `join` explanation in the Ibis notebook.\n", 434 | "\n", 435 | "Then, recreate the pipeline so that it has two nodes.\n", 436 | "\n", 437 | "_Extra points_ if your node drops the null values of the resulting table and selects only a subset of the columns." 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "%load solutions/nb03_ex02.py" 447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3 (ipykernel)", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.11.9" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 4 471 | } 472 | -------------------------------------------------------------------------------- /01 - Getting Started with Ibis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started with Ibis\n", 8 | "\n", 9 | "In the [previous notebook](./00%20-%20Welcome.ipynb), we created a DuckDB database file with the [nycflights13 data](https://github.com/hadley/nycflights13). DuckDB is fast and runs locally, so it's handy for lots of use cases, including tutorials. Let's begin by importing Ibis and connecting to the database." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import ibis\n", 19 | "\n", 20 | "con = ibis.duckdb.connect(\"nycflights13.ddb\", read_only=True)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "**Note**: When you connect to a DuckDB database file, DuckDB creates a WAL file to prevent data corruption. 
If you see a `nycflights13.ddb.wal` file, you can safely ignore it. It will get cleaned up automatically.\n", 28 | "\n", 29 | "Now we have a connection, we can start by looking around. Are there any tables in this database?" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "con.list_tables()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Two, in fact! Let's take a look at the `flights` table first." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "flights = con.table(\"flights\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "By default, you'll get a printable representation of the table schema, showing the name and data type of each column." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "flights" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "If we call the `head` method to peek at the data, you'll notice that we don't actually see data (yet); what's going on?" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "flights.head()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Ibis has a deferred execution model. It builds up expressions based on what you ask it to do, and then executes those expressions on request.\n", 94 | "\n", 95 | "In this case, our query isn't too involved; we want to see the first few rows of the `flights` table. We can do that by asking for the results of this query as a `pandas.DataFrame`:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "flights.head().to_pandas()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Or a `pyarrow.Table`:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "flights.head().to_pyarrow()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Or a `polars.DataFrame`:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "flights.head().to_polars()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "We'll get into more detail about what Ibis is doing a bit later on. For now, the important point is that Ibis is deferred.\n", 144 | "\n", 145 | "## Interactive mode\n", 146 | "\n", 147 | "Remember when we said Ibis is deferred? Sometimes you want eager execution so you can explore a dataset. For the rest of this notebook, we'll turn on interactive mode, where Ibis will eagerly execute as much of the query as it needs to in order to show you the first 10 rows of the result." 
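Deferred execution also means you can always inspect the SQL Ibis would send without running anything, using `ibis.to_sql` (we'll use it again later in this notebook). A quick sketch:

```python
# Compile without executing: shows the SQL that would be sent to DuckDB.
print(ibis.to_sql(flights.head()))
```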
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "ibis.options.interactive = True" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "In interactive mode, we use `rich` to render the output inline:" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "flights.head()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Tables and columns\n", 180 | "\n", 181 | "`flights` is a table! A table is a collection of one or more columns, each with a specific datatype." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "flights" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "type(flights)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "We can look at a single column of that table using the column name as an attribute:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "flights.carrier" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "What kind of column is `carrier`? It's a `StringColumn`!" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "type(flights.carrier)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## Ibis \"verbs\", or, stuff you can do to a table\n", 239 | "\n", 240 | "The rest of this notebook covers some of the general methods you can use to alter the output of a particular table.\n", 241 | "\n", 242 | "We'll cover, in order, `filter`, `select`, `drop`, `mutate`, `order_by`, `aggregate`, and `group_by`. Time to dive in!\n", 243 | "\n", 244 | "## Filter\n", 245 | "\n", 246 | "A filter allows you to view a subset of the rows in a table, based on some condition.\n", 247 | "\n", 248 | "For instance, we might want to only view data for JetBlue flights:" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "flights.filter(flights.carrier == \"B6\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "You can also combine multiple filters, across multiple columns.\n", 265 | "\n", 266 | "We can subset the data down to JetBlue flights from JFK:" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "scrolled": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "expr = flights.filter((flights.carrier == \"B6\") & (flights.origin == \"JFK\"))\n", 278 | "expr" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "Above, we combined two filters using `&`. 
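`&` is a logical AND; `|` gives you an OR in the same style. A quick sketch, reusing the `flights` table from above:

```python
# JetBlue or Delta flights; the parentheses matter, because | binds more
# tightly than ==.
flights.filter((flights.carrier == "B6") | (flights.carrier == "DL"))
```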
You can also pass them in as individual arguments:" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "expr = flights.filter(\n", 295 | " flights.carrier == \"B6\",\n", 296 | " flights.origin == \"JFK\",\n", 297 | ")\n", 298 | "expr" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "ibis.to_sql(expr)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "## Select\n", 315 | "\n", 316 | "Filter filters, Select selects (there's a pattern here).\n", 317 | "If you only want a subset of the columns in the original table, you can select\n", 318 | "those columns explicitly.\n", 319 | "\n", 320 | "You can refer to the columns using strings:" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "flights.select(\"carrier\", \"origin\", \"dest\")" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "Or you can use explicit references to the `Column` objects:" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": {}, 343 | "outputs": [], 344 | "source": [ 345 | "flights.select(flights.carrier, flights.origin, flights.dest)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "Or you can mix and match:" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "flights.select(\"carrier\", \"origin\", flights.dest)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "## Drop\n", 369 | "\n", 370 | "Drop is nearly the same as Select, but rather than explicitly choosing the columns to display, we explicitly choose the columns to _not_ display.\n", 371 | "\n", 372 | "And as with `select`, you can specify the columns as strings:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "flights.drop(\"flight\", \"tailnum\")" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Or you can use explicit references to the `Column` objects:" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "flights.drop(flights.flight, flights.tailnum)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "Or you can mix and match:" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "flights.drop(\"flight\", flights.tailnum)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Mutate\n", 421 | "\n", 422 | "Everything we've seen so far has been subtractive—removing rows or columns. What about _adding_ columns?\n", 423 | "\n", 424 | "That's what `mutate` is for! 
You can create a new column as a function of other existing columns (for example, converting units):" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "flights.mutate(distance_km=flights.distance * 1.609)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "Or you can create a new column and populate it with some literal value:" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "flights.mutate(my_favorite_number=ibis.literal(41))" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## On immutability\n", 457 | "\n", 458 | "We've filtered, selected, dropped, and mutated this `flights` table quite a bit." 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "flights" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "And yet, notice that none of our changes persist—the base table for our query isn't altered. The query (or expression) is a recipe of things to do with the base table (`flights`).\n", 475 | "\n", 476 | "If you want to keep an expression around, you can assign it to a variable:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "expr = flights.select(\"carrier\", \"origin\")\n", 486 | "expr" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "**Note**: Every time you execute an expression (via interactive mode, or `to_pandas`, or similar), the entire expression gets executed, starting from the base table. DuckDB is very fast and this dataset is very small, so the delay is unnoticeable, but for very large datasets, it might become more pronounced. There is functionality to `cache` intermediate results that isn't covered in this tutorial, but you can [read more about it in the docs](https://ibis-project.org/reference/expression-tables.html#ibis.expr.types.relations.Table.cache).\n", 494 | "\n", 495 | "## Method chaining\n", 496 | "\n", 497 | "You can build up complicated queries by chaining together Ibis methods. The output of many Ibis methods is a table (just like `flights`!) and we can continue calling table methods until we're satisfied. Or until we end up with something that _isn't_ a table. More on that later." 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "flights.select(\"carrier\", \"origin\", \"dest\").drop(\"carrier\")" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "Not the most complicated (or useful) query, but we'll see more soon.\n", 514 | "\n", 515 | "### Exercise 1\n", 516 | "\n", 517 | "Convert the `distance` column from miles to kilometers. For an approximate result, multiply by 1.609.\n", 518 | "\n", 519 | "Two ways you might accomplish this:\n", 520 | "\n", 521 | "- Chaining `.mutate` to create the new column and `.drop` to drop the original imperial column\n", 522 | "- Using a single `.select` to create the new column as well as select the remaining columns\n", 523 | "\n", 524 | "Try both ways below! 
How do they compare?" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "# Convert the imperial units to metric, and drop the imperial columns.\n", 534 | "# Try this using a `.mutate` and `.drop` call.\n", 535 | "flights_metric_mutate_drop = flights" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "# Convert the imperial units to metric, and drop the imperial columns.\n", 545 | "# Try this using a single `.select` call.\n", 546 | "flights_metric_select = flights" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": {}, 552 | "source": [ 553 | "#### Solutions" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "%load solutions/nb01_ex01_mutate_drop.py" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "%load solutions/nb01_ex01_select.py" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "#### Does it matter which method you choose?\n", 579 | "\n", 580 | "In this case, no. Sometimes, there might be a small difference in the generated SQL, but they will be semantically equivalent." 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "ibis.to_sql(flights_metric_mutate_drop)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "ibis.to_sql(flights_metric_select)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "In practice, small differences in the generated SQL don't make a difference. Any modern SQL execution engine will optimize variations to the same set of operations, and there will be no measurable performance difference.\n", 606 | "\n", 607 | "## Order by\n", 608 | "\n", 609 | "Want to order your data by a given column or columns? Use `order_by`!\n", 610 | "\n", 611 | "The default ordering direction is ascending:" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "flights.order_by(flights.distance)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "We can ask Ibis to sort in descending order, too." 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": { 634 | "scrolled": true 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "flights.order_by(flights.distance.desc())" 639 | ] 640 | }, 641 | { 642 | "cell_type": "markdown", 643 | "metadata": {}, 644 | "source": [ 645 | "Let's select out a subset of the columns to keep this a bit tidier." 
646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "flights.order_by(flights.distance.desc()).select(\n", 655 | " \"carrier\", \"origin\", \"dest\", \"distance\"\n", 656 | ")" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "You can also call `ibis.desc` on the column name to set the order direction:" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "flights.order_by(ibis.desc(\"distance\")).select(\"carrier\", \"origin\", \"dest\", \"distance\")" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "## Aggregate\n", 680 | "\n", 681 | "Ibis has several aggregate functions available to help summarize data. All the old favorites are there: `mean`, `max`, `min`, `count`, `sum`...\n", 682 | "\n", 683 | "You can aggregate a column by calling the method on that column:" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "flights.distance.mean()" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "Or you can compute multiple aggregates using the `aggregate` method (also\n", 700 | "available as `agg` for faster typing):" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "flights.agg([flights.distance.mean(), flights.air_time.min()])" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "If you don't like the column names Ibis generates for you, choose your own!" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "flights.agg(\n", 726 | " average_distance=flights.distance.mean(),\n", 727 | " shortest_air_time=flights.air_time.min(),\n", 728 | ")" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "But aggregates really shine when paired with a `group_by`!\n", 736 | "\n", 737 | "## Group by\n", 738 | "\n", 739 | "`group_by` creates groupings of rows that have the same value for one or more columns.\n", 740 | "\n", 741 | "But it doesn't do much on its own—you can pair it with `agg` to get a result." 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": null, 747 | "metadata": {}, 748 | "outputs": [], 749 | "source": [ 750 | "flights.group_by(\"carrier\").agg()" 751 | ] 752 | }, 753 | { 754 | "cell_type": "markdown", 755 | "metadata": {}, 756 | "source": [ 757 | "Without any aggregate function specified, we get the distinct values of the grouped column.\n", 758 | "\n", 759 | "We can add a second column to the `group_by` to get the distinct pairs across both columns:" 760 | ] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": {}, 766 | "outputs": [], 767 | "source": [ 768 | "flights.group_by([\"carrier\", \"origin\"]).agg()" 769 | ] 770 | }, 771 | { 772 | "cell_type": "markdown", 773 | "metadata": {}, 774 | "source": [ 775 | "Now, if we add an aggregation function to that, we start to really open things up." 
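As an aside before we do: a plain row count per group is often the first aggregate you reach for, and it doesn't need any particular column. A small sketch, again reusing `flights`:

```python
# Number of flights per carrier/origin pair, biggest groups first.
flights.group_by(["carrier", "origin"]).agg(n_flights=flights.count()).order_by(
    ibis.desc("n_flights")
)
```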
776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "flights.group_by([\"carrier\", \"origin\"]).agg(flights.distance.mean())" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "By adding that `mean` to the `aggregate`, we now have a concise way to calculate aggregates over each of the distinct groups in the `group_by`. And we can calculate as many aggregates as we need." 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [ 800 | "flights.group_by([\"carrier\", \"origin\"]).agg(\n", 801 | " [flights.distance.mean(), flights.air_time.min()]\n", 802 | ")" 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "If we need more specific groups, we can add to the `group_by`." 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "flights.group_by([\"carrier\", \"origin\", \"dest\"]).agg(\n", 819 | " [flights.distance.mean(), flights.air_time.min()]\n", 820 | ")" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "## Cast\n", 828 | "\n", 829 | "Sometimes when you parse data, _especially_ from CSVs, the types get a bit messed up. Or you might be loading in a `parquet` file where everything is defined as a `string`. We can clean that up pretty quickly.\n", 830 | "\n", 831 | "You can cast from floats to ints:" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "metadata": {}, 838 | "outputs": [], 839 | "source": [ 840 | "(flights.distance * 1.609).cast(\"int32\")" 841 | ] 842 | }, 843 | { 844 | "cell_type": "markdown", 845 | "metadata": {}, 846 | "source": [ 847 | "And from ints to floats:" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "flights.year.cast(\"float64\") # this is a terrible idea" 857 | ] 858 | }, 859 | { 860 | "cell_type": "markdown", 861 | "metadata": {}, 862 | "source": [ 863 | "You can cast numeric columns to strings:" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "flights.year.cast(\"str\") # or \"string\"" 873 | ] 874 | }, 875 | { 876 | "cell_type": "markdown", 877 | "metadata": {}, 878 | "source": [ 879 | "And numeric strings to numbers:" 880 | ] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "execution_count": null, 885 | "metadata": {}, 886 | "outputs": [], 887 | "source": [ 888 | "flights.year.cast(\"str\").cast(\"int64\")" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "But Ibis will yell if you try to cast a non-numeric string to a number:" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "flights.carrier.cast(\"int32\")" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "If we know that a column _should_ have a particular data type, but don't want a few bad apples (rows) to spoil the bunch, `try_cast` will fall back to `NULL` or `NaN` for values where the cast fails:" 912 | ] 913 | }, 914 | { 915 | 
"cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "flights.arr_delay.try_cast(int)" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "metadata": {}, 926 | "source": [ 927 | "## Drop NA\n", 928 | "\n", 929 | "Does what it says on the box—drop the `NULL`s from a dataset." 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": null, 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "flights.dropna()" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "## Exercises\n", 946 | "\n", 947 | "Time to use what we've learned to answer some flight questions.\n", 948 | "\n", 949 | "### Exercise 2\n", 950 | "\n", 951 | "Which airlines (`carrier`) had the longest average arrival delays (`arr_delay`) in June 2013?\n", 952 | "\n", 953 | "#### Solution\n", 954 | "\n", 955 | "Note that there are several ways these queries could be written—it's fine if your solution doesn't look like ours, as long as the results are the same." 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": null, 961 | "metadata": {}, 962 | "outputs": [], 963 | "source": [ 964 | "%load solutions/nb01_ex02.py" 965 | ] 966 | }, 967 | { 968 | "cell_type": "markdown", 969 | "metadata": {}, 970 | "source": [ 971 | "### Exercise 3\n", 972 | "\n", 973 | "Which NYC airport has the lowest percentage of outbound flights arriving 30 or more minutes late?\n", 974 | "\n", 975 | "#### Solution" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "%load solutions/nb01_ex03.py" 985 | ] 986 | }, 987 | { 988 | "cell_type": "markdown", 989 | "metadata": {}, 990 | "source": [ 991 | "## A brief digression on the SQL Ibis generates\n", 992 | "\n", 993 | "Maybe you've heard that SQL has a standard? This is true, and also misleading. The SQL standard is more of a suggestion, and there are myriad SQL _dialects_.\n", 994 | "\n", 995 | "Ibis compiles expressions into the appropriate SQL dialect for the backend you are using. In this case, we started with a DuckDB table, so we get DuckDB SQL:" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [ 1004 | "ibis.to_sql(flights_metric_mutate_drop)" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": {}, 1010 | "source": [ 1011 | "But if you want to use a _different_ dialect, you can pass the dialect name:" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": null, 1017 | "metadata": {}, 1018 | "outputs": [], 1019 | "source": [ 1020 | "ibis.to_sql(flights_metric_mutate_drop, dialect=\"postgres\")" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Join\n", 1028 | "\n", 1029 | "No dataframe library is complete without joins! Ibis supports several kinds of joins between table expressions: `inner_join`, `cross_join`, `left_join`, `outer_join`, `semi_join`, and `anti_join`. The `join` table method is, by default, the same as `inner_join`.\n", 1030 | "\n", 1031 | "Remember the other table in our database?" 
1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "metadata": {}, 1038 | "outputs": [], 1039 | "source": [ 1040 | "weather = con.table(\"weather\")\n", 1041 | "weather" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "markdown", 1046 | "metadata": {}, 1047 | "source": [ 1048 | "We can join the two tables on the `origin` column:" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": {}, 1055 | "outputs": [], 1056 | "source": [ 1057 | "flights.join(weather, \"origin\")" 1058 | ] 1059 | }, 1060 | { 1061 | "cell_type": "markdown", 1062 | "metadata": {}, 1063 | "source": [ 1064 | "Of course, we should only join on the weather at the time corresponding to each flight:" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": {}, 1071 | "outputs": [], 1072 | "source": [ 1073 | "flights.join(weather, [\"origin\", \"time_hour\"])" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "markdown", 1078 | "metadata": {}, 1079 | "source": [ 1080 | "The `on` condition can also be specified as an expression, which is particularly useful if you have columns with different names or non-equi-join logic." 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": null, 1086 | "metadata": {}, 1087 | "outputs": [], 1088 | "source": [ 1089 | "renamed = weather.rename(location=\"origin\")\n", 1090 | "flights.join(\n", 1091 | " renamed,\n", 1092 | " (flights.origin != renamed.location) & (flights.time_hour == renamed.time_hour),\n", 1093 | ")" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "metadata": {}, 1099 | "source": [ 1100 | "The Ibis `join` syntax is quite expressive, so we won't cover all the variations now; for more examples, read the [docs](https://ibis-project.org/reference/expression-tables#ibis.expr.types.relations.Table.join).\n", 1101 | "\n", 1102 | "Before we move on, let's terminate the DuckDB connection for good measure. If you skip this step, you may run into an error later on in the tutorial:\n", 1103 | "\n", 1104 | " IO Error: Could not set lock on file \"/workspaces/kedro-ibis-tutorial/nycflights13.ddb\": Conflicting lock is held in /usr/local/bin/python3.11 (PID 1234). However, you would be able to open this database in read-only mode, e.g. by using the -readonly parameter in the CLI. See also https://duckdb.org/docs/connect/concurrency" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "code", 1109 | "execution_count": null, 1110 | "metadata": {}, 1111 | "outputs": [], 1112 | "source": [ 1113 | "con.disconnect()" 1114 | ] 1115 | } 1116 | ], 1117 | "metadata": { 1118 | "kernelspec": { 1119 | "display_name": "Python 3 (ipykernel)", 1120 | "language": "python", 1121 | "name": "python3" 1122 | }, 1123 | "language_info": { 1124 | "codemirror_mode": { 1125 | "name": "ipython", 1126 | "version": 3 1127 | }, 1128 | "file_extension": ".py", 1129 | "mimetype": "text/x-python", 1130 | "name": "python", 1131 | "nbconvert_exporter": "python", 1132 | "pygments_lexer": "ipython3", 1133 | "version": "3.11.9" 1134 | } 1135 | }, 1136 | "nbformat": 4, 1137 | "nbformat_minor": 4 1138 | } 1139 | --------------------------------------------------------------------------------