├── demo
    └── delay-prediction
    │   ├── conf
    │       ├── local
    │       │   └── .gitkeep
    │       ├── base
    │       │   ├── parameters.yml
    │       │   ├── parameters_data_processing.yml
    │       │   ├── parameters_evaluation.yml
    │       │   ├── parameters_training.yml
    │       │   └── catalog.yml
    │       ├── README.md
    │       └── logging.yml
    │   ├── data
    │       ├── 01_raw
    │       │   ├── .gitkeep
    │       │   └── shuttles.xlsx
    │       ├── 03_primary
    │       │   └── .gitkeep
    │       ├── 04_feature
    │       │   └── .gitkeep
    │       ├── 06_models
    │       │   └── .gitkeep
    │       ├── 08_reporting
    │       │   └── .gitkeep
    │       ├── 02_intermediate
    │       │   └── .gitkeep
    │       ├── 05_model_input
    │       │   └── .gitkeep
    │       └── 07_model_output
    │       │   └── .gitkeep
    │   ├── notebooks
    │       └── .gitkeep
    │   ├── tests
    │       ├── __init__.py
    │       ├── pipelines
    │       │   ├── __init__.py
    │       │   └── data_science
    │       │   │   └── test_pipeline.py
    │       └── test_run.py
    │   ├── src
    │       └── delay_prediction
    │       │   ├── pipelines
    │       │       ├── __init__.py
    │       │       ├── model_evaluation
    │       │       │   ├── __init__.py
    │       │       │   ├── pipeline.py
    │       │       │   └── nodes.py
    │       │       ├── model_training
    │       │       │   ├── __init__.py
    │       │       │   ├── pipeline.py
    │       │       │   └── nodes.py
    │       │       └── data_processing
    │       │       │   ├── __init__.py
    │       │       │   ├── pipeline.py
    │       │       │   └── nodes.py
    │       │   ├── __init__.py
    │       │   ├── pipeline_registry.py
    │       │   ├── __main__.py
    │       │   └── settings.py
    │   ├── requirements.txt
    │   ├── docs
    │       └── source
    │       │   ├── index.rst
    │       │   └── conf.py
    │   ├── pyproject.toml
    │   ├── .gitignore
    │   └── README.md
├── static
    ├── qr.png
    ├── deepyaman.jpg
    ├── juanluis.png
    ├── codespaces.png
    ├── kedro-final-pipeline.png
    └── kedro-horizontal-color-on-light.png
├── requirements.txt
├── .gitignore
├── sql
    ├── verify_nycflights13.sql
    ├── load_nycflights13.sql
    └── create_nycflights13.sql
├── solutions
    ├── nb01_ex01_mutate_drop.py
    ├── nb01_ex03.py
    ├── nb03_ex01_catalog.yml
    ├── nb01_ex02.py
    ├── nb01_ex01_select.py
    └── nb03_ex02.py
├── .devcontainer
    ├── Dockerfile
    ├── compose.yaml
    └── devcontainer.json
├── README.md
├── 02 - Switching Backends.ipynb
├── 00 - Welcome.ipynb
├── codespace_requirements.txt
├── 03 - First Steps with Kedro.ipynb
└── 01 - Getting Started with Ibis.ipynb


/demo/delay-prediction/conf/local/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/01_raw/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/notebooks/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/base/parameters.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/03_primary/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/04_feature/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/06_models/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/08_reporting/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/02_intermediate/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/05_model_input/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/07_model_output/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/tests/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/base/parameters_data_processing.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/static/qr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/qr.png


--------------------------------------------------------------------------------
/static/deepyaman.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/deepyaman.jpg


--------------------------------------------------------------------------------
/static/juanluis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/juanluis.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r demo/delay-prediction/requirements.txt
2 | 
3 | certifi>=2024.07.04
4 | jupyterlab==4.1.8
5 | 


--------------------------------------------------------------------------------
/static/codespaces.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/codespaces.png


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/__init__.py:
--------------------------------------------------------------------------------
1 | """Delay Prediction
2 | """
3 | 
4 | __version__ = "0.1"
5 | 


--------------------------------------------------------------------------------
/static/kedro-final-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/kedro-final-pipeline.png


--------------------------------------------------------------------------------
/static/kedro-horizontal-color-on-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/static/kedro-horizontal-color-on-light.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.ipynb_checkpoints
 2 | *.ddb
 3 | *.parquet
 4 | *.wal
 5 | __pycache__
 6 | *.html
 7 | *_files/
 8 | # pixi environments
 9 | .pixi
10 | 
11 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/data/01_raw/shuttles.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ibis-project/kedro-ibis-tutorial/HEAD/demo/delay-prediction/data/01_raw/shuttles.xlsx


--------------------------------------------------------------------------------
/sql/verify_nycflights13.sql:
--------------------------------------------------------------------------------
1 | SELECT 'flights' table_name, COUNT(*) FROM flights UNION
2 | SELECT 'weather' table_name, COUNT(*) FROM weather
3 | ORDER BY table_name;
4 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline import create_pipeline
2 | 
3 | __all__ = ["create_pipeline"]
4 | 
5 | __version__ = "0.1"
6 | 


--------------------------------------------------------------------------------
/sql/load_nycflights13.sql:
--------------------------------------------------------------------------------
1 | ATTACH 'dbname=postgres user=postgres' AS postgres_db (TYPE POSTGRES);
2 | INSERT INTO postgres_db.flights FROM flights;
3 | INSERT INTO postgres_db.weather FROM weather;
4 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_training/__init__.py:
--------------------------------------------------------------------------------
1 | """Complete Data Science pipeline for the spaceflights tutorial"""
2 | 
3 | from .pipeline import create_pipeline  # NOQA
4 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/data_processing/__init__.py:
--------------------------------------------------------------------------------
1 | """Complete Data Processing pipeline for the spaceflights tutorial"""
2 | 
3 | from .pipeline import create_pipeline  # NOQA
4 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/requirements.txt:
--------------------------------------------------------------------------------
1 | ibis-framework[duckdb,polars,examples]~=9.1
2 | ibis-ml
3 | kedro~=0.19.6
4 | kedro-datasets[ibis-duckdb,ibis-postgres]~=3.0.1
5 | kedro-telemetry>=0.3.1
6 | kedro-viz~=9.1.0
7 | scikit-learn~=1.0
8 | 


--------------------------------------------------------------------------------
/solutions/nb01_ex01_mutate_drop.py:
--------------------------------------------------------------------------------
1 | # Convert the imperial units to metric, and drop the imperial columns.
2 | flights_metric_mutate_drop = (
3 |     flights.mutate(distance_km=flights.distance * 1.609).drop("distance")
4 | )
5 | 
6 | flights_metric_mutate_drop
7 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/base/parameters_evaluation.yml:
--------------------------------------------------------------------------------
1 | # This is a boilerplate parameters config generated for pipeline 'evaluation'
2 | # using Kedro 0.19.6.
3 | #
4 | # Documentation for this file format can be found in "Parameters"
5 | # Link: https://docs.kedro.org/en/0.19.6/configuration/parameters.html
6 | 


--------------------------------------------------------------------------------
/solutions/nb01_ex03.py:
--------------------------------------------------------------------------------
 1 | # Which NYC airport has the lowest percentage of outbound flights
 2 | # arriving 30 or more minutes late?
 3 | sol3 = (
 4 |     flights.group_by("origin")
 5 |     .agg((flights.arr_delay.try_cast(int) >= 30).mean())
 6 |     .order_by("Mean(GreaterEqual(TryCast(arr_delay, Int64), 30))")
 7 | )
 8 | 
 9 | sol3
10 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/base/parameters_training.yml:
--------------------------------------------------------------------------------
1 | # This is a boilerplate parameters config generated for pipeline 'training'
2 | # using Kedro 0.19.6.
3 | #
4 | # Documentation for this file format can be found in "Parameters"
5 | # Link: https://docs.kedro.org/en/0.19.6/configuration/parameters.html
6 | 
7 | model_options:
8 |   random_state: 222
9 | 


--------------------------------------------------------------------------------
/solutions/nb03_ex01_catalog.yml:
--------------------------------------------------------------------------------
 1 | _root_folder: /workspaces/kedro-ibis-tutorial
 2 | 
 3 | _connection:
 4 |   backend: duckdb
 5 |   database: "${_root_folder}/nycflights13.ddb"
 6 | 
 7 | flights:
 8 |   type: ibis.TableDataset
 9 |   table_name: flights
10 |   connection: ${_connection}
11 | 
12 | weather:
13 |   type: ibis.TableDataset
14 |   table_name: weather
15 |   connection: ${_connection}
16 | 


--------------------------------------------------------------------------------
/solutions/nb01_ex02.py:
--------------------------------------------------------------------------------
 1 | # Which airlines had the longest average arrival delays in June 2013?
 2 | sol2 = (
 3 |     flights.filter(
 4 |         [
 5 |             flights.month == 6,
 6 |             flights.year == 2013,
 7 |         ]
 8 |     )
 9 |     .group_by("carrier")
10 |     .agg(average_arr_delay=flights.arr_delay.try_cast(int).mean())
11 |     .order_by(ibis.desc("average_arr_delay"))
12 | )
13 | 
14 | sol2
15 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/pipeline.py:
--------------------------------------------------------------------------------
 1 | from kedro.pipeline import Pipeline, node, pipeline
 2 | 
 3 | from .nodes import evaluate_model
 4 | 
 5 | 
 6 | def create_pipeline(**kwargs) -> Pipeline:
 7 |     return pipeline([
 8 |         node(
 9 |             func=evaluate_model,
10 |             inputs=["classifier", "X_test", "y_test"],
11 |             outputs=None,
12 |         ),
13 |     ])
14 | 


--------------------------------------------------------------------------------
/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM mcr.microsoft.com/vscode/devcontainers/python:3.11
 2 | 
 3 | RUN apt-get update -y && \
 4 |     DEBIAN_FRONTEND=noninteractive \
 5 |     apt-get install -y --no-install-recommends graphviz postgresql-client curl ca-certificates && \
 6 |     rm -rf /var/lib/apt/lists/*
 7 | 
 8 | RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 9 | 
10 | COPY codespace_requirements.txt /
11 | 
12 | RUN /root/.cargo/bin/uv pip install --system -r codespace_requirements.txt
13 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. delay_prediction documentation master file, created by sphinx-quickstart.
 2 |    You can adapt this file completely to your liking, but it should at least
 3 |    contain the root `toctree` directive.
 4 | 
 5 | Welcome to project delay_prediction's API docs!
 6 | =============================================
 7 | 
 8 | .. toctree::
 9 |    :maxdepth: 4
10 | 
11 |    modules
12 | 
13 | 
14 | Indices and tables
15 | ==================
16 | 
17 | * :ref:`genindex`
18 | * :ref:`modindex`
19 | * :ref:`search`
20 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipeline_registry.py:
--------------------------------------------------------------------------------
 1 | """Project pipelines."""
 2 | from typing import Dict
 3 | 
 4 | from kedro.framework.project import find_pipelines
 5 | from kedro.pipeline import Pipeline
 6 | 
 7 | 
 8 | def register_pipelines() -> Dict[str, Pipeline]:
 9 |     """Register the project's pipelines.
10 | 
11 |     Returns:
12 |         A mapping from pipeline names to ``Pipeline`` objects.
13 |     """
14 |     pipelines = find_pipelines()
15 |     pipelines["__default__"] = sum(pipelines.values())
16 |     return pipelines
17 | 


--------------------------------------------------------------------------------
/solutions/nb01_ex01_select.py:
--------------------------------------------------------------------------------
 1 | # Convert the imperial units to metric, and drop the imperial columns.
 2 | flights_metric_select = flights.select(
 3 |     "year",
 4 |     "month",
 5 |     "day",
 6 |     "dep_time",
 7 |     "sched_dep_time",
 8 |     "dep_delay",
 9 |     "arr_time",
10 |     "sched_arr_time",
11 |     "arr_delay",
12 |     "carrier",
13 |     "flight",
14 |     "tailnum",
15 |     "origin",
16 |     "dest",
17 |     "air_time",
18 |     "hour",
19 |     "minute",
20 |     "time_hour",
21 |     distance_km=flights.distance * 1.609,
22 | )
23 | 
24 | flights_metric_select
25 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/data_processing/pipeline.py:
--------------------------------------------------------------------------------
 1 | from kedro.pipeline import Pipeline, node, pipeline
 2 | 
 3 | from .nodes import create_model_input_table, preprocess_flights
 4 | 
 5 | 
 6 | def create_pipeline(**kwargs) -> Pipeline:
 7 |     return pipeline(
 8 |         [
 9 |             node(
10 |                 func=preprocess_flights,
11 |                 inputs="flights",
12 |                 outputs="preprocessed_flights",
13 |             ),
14 |             node(
15 |                 func=create_model_input_table,
16 |                 inputs=["preprocessed_flights", "weather"],
17 |                 outputs="model_input_table",
18 |             ),
19 |         ]
20 |     )
21 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_training/pipeline.py:
--------------------------------------------------------------------------------
 1 | from kedro.pipeline import Pipeline, node, pipeline
 2 | 
 3 | from .nodes import split_data, train_model
 4 | 
 5 | 
 6 | def create_pipeline(**kwargs) -> Pipeline:
 7 |     return pipeline(
 8 |         [
 9 |             node(
10 |                 func=split_data,
11 |                 inputs=["model_input_table", "params:model_options.random_state"],
12 |                 outputs=["X_train", "X_test", "y_train", "y_test"],
13 |             ),
14 |             node(
15 |                 func=train_model,
16 |                 inputs=["X_train", "y_train"],
17 |                 outputs="classifier",
18 |             ),
19 |         ]
20 |     )
21 | 


--------------------------------------------------------------------------------
/solutions/nb03_ex02.py:
--------------------------------------------------------------------------------
 1 | def create_model_input_table(flights, weather):
 2 |     return (
 3 |         flights.mutate(
 4 |             arr_delay=flights.arr_delay >= 30,
 5 |             date=flights.time_hour.date(),
 6 |         )
 7 |         .inner_join(weather, ["origin", "time_hour"])
 8 |         .select(
 9 |             "dep_time",
10 |             "flight",
11 |             "origin",
12 |             "dest",
13 |             "air_time",
14 |             "distance",
15 |             "carrier",
16 |             "date",
17 |             "arr_delay",
18 |             "time_hour",
19 |         )
20 |         .dropna()
21 |     )
22 | 
23 | pipe = pipeline([
24 |     n0,
25 |     node(
26 |         func=create_model_input_table,
27 |         inputs=["preprocessed_flights", "weather"],
28 |         outputs="model_input_table",
29 |     ),
30 | ])
31 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_evaluation/nodes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import logging
 4 | from typing import TYPE_CHECKING
 5 | 
 6 | from sklearn.metrics import accuracy_score
 7 | from sklearn.pipeline import Pipeline
 8 | 
 9 | if TYPE_CHECKING:
10 |     import ibis.expr.types as ir
11 | 
12 | 
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | def evaluate_model(
17 |     pipe: Pipeline, X_test: ir.Table, y_test: ir.Column
18 | ):
19 |     """Calculates and logs the coefficient of determination.
20 | 
21 |     Args:
22 |         pipe: Trained model.
23 |         X_test: Testing data of independent features.
24 |         y_test: Testing data for price.
25 |     """
26 |     y_pred = pipe.predict(X_test)
27 |     score = accuracy_score(y_test, y_pred)
28 |     logger.info("Model has an accuracy of %.3f on test data.", score)
29 | 


--------------------------------------------------------------------------------
/.devcontainer/compose.yaml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   kedro-ibis-tutorial:
 3 |     build:
 4 |       context: ..
 5 |       dockerfile: .devcontainer/Dockerfile
 6 |     volumes:
 7 |       - ../..:/workspaces:cached
 8 |     command: sleep infinity
 9 |     network_mode: service:postgres
10 |     environment:
11 |       # elephant-shaped turtles all the way down
12 |       PGPASSWORD: postgres
13 |       PGHOST: postgres
14 |       PGUSER: postgres
15 |       PGDATABASE: postgres
16 |   postgres:
17 |     restart: unless-stopped
18 |     environment:
19 |       POSTGRES_PASSWORD: postgres
20 |       POSTGRES_DB: postgres
21 |       POSTGRES_USER: postgres
22 |     image: postgres:15
23 |     healthcheck:
24 |       interval: 1s
25 |       retries: 20
26 |       test:
27 |         - CMD
28 |         - pg_isready
29 |     volumes:
30 |       - postgres:/var/lib/postgresql/data
31 | 
32 | volumes:
33 |   postgres:
34 | 


--------------------------------------------------------------------------------
/sql/create_nycflights13.sql:
--------------------------------------------------------------------------------
 1 | DROP TABLE IF EXISTS flights CASCADE;
 2 | CREATE TABLE flights (
 3 |     "year" BIGINT,
 4 |     "month" BIGINT,
 5 |     "day" BIGINT,
 6 |     "dep_time" VARCHAR,
 7 |     "sched_dep_time" BIGINT,
 8 |     "dep_delay" VARCHAR,
 9 |     "arr_time" VARCHAR,
10 |     "sched_arr_time" BIGINT,
11 |     "arr_delay" VARCHAR,
12 |     "carrier" VARCHAR,
13 |     "flight" BIGINT,
14 |     "tailnum" VARCHAR,
15 |     "origin" VARCHAR,
16 |     "dest" VARCHAR,
17 |     "air_time" VARCHAR,
18 |     "distance" BIGINT,
19 |     "hour" BIGINT,
20 |     "minute" BIGINT,
21 |     "time_hour" TIMESTAMP(6)
22 | );
23 | 
24 | DROP TABLE IF EXISTS weather CASCADE;
25 | CREATE TABLE weather (
26 |     "origin" VARCHAR,
27 |     "year" BIGINT,
28 |     "month" BIGINT,
29 |     "day" BIGINT,
30 |     "hour" BIGINT,
31 |     "temp" VARCHAR,
32 |     "dewp" VARCHAR,
33 |     "humid" VARCHAR,
34 |     "wind_dir" VARCHAR,
35 |     "wind_speed" VARCHAR,
36 |     "wind_gust" VARCHAR,
37 |     "precip" DOUBLE PRECISION,
38 |     "pressure" VARCHAR,
39 |     "visib" DOUBLE PRECISION,
40 |     "time_hour" TIMESTAMP(6)
41 | );
42 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/README.md:
--------------------------------------------------------------------------------
 1 | # What is this for?
 2 | 
 3 | This folder should be used to store configuration files used by Kedro or by separate tools.
 4 | 
 5 | This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own section in the section titled **Instructions**.
 6 | 
 7 | ## Local configuration
 8 | 
 9 | The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys).
10 | 
11 | > *Note:* Please do not check in any local configuration to version control.
12 | 
13 | ## Base configuration
14 | 
15 | The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members.
16 | 
17 | WARNING: Please do not put access credentials in the base configuration folder.
18 | 
19 | ## Find out more
20 | You can find out more about configuration from the [user guide documentation](https://docs.kedro.org/en/stable/configuration/configuration_basics.html).
21 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/tests/test_run.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module contains an example test.
 3 | 
 4 | Tests should be placed in ``src/tests``, in modules that mirror your
 5 | project's structure, and in files named test_*.py. They are simply functions
 6 | named ``test_*`` which test a unit of logic.
 7 | """
 8 | from pathlib import Path
 9 | 
10 | import pytest
11 | from kedro.config import OmegaConfigLoader
12 | from kedro.framework.context import KedroContext
13 | from kedro.framework.hooks import _create_hook_manager
14 | 
15 | 
16 | @pytest.fixture
17 | def config_loader():
18 |     return OmegaConfigLoader(conf_source=str(Path.cwd()))
19 | 
20 | 
21 | @pytest.fixture
22 | def project_context(config_loader):
23 |     return KedroContext(
24 |         package_name="delay_prediction",
25 |         project_path=Path.cwd(),
26 |         config_loader=config_loader,
27 |         hook_manager=_create_hook_manager(),
28 |     )
29 | 
30 | 
31 | # The tests below are here for the demonstration purpose
32 | # and should be replaced with the ones testing the project
33 | # functionality
34 | class TestProjectContext:
35 |     def test_project_path(self, project_context):
36 |         assert project_context.project_path == Path.cwd()
37 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/logging.yml:
--------------------------------------------------------------------------------
 1 | # To enable this custom logging configuration, set KEDRO_LOGGING_CONFIG to the path of this file.
 2 | # More information available at https://docs.kedro.org/en/stable/logging/logging.html
 3 | version: 1
 4 | 
 5 | disable_existing_loggers: False
 6 | 
 7 | formatters:
 8 |   simple:
 9 |     format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
10 | 
11 | handlers:
12 |   console:
13 |     class: logging.StreamHandler
14 |     level: INFO
15 |     formatter: simple
16 |     stream: ext://sys.stdout
17 | 
18 |   info_file_handler:
19 |     class: logging.handlers.RotatingFileHandler
20 |     level: INFO
21 |     formatter: simple
22 |     filename: info.log
23 |     maxBytes: 10485760 # 10MB
24 |     backupCount: 20
25 |     encoding: utf8
26 |     delay: True
27 | 
28 |   rich:
29 |     class: kedro.logging.RichHandler
30 |     rich_tracebacks: True
31 |     # Advance options for customisation.
32 |     # See https://docs.kedro.org/en/stable/logging/logging.html#project-side-logging-configuration
33 |     # tracebacks_show_locals: False
34 | 
35 | loggers:
36 |   kedro:
37 |     level: INFO
38 | 
39 |   delay_prediction:
40 |     level: INFO
41 | 
42 | root:
43 |   handlers: [rich, info_file_handler]
44 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Kedro-Ibis Tutorial
 2 | 
 3 | This tutorial is designed to be run via GitHub codespaces. If you do not have a GitHub account, please create one before the tutorial (https://github.com/join). Other than that, just bring yourself and a laptop with a web browser, and everything else should be good to go!
 4 | 
 5 | ## Codespace setup
 6 | 
 7 | First, create a codespace for the repository. Click the **<> Code** button, then click the **Codespaces** tab. Select **Create codespace on main**. Once created, select **Open in browser**.
 8 | 
 9 | ## References
10 | 
11 | Kedro is an open-source Python framework for creating reproducible, maintainable, and modular data science and engineering code. It is an incubation-stage project of the LF AI & Data Foundation. To learn more about Kedro, visit the [Kedro website](https://kedro.org/) and [join our community on Slack](https://slack.kedro.org/).
12 | 
13 | Ibis is an open-source Python dataframe library that works with any data system. Visit the [Ibis project website](https://ibis-project.org/) to learn more, or [join our community on Zulip](https://ibis-project.zulipchat.com/).
14 | 
15 | This tutorial also uses IbisML, a new library for building scalable ML pipelines using Ibis. More information about IbisML can be found on the [IbisML website](https://ibis-project.github.io/ibis-ml/). There is a [dedicated IbisML stream on Zulip](https://ibis-project.zulipchat.com/#narrow/stream/426262-ibis-ml).
16 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "dockerComposeFile": "compose.yaml",
 3 |   "service": "kedro-ibis-tutorial",
 4 |   "runServices": ["postgres"],
 5 |   "forwardPorts": ["postgres:5432"],
 6 |   "workspaceFolder": "/workspaces/${localWorkspaceFolderBasename}",
 7 |   "customizations": {
 8 |     "codespaces": {
 9 |       "openFiles": ["00 - Welcome.ipynb"]
10 |     },
11 |     "vscode": {
12 |       "settings": {
13 |         "extensions.ignoreRecommendations": true,
14 |         "notifications.hideList": true,
15 |         "notifications.hideToasts": true,
16 |         "notifications.doNotDisturbMode": true,
17 |         "update.showReleaseNotes": false,
18 |         "vsicons.dontShowNewVersionMessage": true,
19 |         "workbench.welcomePage.walkthroughs.openOnInstall": false,
20 |         "python.defaultInterpreterPath": "/usr/local/bin/python",
21 |         "jupyter.kernels.excludePythonEnvironments": [
22 |           "/usr/bin/python3",
23 |           "/bin/python3"
24 |         ]
25 |       },
26 |       "extensions": [
27 |         "ms-toolsai.jupyter",
28 |         "ms-python.python",
29 |         "quarto.quarto",
30 |         "donjayamanne.vscode-default-python-kernel"
31 |       ]
32 |     }
33 |   },
34 |   "features": {
35 |     "ghcr.io/eitsupi/devcontainer-features/duckdb-cli:1": {
36 |       "extensions": "httpfs,sqlite,postgres,parquet,json,arrow",
37 |       "version": "0.10.2"
38 |     },
39 |     "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {
40 |       "version": "1.5.13"
41 |     }
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/__main__.py:
--------------------------------------------------------------------------------
 1 | """Delay Prediction file for ensuring the package is executable
 2 | as `delay-prediction` and `python -m delay_prediction`
 3 | """
 4 | import importlib
 5 | from pathlib import Path
 6 | 
 7 | from kedro.framework.cli.utils import KedroCliError, load_entry_points
 8 | from kedro.framework.project import configure_project
 9 | 
10 | 
11 | def _find_run_command(package_name):
12 |     try:
13 |         project_cli = importlib.import_module(f"{package_name}.cli")
14 |         # fail gracefully if cli.py does not exist
15 |     except ModuleNotFoundError as exc:
16 |         if f"{package_name}.cli" not in str(exc):
17 |             raise
18 |         plugins = load_entry_points("project")
19 |         run = _find_run_command_in_plugins(plugins) if plugins else None
20 |         if run:
21 |             # use run command from installed plugin if it exists
22 |             return run
23 |         # use run command from the framework project
24 |         from kedro.framework.cli.project import run
25 | 
26 |         return run
27 |     # fail badly if cli.py exists, but has no `cli` in it
28 |     if not hasattr(project_cli, "cli"):
29 |         raise KedroCliError(f"Cannot load commands from {package_name}.cli")
30 |     return project_cli.run
31 | 
32 | 
33 | def _find_run_command_in_plugins(plugins):
34 |     for group in plugins:
35 |         if "run" in group.commands:
36 |             return group.commands["run"]
37 | 
38 | 
39 | def main(*args, **kwargs):
40 |     package_name = Path(__file__).parent.name
41 |     configure_project(package_name)
42 |     run = _find_run_command(package_name)
43 |     run(*args, **kwargs)
44 | 
45 | 
46 | if __name__ == "__main__":
47 |     main()
48 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "delay_prediction"
 7 | readme = "README.md"
 8 | dynamic = ["dependencies", "version"]
 9 | 
10 | [project.scripts]
11 | delay-prediction = "delay_prediction.__main__:main"
12 | 
13 | [project.entry-points."kedro.hooks"]
14 | 
15 | [project.optional-dependencies]
16 | docs = [
17 |     "docutils<0.18.0",
18 |     "sphinx~=3.4.3",
19 |     "sphinx_rtd_theme==0.5.1",
20 |     "nbsphinx==0.8.1",
21 |     "sphinx-autodoc-typehints==1.11.1",
22 |     "sphinx_copybutton==0.3.1",
23 |     "ipykernel>=5.3, <7.0",
24 |     "Jinja2<3.1.0",
25 |     "myst-parser~=0.17.2",
26 | ]
27 | 
28 | [tool.setuptools.dynamic]
29 | dependencies = {file = "requirements.txt"}
30 | version = {attr = "delay_prediction.__version__"}
31 | 
32 | [tool.setuptools.packages.find]
33 | where = ["src"]
34 | namespaces = false
35 | 
36 | [tool.kedro]
37 | package_name = "delay_prediction"
38 | project_name = "Delay Prediction"
39 | kedro_init_version = "0.19.6"
40 | tools = ['None']
41 | example_pipeline = "False"
42 | source_dir = "src"
43 | 
44 | [tool.pytest.ini_options]
45 | addopts = """
46 | --cov-report term-missing \
47 | --cov src/delay_prediction -ra"""
48 | 
49 | [tool.coverage.report]
50 | fail_under = 0
51 | show_missing = true
52 | exclude_lines = ["pragma: no cover", "raise NotImplementedError"]
53 | 
54 | [tool.ruff]
55 | line-length = 88
56 | show-fixes = true
57 | 
58 | [tool.ruff.format]
59 | docstring-code-format = true
60 | 
61 | [tool.ruff.lint]
62 | select = [
63 |     "F",   # Pyflakes
64 |     "W",   # pycodestyle
65 |     "E",   # pycodestyle
66 |     "I",   # isort
67 |     "UP",  # pyupgrade
68 |     "PL",  # Pylint
69 |     "T201", # Print Statement
70 | ]
71 | ignore = ["E501"]  # Ruff format takes care of line-too-long
72 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/settings.py:
--------------------------------------------------------------------------------
 1 | """Project settings. There is no need to edit this file unless you want to change values
 2 | from the Kedro defaults. For further information, including these default values, see
 3 | https://docs.kedro.org/en/stable/kedro_project_setup/settings.html."""
 4 | 
 5 | # Instantiated project hooks.
 6 | # For example, after creating a hooks.py and defining a ProjectHooks class there, do
 7 | # from delay_prediction.hooks import ProjectHooks
 8 | 
 9 | # Hooks are executed in a Last-In-First-Out (LIFO) order.
10 | # HOOKS = (ProjectHooks(),)
11 | 
12 | # Installed plugins for which to disable hook auto-registration.
13 | # DISABLE_HOOKS_FOR_PLUGINS = ("kedro-viz",)
14 | 
15 | # Class that manages storing KedroSession data.
16 | # from kedro.framework.session.store import BaseSessionStore
17 | # SESSION_STORE_CLASS = BaseSessionStore
18 | # Keyword arguments to pass to the `SESSION_STORE_CLASS` constructor.
19 | # SESSION_STORE_ARGS = {
20 | #     "path": "./sessions"
21 | # }
22 | 
23 | # Directory that holds configuration.
24 | # CONF_SOURCE = "conf"
25 | 
26 | # Class that manages how configuration is loaded.
27 | from kedro.config import OmegaConfigLoader  # noqa: E402
28 | 
29 | CONFIG_LOADER_CLASS = OmegaConfigLoader
30 | # Keyword arguments to pass to the `CONFIG_LOADER_CLASS` constructor.
31 | CONFIG_LOADER_ARGS = {
32 |     "base_env": "base",
33 |     "default_run_env": "local",
34 |     # "config_patterns": {
35 |     #     "spark" : ["spark*/"],
36 |     #     "parameters": ["parameters*", "parameters*/**", "**/parameters*"],
37 |     # }
38 | }
39 | 
40 | # Class that manages Kedro's library components.
41 | # from kedro.framework.context import KedroContext
42 | # CONTEXT_CLASS = KedroContext
43 | 
44 | # Class that manages the Data Catalog.
45 | # from kedro.io import DataCatalog
46 | # DATA_CATALOG_CLASS = DataCatalog
47 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/data_processing/nodes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from typing import TYPE_CHECKING
 4 | 
 5 | import ibis.selectors as s
 6 | from ibis import _
 7 | 
 8 | if TYPE_CHECKING:
 9 |     import ibis.expr.types as ir
10 | 
11 | 
12 | def _replace_na_values(t: ir.Table) -> ir.Table:
13 |     return t.mutate(s.across(s.of_type("string"), _.nullif("NA")))
14 | 
15 | 
16 | def preprocess_flights(flights: ir.Table) -> ir.Table:
17 |     """Preprocesses the data for flights.
18 | 
19 |     Args:
20 |         flights: Raw data.
21 |     Returns:
22 |         Preprocessed data, with `dep_time` converted to a time and
23 |         `arr_delay` and `air_time` converted to integers.
24 |     """
25 |     return _replace_na_values(flights).mutate(
26 |         dep_time=(
27 |             _.dep_time.lpad(4, "0").substr(0, 2)
28 |             + ":"
29 |             + _.dep_time.substr(-2, 2)
30 |             + ":00"
31 |         ).cast("time"),
32 |         arr_delay=_.arr_delay.cast(int),
33 |         air_time=_.air_time.cast(int),
34 |     )
35 | 
36 | 
37 | def create_model_input_table(flights: ir.Table, weather: ir.Table) -> ir.Table:
38 |     """Combines all data to create a model input table.
39 | 
40 |     Args:
41 |         flights: Preprocessed data for flights.
42 |         weather: Raw data for weather.
43 |     Returns:
44 |         Model input table.
45 |     """
46 |     return (
47 |         flights.mutate(
48 |             # Convert the arrival delay to a factor
49 |             arr_delay=flights.arr_delay >= 30,
50 |             # We will use the date (not date-time) in the recipe below
51 |             date=flights.time_hour.date(),
52 |         )
53 |         # Include the weather data
54 |         .inner_join(weather, ["origin", "time_hour"])
55 |         # Only retain the specific columns we will use
56 |         .select(
57 |             "dep_time",
58 |             "flight",
59 |             "origin",
60 |             "dest",
61 |             "air_time",
62 |             "distance",
63 |             "carrier",
64 |             "date",
65 |             "arr_delay",
66 |             "time_hour",
67 |         )
68 |         # Exclude missing data
69 |         .dropna()
70 |     )
71 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/tests/pipelines/data_science/test_pipeline.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import pandas as pd
 3 | import pytest
 4 | from kedro.io import DataCatalog
 5 | from kedro.runner import SequentialRunner
 6 | from delay_prediction.pipelines.data_science import create_pipeline as create_ds_pipeline
 7 | from delay_prediction.pipelines.data_science.nodes import split_data
 8 | 
 9 | @pytest.fixture
10 | def dummy_data():
11 |     return pd.DataFrame(
12 |         {
13 |             "engines": [1, 2, 3],
14 |             "crew": [4, 5, 6],
15 |             "passenger_capacity": [5, 6, 7],
16 |             "price": [120, 290, 30],
17 |         }
18 |     )
19 | 
20 | @pytest.fixture
21 | def dummy_parameters():
22 |     parameters = {
23 |         "model_options": {
24 |             "test_size": 0.2,
25 |             "random_state": 3,
26 |             "features": ["engines", "passenger_capacity", "crew"],
27 |         }
28 |     }
29 |     return parameters
30 | 
31 | 
32 | def test_split_data(dummy_data, dummy_parameters):
33 |     X_train, X_test, y_train, y_test = split_data(
34 |         dummy_data, dummy_parameters["model_options"]
35 |     )
36 |     assert len(X_train) == 2
37 |     assert len(y_train) == 2
38 |     assert len(X_test) == 1
39 |     assert len(y_test) == 1
40 | 
41 | def test_split_data_missing_price(dummy_data, dummy_parameters):
42 |     dummy_data_missing_price = dummy_data.drop(columns="price")
43 |     with pytest.raises(KeyError) as e_info:
44 |         X_train, X_test, y_train, y_test = split_data(dummy_data_missing_price, dummy_parameters["model_options"])
45 | 
46 |     assert "price" in str(e_info.value)
47 | 
48 | def test_data_science_pipeline(caplog, dummy_data, dummy_parameters):
49 |     pipeline = (
50 |         create_ds_pipeline()
51 |         .from_nodes("split_data_node")
52 |         .to_nodes("evaluate_model_node")
53 |     )
54 |     catalog = DataCatalog()
55 |     catalog.add_feed_dict(
56 |         {
57 |             "model_input_table" : dummy_data,
58 |             "params:model_options": dummy_parameters["model_options"],
59 |         }
60 |     )
61 | 
62 |     caplog.set_level(logging.DEBUG, logger="kedro")
63 |     successful_run_msg = "Pipeline execution completed successfully."
64 | 
65 |     SequentialRunner().run(pipeline, catalog)
66 | 
67 |     assert successful_run_msg in caplog.text


--------------------------------------------------------------------------------
/demo/delay-prediction/conf/base/catalog.yml:
--------------------------------------------------------------------------------
 1 | # Here you can define all your data sets by using simple YAML syntax.
 2 | #
 3 | # Documentation for this file format can be found in "The Data Catalog"
 4 | # Link: https://docs.kedro.org/en/stable/data/data_catalog.html
 5 | #
 6 | # We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
 7 | #
 8 | # An example data set definition can look as follows:
 9 | #
10 | #bikes:
11 | #  type: pandas.CSVDataset
12 | #  filepath: "data/01_raw/bikes.csv"
13 | #
14 | #weather:
15 | #  type: spark.SparkDataset
16 | #  filepath: s3a://your_bucket/data/01_raw/weather*
17 | #  file_format: csv
18 | #  credentials: dev_s3
19 | #  load_args:
20 | #    header: True
21 | #    inferSchema: True
22 | #  save_args:
23 | #    sep: '|'
24 | #    header: True
25 | #
26 | #scooters:
27 | #  type: pandas.SQLTableDataset
28 | #  credentials: scooters_credentials
29 | #  table_name: scooters
30 | #  load_args:
31 | #    index_col: ['name']
32 | #    columns: ['name', 'gear']
33 | #  save_args:
34 | #    if_exists: 'replace'
35 | #    # if_exists: 'fail'
36 | #    # if_exists: 'append'
37 | #
38 | # The Data Catalog supports being able to reference the same file using two different Dataset implementations
39 | # (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
40 | # https://docs.kedro.org/en/stable/data/data_catalog.html
41 | 
42 | _root_folder: /workspaces/kedro-ibis-tutorial
43 | 
44 | _connection:
45 |   backend: duckdb
46 |   database: "${_root_folder}/nycflights13.ddb"
47 | 
48 | flights:
49 |   type: ibis.TableDataset
50 |   table_name: flights
51 |   connection: ${_connection}
52 | 
53 | weather:
54 |   type: ibis.TableDataset
55 |   table_name: weather
56 |   connection: ${_connection}
57 | 
58 | preprocessed_flights:
59 |   type: ibis.TableDataset
60 |   table_name: preprocessed_flights
61 |   connection: ${_connection}
62 | 
63 | model_input_table:
64 |   type: ibis.TableDataset
65 |   table_name: model_input_table
66 |   connection: ${_connection}
67 |   save_args:
68 |     materialized: table
69 | 
70 | classifier:
71 |   type: pickle.PickleDataset
72 |   filepath: data/06_models/classifier.pickle
73 |   versioned: true
74 | 
75 | X_test:
76 |   type: ibis.TableDataset
77 |   table_name: X_test
78 |   connection: ${_connection}
79 | 
80 | y_test:
81 |   type: ibis.TableDataset
82 |   table_name: y_test
83 |   connection: ${_connection}
84 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/.gitignore:
--------------------------------------------------------------------------------
  1 | ##########################
  2 | # KEDRO PROJECT
  3 | 
  4 | # ignore all local configuration
  5 | conf/local/**
  6 | !conf/local/.gitkeep
  7 | 
  8 | # ignore potentially sensitive credentials files
  9 | conf/**/*credentials*
 10 | 
 11 | # ignore everything in the following folders
 12 | data/**
 13 | 
 14 | # except their sub-folders
 15 | !data/**/
 16 | 
 17 | # also keep all .gitkeep files
 18 | !.gitkeep
 19 | 
 20 | # keep also the example dataset
 21 | !data/01_raw/*
 22 | 
 23 | 
 24 | ##########################
 25 | # Common files
 26 | 
 27 | # IntelliJ
 28 | .idea/
 29 | *.iml
 30 | out/
 31 | .idea_modules/
 32 | 
 33 | ### macOS
 34 | *.DS_Store
 35 | .AppleDouble
 36 | .LSOverride
 37 | .Trashes
 38 | 
 39 | # Vim
 40 | *~
 41 | .*.swo
 42 | .*.swp
 43 | 
 44 | # emacs
 45 | *~
 46 | \#*\#
 47 | /.emacs.desktop
 48 | /.emacs.desktop.lock
 49 | *.elc
 50 | 
 51 | # JIRA plugin
 52 | atlassian-ide-plugin.xml
 53 | 
 54 | # C extensions
 55 | *.so
 56 | 
 57 | ### Python template
 58 | # Byte-compiled / optimized / DLL files
 59 | __pycache__/
 60 | *.py[cod]
 61 | *$py.class
 62 | 
 63 | # Distribution / packaging
 64 | .Python
 65 | build/
 66 | develop-eggs/
 67 | dist/
 68 | downloads/
 69 | eggs/
 70 | .eggs/
 71 | lib/
 72 | lib64/
 73 | parts/
 74 | sdist/
 75 | var/
 76 | wheels/
 77 | *.egg-info/
 78 | .installed.cfg
 79 | *.egg
 80 | MANIFEST
 81 | 
 82 | # PyInstaller
 83 | #  Usually these files are written by a python script from a template
 84 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 85 | *.manifest
 86 | *.spec
 87 | 
 88 | # Installer logs
 89 | pip-log.txt
 90 | pip-delete-this-directory.txt
 91 | 
 92 | # Unit test / coverage reports
 93 | htmlcov/
 94 | .tox/
 95 | .coverage
 96 | .coverage.*
 97 | .cache
 98 | nosetests.xml
 99 | coverage.xml
100 | *.cover
101 | .hypothesis/
102 | 
103 | # Translations
104 | *.mo
105 | *.pot
106 | 
107 | # Django stuff:
108 | *.log
109 | .static_storage/
110 | .media/
111 | local_settings.py
112 | 
113 | # Flask stuff:
114 | instance/
115 | .webassets-cache
116 | 
117 | # Scrapy stuff:
118 | .scrapy
119 | 
120 | # Sphinx documentation
121 | docs/_build/
122 | 
123 | # PyBuilder
124 | target/
125 | 
126 | # Jupyter Notebook
127 | .ipynb_checkpoints
128 | 
129 | # pyenv
130 | .python-version
131 | 
132 | # celery beat schedule file
133 | celerybeat-schedule
134 | 
135 | # SageMath parsed files
136 | *.sage.py
137 | 
138 | # Environments
139 | .env
140 | .venv
141 | env/
142 | venv/
143 | ENV/
144 | env.bak/
145 | venv.bak/
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/src/delay_prediction/pipelines/model_training/nodes.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | import random
 4 | from typing import TYPE_CHECKING
 5 | 
 6 | import ibis
 7 | import ibis_ml as ml
 8 | from sklearn.linear_model import LogisticRegression
 9 | from sklearn.pipeline import Pipeline
10 | 
11 | if TYPE_CHECKING:
12 |     import ibis.expr.types as ir
13 | 
14 | 
15 | def split_data(
16 |     flight_data: ir.Table,
17 |     random_state: int = 42,
18 | ) -> tuple[ir.Table, ir.Column, ir.Table, ir.Column]:
19 |     """Splits data into training and test sets.
20 | 
21 |     Args:
22 |         data: Data containing features and target.
23 |     Returns:
24 |         Split data.
25 |     """
26 |     flight_data_with_unique_key = flight_data.mutate(
27 |         unique_key=ibis.literal(",").join(
28 |             [flight_data.carrier, flight_data.flight.cast(str), flight_data.date.cast(str)]
29 |         )
30 |     )
31 | 
32 |     # Fix the random numbers by setting the seed
33 |     # This enables the analysis to be reproducible when random numbers are used
34 |     random.seed(random_state)
35 | 
36 |     # Put 3/4 of the data into the training set
37 |     random_key = str(random.getrandbits(256))
38 |     data_split = flight_data_with_unique_key.mutate(
39 |         train=(flight_data_with_unique_key.unique_key + random_key).hash().abs() % 4 < 3
40 |     )
41 | 
42 |     # Create data frames for the two sets:
43 |     train_data = data_split[data_split.train].drop("unique_key", "train")
44 |     test_data = data_split[~data_split.train].drop("unique_key", "train")
45 | 
46 |     X_train = train_data.drop("arr_delay")
47 |     X_test = test_data.drop("arr_delay")
48 |     y_train = train_data.arr_delay
49 |     y_test = test_data.arr_delay
50 |     return X_train, X_test, y_train, y_test
51 | 
52 | 
53 | def train_model(X_train: ir.Table, y_train: ir.Column) -> Pipeline:
54 |     """Trains the logistic regression model.
55 | 
56 |     Args:
57 |         X_train: Training data of independent features.
58 |         y_train: Training data for whether a plane arrived more than 30
59 |             minutes late.
60 | 
61 |     Returns:
62 |         Trained model.
63 |     """
64 |     flights_rec = ml.Recipe(
65 |         ml.ExpandDate("date", components=["dow", "month"]),
66 |         ml.Drop("date"),
67 |         ml.TargetEncode(ml.nominal()),
68 |         ml.DropZeroVariance(ml.everything()),
69 |         ml.MutateAt("dep_time", ibis._.hour() * 60 + ibis._.minute()),
70 |         ml.MutateAt(ml.timestamp(), ibis._.epoch_seconds()),
71 |         # By default, PyTorch requires that the type of `X` is `np.float32`.
72 |         # https://discuss.pytorch.org/t/mat1-and-mat2-must-have-the-same-dtype-but-got-double-and-float/197555/2
73 |         ml.Cast(ml.numeric(), "float32"),
74 |     )
75 |     pipe = Pipeline([("flights_rec", flights_rec), ("lr_mod", LogisticRegression())])
76 |     pipe.fit(X_train, y_train)
77 |     return pipe
78 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/README.md:
--------------------------------------------------------------------------------
 1 | # Delay Prediction
 2 | 
 3 | ## Overview
 4 | 
 5 | This is your new Kedro project, which was generated using `kedro 0.19.6`.
 6 | 
 7 | Take a look at the [Kedro documentation](https://docs.kedro.org) to get started.
 8 | 
 9 | ## Rules and guidelines
10 | 
11 | In order to get the best out of the template:
12 | 
13 | * Don't remove any lines from the `.gitignore` file we provide
14 | * Make sure your results can be reproduced by following a [data engineering convention](https://docs.kedro.org/en/stable/faq/faq.html#what-is-data-engineering-convention)
15 | * Don't commit data to your repository
16 | * Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/`
17 | 
18 | ## How to install dependencies
19 | 
20 | Declare any dependencies in `requirements.txt` for `pip` installation.
21 | 
22 | To install them, run:
23 | 
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | ## How to run your Kedro pipeline
29 | 
30 | You can run your Kedro project with:
31 | 
32 | ```
33 | kedro run
34 | ```
35 | 
36 | ## How to test your Kedro project
37 | 
38 | Have a look at the files `src/tests/test_run.py` and `src/tests/pipelines/data_science/test_pipeline.py` for instructions on how to write your tests. Run the tests as follows:
39 | 
40 | ```
41 | pytest
42 | ```
43 | 
44 | To configure the coverage threshold, look at the `.coveragerc` file.
45 | 
46 | ## Project dependencies
47 | 
48 | To see and update the dependency requirements for your project use `requirements.txt`. You can install the project requirements with `pip install -r requirements.txt`.
49 | 
50 | [Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)
51 | 
52 | ## How to work with Kedro and notebooks
53 | 
54 | > Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `catalog`, `context`, `pipelines` and `session`.
55 | >
56 | > Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r requirements.txt` you will not need to take any extra steps before you use them.
57 | 
58 | ### Jupyter
59 | To use Jupyter notebooks in your Kedro project, you need to install Jupyter:
60 | 
61 | ```
62 | pip install jupyter
63 | ```
64 | 
65 | After installing Jupyter, you can start a local notebook server:
66 | 
67 | ```
68 | kedro jupyter notebook
69 | ```
70 | 
71 | ### JupyterLab
72 | To use JupyterLab, you need to install it:
73 | 
74 | ```
75 | pip install jupyterlab
76 | ```
77 | 
78 | You can also start JupyterLab:
79 | 
80 | ```
81 | kedro jupyter lab
82 | ```
83 | 
84 | ### IPython
85 | And if you want to run an IPython session:
86 | 
87 | ```
88 | kedro ipython
89 | ```
90 | 
91 | ### How to ignore notebook output cells in `git`
92 | To automatically strip out all output cell contents before committing to `git`, you can use tools like [`nbstripout`](https://github.com/kynan/nbstripout). For example, you can add a hook in `.git/config` with `nbstripout --install`. This will run `nbstripout` before anything is committed to `git`.
93 | 
94 | > *Note:* Your output cells will be retained locally.
95 | 
96 | ## Package your Kedro project
97 | 
98 | [Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html)
99 | 


--------------------------------------------------------------------------------
/02 - Switching Backends.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {},
  6 |       "source": [
  7 |         "# Switching Backends\n",
  8 |         "\n",
  9 |         "One use case for Ibis's portable API is the ability to develop a query locally (using DuckDB, for example) on a subset of the data, then rerun that same query on the full dataset (using BigQuery, for example) without rewriting your code.\n",
 10 |         "\n",
 11 |         "In this notebook, we'll run some of the queries we developed in the previous notebook, using the Postgres database we populated in the Welcome notebook."
 12 |       ]
 13 |     },
 14 |     {
 15 |       "cell_type": "code",
 16 |       "execution_count": null,
 17 |       "metadata": {},
 18 |       "outputs": [],
 19 |       "source": [
 20 |         "import ibis\n",
 21 |         "\n",
 22 |         "ibis.options.interactive = True"
 23 |       ]
 24 |     },
 25 |     {
 26 |       "cell_type": "markdown",
 27 |       "metadata": {},
 28 |       "source": [
 29 |         "Create a connection, just as we did with DuckDB..."
 30 |       ]
 31 |     },
 32 |     {
 33 |       "cell_type": "code",
 34 |       "execution_count": null,
 35 |       "metadata": {},
 36 |       "outputs": [],
 37 |       "source": [
 38 |         "con = ibis.postgres.connect()"
 39 |       ]
 40 |     },
 41 |     {
 42 |       "cell_type": "markdown",
 43 |       "metadata": {},
 44 |       "source": [
 45 |         "...grab a reference to the flights table..."
 46 |       ]
 47 |     },
 48 |     {
 49 |       "cell_type": "code",
 50 |       "execution_count": null,
 51 |       "metadata": {},
 52 |       "outputs": [],
 53 |       "source": [
 54 |         "flights = con.table(\"flights\")"
 55 |       ]
 56 |     },
 57 |     {
 58 |       "cell_type": "markdown",
 59 |       "metadata": {},
 60 |       "source": [
 61 |         "...and copy-paste some queries!"
 62 |       ]
 63 |     },
 64 |     {
 65 |       "cell_type": "code",
 66 |       "execution_count": null,
 67 |       "metadata": {},
 68 |       "outputs": [],
 69 |       "source": [
 70 |         "flights.order_by(ibis.desc(\"distance\")).select(\"carrier\", \"origin\", \"dest\", \"distance\")"
 71 |       ]
 72 |     },
 73 |     {
 74 |       "cell_type": "code",
 75 |       "execution_count": null,
 76 |       "metadata": {},
 77 |       "outputs": [],
 78 |       "source": [
 79 |         "flights.group_by([\"carrier\", \"origin\"]).agg(\n",
 80 |         "    [flights.distance.mean().cast(\"float32\"), flights.air_time.min()]\n",
 81 |         ")"
 82 |       ]
 83 |     },
 84 |     {
 85 |       "cell_type": "markdown",
 86 |       "metadata": {},
 87 |       "source": [
 88 |         "Later on in the tutorial, we'll see more practical examples of running the same Ibis code against multiple backends."
 89 |       ]
 90 |     }
 91 |   ],
 92 |   "metadata": {
 93 |     "kernelspec": {
 94 |       "display_name": "Python 3 (ipykernel)",
 95 |       "language": "python",
 96 |       "name": "python3"
 97 |     },
 98 |     "language_info": {
 99 |       "codemirror_mode": {
100 |         "name": "ipython",
101 |         "version": 3
102 |       },
103 |       "file_extension": ".py",
104 |       "mimetype": "text/x-python",
105 |       "name": "python",
106 |       "nbconvert_exporter": "python",
107 |       "pygments_lexer": "ipython3",
108 |       "version": "3.11.9"
109 |     }
110 |   },
111 |   "nbformat": 4,
112 |   "nbformat_minor": 4
113 | }
114 | 


--------------------------------------------------------------------------------
/demo/delay-prediction/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | 
  5 | # delay_prediction documentation build
  6 | # configuration file, created by sphinx-quickstart.
  7 | #
  8 | # This file is execfile()d with the current directory set to its
  9 | # containing dir.
 10 | #
 11 | # Note that not all possible configuration values are present in this
 12 | # autogenerated file.
 13 | #
 14 | # All configuration values have a default; values that are commented out
 15 | # serve to show the default.
 16 | 
 17 | # If extensions (or modules to document with autodoc) are in another directory,
 18 | # add these directories to sys.path here. If the directory is relative to the
 19 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 20 | #
 21 | import re
 22 | 
 23 | from kedro.framework.cli.utils import find_stylesheets
 24 | from delay_prediction import __version__ as release
 25 | 
 26 | # -- Project information -----------------------------------------------------
 27 | 
 28 | project = "delay_prediction"
 29 | author = "Kedro"
 30 | 
 31 | # The short X.Y version.
 32 | version = re.match(r"^([0-9]+\.[0-9]+).*", release).group(1)
 33 | 
 34 | # -- General configuration ---------------------------------------------------
 35 | 
 36 | # If your documentation needs a minimal Sphinx version, state it here.
 37 | #
 38 | # needs_sphinx = '1.0'
 39 | 
 40 | # Add any Sphinx extension module names here, as strings. They can be
 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 42 | # ones.
 43 | extensions = [
 44 |     "sphinx.ext.autodoc",
 45 |     "sphinx.ext.napoleon",
 46 |     "sphinx_autodoc_typehints",
 47 |     "sphinx.ext.doctest",
 48 |     "sphinx.ext.todo",
 49 |     "sphinx.ext.coverage",
 50 |     "sphinx.ext.ifconfig",
 51 |     "sphinx.ext.viewcode",
 52 |     "sphinx.ext.mathjax",
 53 |     "nbsphinx",
 54 |     "sphinx_copybutton",
 55 |     "myst_parser",
 56 | ]
 57 | 
 58 | # enable autosummary plugin (table of contents for modules/classes/class
 59 | # methods)
 60 | autosummary_generate = True
 61 | 
 62 | # Add any paths that contain templates here, relative to this directory.
 63 | templates_path = ["_templates"]
 64 | 
 65 | # The suffix(es) of source filenames.
 66 | # You can specify multiple suffix as a list of string:
 67 | #
 68 | source_suffix = {".rst": "restructuredtext", ".md": "markdown"}
 69 | 
 70 | # The master toctree document.
 71 | master_doc = "index"
 72 | 
 73 | # The language for content autogenerated by Sphinx. Refer to documentation
 74 | # for a list of supported languages.
 75 | #
 76 | # This is also used if you do content translation via gettext catalogs.
 77 | # Usually you set "language" from the command line for these cases.
 78 | language = None
 79 | 
 80 | # List of patterns, relative to source directory, that match files and
 81 | # directories to ignore when looking for source files.
 82 | # This pattern also affects html_static_path and html_extra_path .
 83 | exclude_patterns = ["_build", "**.ipynb_checkpoints"]
 84 | 
 85 | # The name of the Pygments (syntax highlighting) style to use.
 86 | pygments_style = "sphinx"
 87 | 
 88 | # -- Options for HTML output -------------------------------------------------
 89 | 
 90 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 91 | # a list of builtin themes.
 92 | #
 93 | html_theme = "sphinx_rtd_theme"
 94 | 
 95 | # Theme options are theme-specific and customize the look and feel of a theme
 96 | # further.  For a list of options available for each theme, see the
 97 | # documentation.
 98 | #
 99 | html_theme_options = {"collapse_navigation": False, "style_external_links": True}
100 | 
101 | # Add any paths that contain custom static files (such as style sheets) here,
102 | # relative to this directory. They are copied after the builtin static files,
103 | # so a file named "default.css" will overwrite the builtin "default.css".
104 | html_static_path = ["_static"]
105 | 
106 | # Custom sidebar templates, must be a dictionary that maps document names
107 | # to template names.
108 | #
109 | # The default sidebars (for documents that don't match any pattern) are
110 | # defined by theme itself.  Builtin themes are using these templates by
111 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
112 | # 'searchbox.html']``.
113 | #
114 | # html_sidebars = {}
115 | 
116 | html_show_sourcelink = False
117 | 
118 | # Removes, from all docs, the copyright footer.
119 | html_show_copyright = False
120 | 
121 | # -- Options for HTMLHelp output ---------------------------------------------
122 | 
123 | # Output file base name for HTML help builder.
124 | htmlhelp_basename = "delay_predictiondoc"
125 | 
126 | # -- Options for LaTeX output ------------------------------------------------
127 | 
128 | latex_elements = {
129 |     # The paper size ('letterpaper' or 'a4paper').
130 |     #
131 |     # 'papersize': 'letterpaper',
132 |     #
133 |     # The font size ('10pt', '11pt' or '12pt').
134 |     #
135 |     # 'pointsize': '10pt',
136 |     #
137 |     # Additional stuff for the LaTeX preamble.
138 |     #
139 |     # 'preamble': '',
140 |     #
141 |     # Latex figure (float) alignment
142 |     #
143 |     # 'figure_align': 'htbp',
144 | }
145 | 
146 | # Grouping the document tree into LaTeX files. List of tuples
147 | # (source start file, target name, title,
148 | #  author, documentclass [howto, manual, or own class]).
149 | latex_documents = [
150 |     (
151 |         master_doc,
152 |         "delay_prediction.tex",
153 |         "delay_prediction Documentation",
154 |         "Kedro",
155 |         "manual",
156 |     )
157 | ]
158 | 
159 | # -- Options for manual page output ------------------------------------------
160 | 
161 | # One entry per manual page. List of tuples
162 | # (source start file, name, description, authors, manual section).
163 | man_pages = [
164 |     (
165 |         master_doc,
166 |         "delay_prediction",
167 |         "delay_prediction Documentation",
168 |         [author],
169 |         1,
170 |     )
171 | ]
172 | 
173 | # -- Options for Texinfo output ----------------------------------------------
174 | 
175 | # Grouping the document tree into Texinfo files. List of tuples
176 | # (source start file, target name, title, author,
177 | #  dir menu entry, description, category)
178 | texinfo_documents = [
179 |     (
180 |         master_doc,
181 |         "delay_prediction",
182 |         "delay_prediction Documentation",
183 |         author,
184 |         "delay_prediction",
185 |         "Project delay_prediction codebase.",
186 |         "Data-Science",
187 |     )
188 | ]
189 | 
190 | # -- Options for todo extension ----------------------------------------------
191 | 
192 | # If true, `todo` and `todoList` produce output, else they produce nothing.
193 | todo_include_todos = False
194 | 
195 | # -- Extension configuration -------------------------------------------------
196 | 
197 | # nbsphinx_prolog = """
198 | # see here for prolog/epilog details:
199 | # https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html
200 | # """
201 | 
202 | # -- NBconvert kernel config -------------------------------------------------
203 | nbsphinx_kernel_name = "python3"
204 | 
205 | 
206 | def remove_arrows_in_examples(lines):
207 |     for i, line in enumerate(lines):
208 |         lines[i] = line.replace(">>>", "")
209 | 
210 | 
211 | def autodoc_process_docstring(app, what, name, obj, options, lines):
212 |     remove_arrows_in_examples(lines)
213 | 
214 | 
215 | def skip(app, what, name, obj, skip, options):
216 |     if name == "__init__":
217 |         return False
218 |     return skip
219 | 
220 | 
221 | def setup(app):
222 |     app.connect("autodoc-process-docstring", autodoc_process_docstring)
223 |     app.connect("autodoc-skip-member", skip)
224 |     # add Kedro stylesheets
225 |     for stylesheet in find_stylesheets():
226 |         app.add_css_file(stylesheet)
227 |     # enable rendering RST tables in Markdown
228 | 


--------------------------------------------------------------------------------
/00 - Welcome.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "editable": true,
  7 |     "slideshow": {
  8 |      "slide_type": "slide"
  9 |     },
 10 |     "tags": []
 11 |    },
 12 |    "source": [
 13 |     "# Welcome to the Kedro-Ibis tutorial!\n",
 14 |     "\n",
 15 |     "## Outline\n",
 16 |     "\n",
 17 |     "- Introduction\n",
 18 |     "  - Who we are\n",
 19 |     "  - Workshop material\n",
 20 |     "  - Setup\n",
 21 |     "  - Motivation\n",
 22 |     "- Expressive analytics at any scale: Introduction to Ibis\n",
 23 |     "- From prototype to production: Introduction to Kedro\n",
 24 |     "- Conclusion"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {
 30 |     "editable": true,
 31 |     "slideshow": {
 32 |      "slide_type": "skip"
 33 |     },
 34 |     "tags": []
 35 |    },
 36 |    "source": [
 37 |     "These are the notebooks for the tutorial: 👇\n",
 38 |     "\n",
 39 |     "1. [Getting Started with Ibis](./01%20-%20Getting%20Started%20with%20Ibis.ipynb)\n",
 40 |     "2. [Switching Backends](./02%20-%20Switching%20Backends.ipynb)\n",
 41 |     "3. [First Steps with Kedro](./03%20-%20First%20Steps%20with%20Kedro.ipynb)"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "attachments": {},
 46 |    "cell_type": "markdown",
 47 |    "metadata": {
 48 |     "editable": true,
 49 |     "slideshow": {
 50 |      "slide_type": "slide"
 51 |     },
 52 |     "tags": []
 53 |    },
 54 |    "source": [
 55 |     "## Introduction\n",
 56 |     "\n",
 57 |     "### Who we are\n",
 58 |     "\n",
 59 |     "|  |  |\n",
 60 |     "|--------|------|\n",
 61 |     "| ![Deepyaman](static/deepyaman.jpg) | **Deepyaman Datta**<br><br>Deepyaman is a software engineer at Voltron Data. Before their acquisition by Voltron Data, he was a Founding Machine Learning Engineer at  Claypot AI, working on their real-time feature engineering platform. Prior to that, he led data engineering teams and asset development across a range of  industries at QuantumBlack, AI by McKinsey. |\n",
 62 |     "| ![Juan Luis](static/juanluis.png) | **Juan Luis Cano Rodríguez**<br><br>Juan Luis (he/him/él) is an Aerospace Engineer with a passion for tech communities, outreach, and sustainability. He works at QuantumBlack, AI by McKinsey, as Product Manager for Kedro, an  opinionated Python framework for creating reproducible, maintainable and modular data science code. He has worked as Developer Advocate at Read  the Docs, as software engineer in the space, consulting, and banking industries, and as a Python trainer for several private and public entities. |"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "attachments": {},
 67 |    "cell_type": "markdown",
 68 |    "metadata": {
 69 |     "editable": true,
 70 |     "slideshow": {
 71 |      "slide_type": "subslide"
 72 |     },
 73 |     "tags": []
 74 |    },
 75 |    "source": [
 76 |     "### Workshop material\n",
 77 |     "\n",
 78 |     "**https://github.com/ibis-project/kedro-ibis-tutorial**\n",
 79 |     "\n",
 80 |     "![QR Code](static/qr.png)\n",
 81 |     "\n",
 82 |     "_Note: This will be a lot of material for a 90-minute tutorial; we'll go fast and not go too much in depth, but will be more than happy to answer questions later!_"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {
 88 |     "editable": true,
 89 |     "slideshow": {
 90 |      "slide_type": "subslide"
 91 |     },
 92 |     "tags": []
 93 |    },
 94 |    "source": [
 95 |     "1. Open URL above\n",
 96 |     "2. Hit 🟩 \"Create codespace on main\"\n",
 97 |     "3. Open `00 - Welcome.ipynb` notebook and follow instructions\n",
 98 |     "\n",
 99 |     "<img src=\"static/codespaces.png\" width=\"400\" alt=\"Codespaces\">"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "markdown",
104 |    "metadata": {
105 |     "editable": true,
106 |     "slideshow": {
107 |      "slide_type": "subslide"
108 |     },
109 |     "tags": []
110 |    },
111 |    "source": [
112 |     "## Setup\n",
113 |     "\n",
114 |     "Let's start by downloading the [nycflights13 data](https://github.com/hadley/nycflights13); we'll use this dataset throughout the tutorial."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {
121 |     "editable": true,
122 |     "slideshow": {
123 |      "slide_type": "fragment"
124 |     },
125 |     "tags": []
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "import ibis\n",
130 |     "\n",
131 |     "con = ibis.connect(\"duckdb://nycflights13.ddb\")\n",
132 |     "con.create_table(\n",
133 |     "    \"flights\", ibis.examples.nycflights13_flights.fetch().to_pyarrow(), overwrite=True\n",
134 |     ")\n",
135 |     "con.create_table(\n",
136 |     "    \"weather\", ibis.examples.nycflights13_weather.fetch().to_pyarrow(), overwrite=True\n",
137 |     ")\n",
138 |     "con.disconnect()"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {
144 |     "editable": true,
145 |     "slideshow": {
146 |      "slide_type": "subslide"
147 |     },
148 |     "tags": []
149 |    },
150 |    "source": [
151 |     "Next, we'll load the data into a local PostgreSQL database using DuckDB—[yes, you can do that](https://duckdb.org/docs/extensions/postgres.html#writing-data-to-postgres)!"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {
158 |     "editable": true,
159 |     "slideshow": {
160 |      "slide_type": "fragment"
161 |     },
162 |     "tags": []
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "!psql < sql/create_nycflights13.sql"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "editable": true,
174 |     "slideshow": {
175 |      "slide_type": "fragment"
176 |     },
177 |     "tags": []
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "!duckdb nycflights13.ddb < sql/load_nycflights13.sql"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {
187 |     "editable": true,
188 |     "slideshow": {
189 |      "slide_type": "subslide"
190 |     },
191 |     "tags": []
192 |    },
193 |    "source": [
194 |     "We can confirm that our PostgreSQL database contains the tables we just populated."
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {
201 |     "editable": true,
202 |     "slideshow": {
203 |      "slide_type": "fragment"
204 |     },
205 |     "tags": []
206 |    },
207 |    "outputs": [],
208 |    "source": [
209 |     "!psql < sql/verify_nycflights13.sql"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {
215 |     "editable": true,
216 |     "slideshow": {
217 |      "slide_type": "subslide"
218 |     },
219 |     "tags": []
220 |    },
221 |    "source": [
222 |     "## Motivation\n",
223 |     "\n",
224 |     "In your experience doing data analytics/building data pipelines, have you ever..."
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "markdown",
229 |    "metadata": {
230 |     "editable": true,
231 |     "slideshow": {
232 |      "slide_type": "fragment"
233 |     },
234 |     "tags": []
235 |    },
236 |    "source": [
237 |     "- ...slurped up large amounts of data into memory, instead of pushing execution down to the source database/engine?"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {
243 |     "editable": true,
244 |     "slideshow": {
245 |      "slide_type": "fragment"
246 |     },
247 |     "tags": []
248 |    },
249 |    "source": [
250 |     "- ...prototyped code in pandas, and then rewritten it in PySpark/Snowpark/some other native dataframe API?"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {
256 |     "editable": true,
257 |     "slideshow": {
258 |      "slide_type": "fragment"
259 |     },
260 |     "tags": []
261 |    },
262 |    "source": [
263 |     "- ...implemented a proof-of-concept solution on data extracts, and then struggled massively when you needed to move to running against the production databases and scale out?"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "markdown",
268 |    "metadata": {
269 |     "editable": true,
270 |     "slideshow": {
271 |      "slide_type": "fragment"
272 |     },
273 |     "tags": []
274 |    },
275 |    "source": [
276 |     "- ...insisted on using Python across the full data engineering/data science workflow for consistency (fair enough), although dbt would have been the much better fit for non-ML pipelines, because you essentially needed a SQL workflow?"
277 |    ]
278 |   }
279 |  ],
280 |  "metadata": {
281 |   "kernelspec": {
282 |    "display_name": "Python 3 (ipykernel)",
283 |    "language": "python",
284 |    "name": "python3"
285 |   },
286 |   "language_info": {
287 |    "codemirror_mode": {
288 |     "name": "ipython",
289 |     "version": 3
290 |    },
291 |    "file_extension": ".py",
292 |    "mimetype": "text/x-python",
293 |    "name": "python",
294 |    "nbconvert_exporter": "python",
295 |    "pygments_lexer": "ipython3",
296 |    "version": "3.11.9"
297 |   }
298 |  },
299 |  "nbformat": 4,
300 |  "nbformat_minor": 4
301 | }
302 | 


--------------------------------------------------------------------------------
/codespace_requirements.txt:
--------------------------------------------------------------------------------
  1 | # This file was autogenerated by uv via the following command:
  2 | #    uv pip compile requirements.txt -o codespace_requirements.txt
  3 | aiofiles==23.2.1
  4 |     # via kedro-viz
  5 | aiohttp==3.9.5
  6 |     # via gcsfs
  7 | aiosignal==1.3.1
  8 |     # via aiohttp
  9 | annotated-types==0.7.0
 10 |     # via pydantic
 11 | antlr4-python3-runtime==4.9.3
 12 |     # via omegaconf
 13 | anyio==3.7.1
 14 |     # via
 15 |     #   httpx
 16 |     #   jupyter-server
 17 |     #   starlette
 18 |     #   watchfiles
 19 |     #   watchgod
 20 | appdirs==1.4.4
 21 |     # via
 22 |     #   kedro-telemetry
 23 |     #   pins
 24 | argon2-cffi==23.1.0
 25 |     # via jupyter-server
 26 | argon2-cffi-bindings==21.2.0
 27 |     # via argon2-cffi
 28 | arrow==1.3.0
 29 |     # via
 30 |     #   cookiecutter
 31 |     #   isoduration
 32 | asttokens==2.4.1
 33 |     # via stack-data
 34 | async-lru==2.0.4
 35 |     # via jupyterlab
 36 | atpublic==4.1.0
 37 |     # via ibis-framework
 38 | attrs==23.2.0
 39 |     # via
 40 |     #   aiohttp
 41 |     #   jsonschema
 42 |     #   kedro
 43 |     #   referencing
 44 | babel==2.15.0
 45 |     # via jupyterlab-server
 46 | beautifulsoup4==4.12.3
 47 |     # via nbconvert
 48 | bidict==0.23.1
 49 |     # via ibis-framework
 50 | binaryornot==0.4.4
 51 |     # via cookiecutter
 52 | bleach==6.1.0
 53 |     # via nbconvert
 54 | build==1.2.1
 55 |     # via kedro
 56 | cachetools==5.3.3
 57 |     # via
 58 |     #   google-auth
 59 |     #   kedro
 60 | certifi==2024.7.4
 61 |     # via
 62 |     #   -r requirements.txt
 63 |     #   httpcore
 64 |     #   httpx
 65 |     #   requests
 66 | cffi==1.16.0
 67 |     # via argon2-cffi-bindings
 68 | chardet==5.2.0
 69 |     # via binaryornot
 70 | charset-normalizer==3.3.2
 71 |     # via requests
 72 | click==8.1.7
 73 |     # via
 74 |     #   click-default-group
 75 |     #   cookiecutter
 76 |     #   kedro
 77 |     #   typer
 78 |     #   uvicorn
 79 | click-default-group==1.2.4
 80 |     # via kedro-viz
 81 | comm==0.2.2
 82 |     # via ipykernel
 83 | cookiecutter==2.6.0
 84 |     # via kedro
 85 | debugpy==1.8.1
 86 |     # via ipykernel
 87 | decorator==5.1.1
 88 |     # via
 89 |     #   gcsfs
 90 |     #   ipython
 91 | defusedxml==0.7.1
 92 |     # via nbconvert
 93 | dnspython==2.6.1
 94 |     # via email-validator
 95 | duckdb==1.0.0
 96 |     # via ibis-framework
 97 | dynaconf==3.2.5
 98 |     # via kedro
 99 | email-validator==2.1.1
100 |     # via fastapi
101 | executing==2.0.1
102 |     # via stack-data
103 | fastapi==0.111.0
104 |     # via kedro-viz
105 | fastapi-cli==0.0.4
106 |     # via fastapi
107 | fastjsonschema==2.19.1
108 |     # via nbformat
109 | fqdn==1.5.1
110 |     # via jsonschema
111 | frozenlist==1.4.1
112 |     # via
113 |     #   aiohttp
114 |     #   aiosignal
115 | fsspec==2024.2.0
116 |     # via
117 |     #   gcsfs
118 |     #   ibis-framework
119 |     #   kedro
120 |     #   kedro-viz
121 |     #   pins
122 | gcsfs==2024.2.0
123 |     # via pins
124 | gitdb==4.0.11
125 |     # via gitpython
126 | gitpython==3.1.41
127 |     # via kedro
128 | google-api-core==2.19.0
129 |     # via
130 |     #   google-cloud-core
131 |     #   google-cloud-storage
132 | google-auth==2.29.0
133 |     # via
134 |     #   gcsfs
135 |     #   google-api-core
136 |     #   google-auth-oauthlib
137 |     #   google-cloud-core
138 |     #   google-cloud-storage
139 | google-auth-oauthlib==1.2.0
140 |     # via gcsfs
141 | google-cloud-core==2.4.1
142 |     # via google-cloud-storage
143 | google-cloud-storage==2.16.0
144 |     # via gcsfs
145 | google-crc32c==1.5.0
146 |     # via
147 |     #   google-cloud-storage
148 |     #   google-resumable-media
149 | google-resumable-media==2.7.0
150 |     # via google-cloud-storage
151 | googleapis-common-protos==1.63.0
152 |     # via google-api-core
153 | graphql-core==3.2.3
154 |     # via strawberry-graphql
155 | greenlet==3.0.3
156 |     # via sqlalchemy
157 | h11==0.14.0
158 |     # via
159 |     #   httpcore
160 |     #   uvicorn
161 | httpcore==1.0.5
162 |     # via httpx
163 | httptools==0.6.1
164 |     # via uvicorn
165 | httpx==0.27.0
166 |     # via
167 |     #   fastapi
168 |     #   jupyterlab
169 | humanize==4.9.0
170 |     # via pins
171 | ibis-framework==9.1.0
172 |     # via
173 |     #   -r demo/delay-prediction/requirements.txt
174 |     #   ibis-ml
175 |     #   kedro-datasets
176 | ibis-ml==0.1.0
177 |     # via -r demo/delay-prediction/requirements.txt
178 | idna==3.7
179 |     # via
180 |     #   anyio
181 |     #   email-validator
182 |     #   httpx
183 |     #   jsonschema
184 |     #   requests
185 |     #   yarl
186 | importlib-metadata==7.1.0
187 |     # via
188 |     #   kedro
189 |     #   pins
190 | importlib-resources==6.4.0
191 |     # via
192 |     #   kedro
193 |     #   pins
194 | ipykernel==6.29.4
195 |     # via jupyterlab
196 | ipython==8.24.0
197 |     # via
198 |     #   ipykernel
199 |     #   kedro-viz
200 | isoduration==20.11.0
201 |     # via jsonschema
202 | jedi==0.19.1
203 |     # via ipython
204 | jinja2==3.1.4
205 |     # via
206 |     #   cookiecutter
207 |     #   fastapi
208 |     #   jupyter-server
209 |     #   jupyterlab
210 |     #   jupyterlab-server
211 |     #   nbconvert
212 |     #   pins
213 | joblib==1.4.2
214 |     # via
215 |     #   pins
216 |     #   scikit-learn
217 | json5==0.9.25
218 |     # via jupyterlab-server
219 | jsonpointer==2.4
220 |     # via jsonschema
221 | jsonschema==4.22.0
222 |     # via
223 |     #   jupyter-events
224 |     #   jupyterlab-server
225 |     #   nbformat
226 | jsonschema-specifications==2023.12.1
227 |     # via jsonschema
228 | jupyter-client==8.6.1
229 |     # via
230 |     #   ipykernel
231 |     #   jupyter-server
232 |     #   nbclient
233 | jupyter-core==5.7.2
234 |     # via
235 |     #   ipykernel
236 |     #   jupyter-client
237 |     #   jupyter-server
238 |     #   jupyterlab
239 |     #   nbclient
240 |     #   nbconvert
241 |     #   nbformat
242 | jupyter-events==0.10.0
243 |     # via jupyter-server
244 | jupyter-lsp==2.2.5
245 |     # via jupyterlab
246 | jupyter-server==2.14.0
247 |     # via
248 |     #   jupyter-lsp
249 |     #   jupyterlab
250 |     #   jupyterlab-server
251 |     #   notebook-shim
252 | jupyter-server-terminals==0.5.3
253 |     # via jupyter-server
254 | jupyterlab==4.1.8
255 |     # via -r requirements.txt
256 | jupyterlab-pygments==0.3.0
257 |     # via nbconvert
258 | jupyterlab-server==2.27.1
259 |     # via jupyterlab
260 | kedro==0.19.6
261 |     # via
262 |     #   -r demo/delay-prediction/requirements.txt
263 |     #   kedro-datasets
264 |     #   kedro-telemetry
265 |     #   kedro-viz
266 | kedro-datasets==3.0.1
267 |     # via -r demo/delay-prediction/requirements.txt
268 | kedro-telemetry==0.4.0
269 |     # via -r demo/delay-prediction/requirements.txt
270 | kedro-viz==9.1.0
271 |     # via -r demo/delay-prediction/requirements.txt
272 | lazy-loader==0.4
273 |     # via kedro-datasets
274 | markdown-it-py==3.0.0
275 |     # via rich
276 | markupsafe==2.1.5
277 |     # via
278 |     #   jinja2
279 |     #   nbconvert
280 | matplotlib-inline==0.1.7
281 |     # via
282 |     #   ipykernel
283 |     #   ipython
284 | mdurl==0.1.2
285 |     # via markdown-it-py
286 | mistune==3.0.2
287 |     # via nbconvert
288 | more-itertools==10.2.0
289 |     # via kedro
290 | multidict==6.0.5
291 |     # via
292 |     #   aiohttp
293 |     #   yarl
294 | nbclient==0.10.0
295 |     # via nbconvert
296 | nbconvert==7.16.4
297 |     # via jupyter-server
298 | nbformat==5.10.4
299 |     # via
300 |     #   jupyter-server
301 |     #   nbclient
302 |     #   nbconvert
303 | nest-asyncio==1.6.0
304 |     # via ipykernel
305 | networkx==3.3
306 |     # via kedro-viz
307 | notebook-shim==0.2.4
308 |     # via jupyterlab
309 | numpy==1.26.4
310 |     # via
311 |     #   ibis-framework
312 |     #   pandas
313 |     #   pyarrow
314 |     #   scikit-learn
315 |     #   scipy
316 | oauthlib==3.2.2
317 |     # via requests-oauthlib
318 | omegaconf==2.3.0
319 |     # via kedro
320 | orjson==3.10.3
321 |     # via
322 |     #   fastapi
323 |     #   kedro-viz
324 | overrides==7.7.0
325 |     # via jupyter-server
326 | packaging==23.2
327 |     # via
328 |     #   build
329 |     #   ibis-framework
330 |     #   ipykernel
331 |     #   jupyter-server
332 |     #   jupyterlab
333 |     #   jupyterlab-server
334 |     #   kedro-viz
335 |     #   lazy-loader
336 |     #   nbconvert
337 |     #   plotly
338 |     #   pytoolconfig
339 | pandas==2.2.2
340 |     # via
341 |     #   ibis-framework
342 |     #   kedro-viz
343 |     #   pins
344 | pandocfilters==1.5.1
345 |     # via nbconvert
346 | parse==1.20.1
347 |     # via kedro
348 | parso==0.8.4
349 |     # via jedi
350 | parsy==2.1
351 |     # via ibis-framework
352 | pexpect==4.9.0
353 |     # via ipython
354 | pins==0.8.6
355 |     # via ibis-framework
356 | platformdirs==4.2.1
357 |     # via
358 |     #   jupyter-core
359 |     #   pytoolconfig
360 | plotly==5.22.0
361 |     # via kedro-viz
362 | pluggy==1.5.0
363 |     # via kedro
364 | polars==0.20.23
365 |     # via ibis-framework
366 | pre-commit-hooks==4.6.0
367 |     # via kedro
368 | prometheus-client==0.20.0
369 |     # via jupyter-server
370 | prompt-toolkit==3.0.43
371 |     # via ipython
372 | proto-plus==1.23.0
373 |     # via google-api-core
374 | protobuf==4.25.3
375 |     # via
376 |     #   google-api-core
377 |     #   googleapis-common-protos
378 |     #   proto-plus
379 | psutil==5.9.8
380 |     # via ipykernel
381 | psycopg2==2.9.9
382 |     # via ibis-framework
383 | ptyprocess==0.7.0
384 |     # via
385 |     #   pexpect
386 |     #   terminado
387 | pure-eval==0.2.2
388 |     # via stack-data
389 | pyarrow==16.0.0
390 |     # via ibis-framework
391 | pyarrow-hotfix==0.6
392 |     # via ibis-framework
393 | pyasn1==0.6.0
394 |     # via
395 |     #   pyasn1-modules
396 |     #   rsa
397 | pyasn1-modules==0.4.0
398 |     # via google-auth
399 | pycparser==2.22
400 |     # via cffi
401 | pydantic==2.7.3
402 |     # via
403 |     #   fastapi
404 |     #   kedro-viz
405 | pydantic-core==2.18.4
406 |     # via pydantic
407 | pygments==2.18.0
408 |     # via
409 |     #   ipython
410 |     #   nbconvert
411 |     #   rich
412 | pyproject-hooks==1.1.0
413 |     # via build
414 | python-dateutil==2.9.0.post0
415 |     # via
416 |     #   arrow
417 |     #   ibis-framework
418 |     #   jupyter-client
419 |     #   pandas
420 |     #   strawberry-graphql
421 | python-dotenv==1.0.1
422 |     # via uvicorn
423 | python-json-logger==2.0.7
424 |     # via jupyter-events
425 | python-multipart==0.0.9
426 |     # via fastapi
427 | python-slugify==8.0.4
428 |     # via cookiecutter
429 | pytoolconfig==1.3.1
430 |     # via rope
431 | pytz==2024.1
432 |     # via
433 |     #   ibis-framework
434 |     #   pandas
435 | pyyaml==6.0.1
436 |     # via
437 |     #   cookiecutter
438 |     #   jupyter-events
439 |     #   kedro
440 |     #   omegaconf
441 |     #   pins
442 |     #   uvicorn
443 | pyzmq==26.0.3
444 |     # via
445 |     #   ipykernel
446 |     #   jupyter-client
447 |     #   jupyter-server
448 | referencing==0.35.1
449 |     # via
450 |     #   jsonschema
451 |     #   jsonschema-specifications
452 |     #   jupyter-events
453 | requests==2.31.0
454 |     # via
455 |     #   cookiecutter
456 |     #   gcsfs
457 |     #   google-api-core
458 |     #   google-cloud-storage
459 |     #   jupyterlab-server
460 |     #   kedro-telemetry
461 |     #   pins
462 |     #   requests-oauthlib
463 | requests-oauthlib==2.0.0
464 |     # via google-auth-oauthlib
465 | rfc3339-validator==0.1.4
466 |     # via
467 |     #   jsonschema
468 |     #   jupyter-events
469 | rfc3986-validator==0.1.1
470 |     # via
471 |     #   jsonschema
472 |     #   jupyter-events
473 | rich==13.7.1
474 |     # via
475 |     #   cookiecutter
476 |     #   ibis-framework
477 |     #   kedro
478 |     #   typer
479 | rope==1.13.0
480 |     # via kedro
481 | rpds-py==0.18.1
482 |     # via
483 |     #   jsonschema
484 |     #   referencing
485 | rsa==4.9
486 |     # via google-auth
487 | ruamel-yaml==0.18.6
488 |     # via pre-commit-hooks
489 | ruamel-yaml-clib==0.2.8
490 |     # via ruamel-yaml
491 | scikit-learn==1.5.0
492 |     # via -r demo/delay-prediction/requirements.txt
493 | scipy==1.13.0
494 |     # via scikit-learn
495 | secure==0.3.0
496 |     # via kedro-viz
497 | send2trash==1.8.3
498 |     # via jupyter-server
499 | shellingham==1.5.4
500 |     # via typer
501 | six==1.16.0
502 |     # via
503 |     #   asttokens
504 |     #   bleach
505 |     #   python-dateutil
506 |     #   rfc3339-validator
507 | smmap==5.0.1
508 |     # via gitdb
509 | sniffio==1.3.1
510 |     # via
511 |     #   anyio
512 |     #   httpx
513 | soupsieve==2.5
514 |     # via beautifulsoup4
515 | sqlalchemy==2.0.30
516 |     # via kedro-viz
517 | sqlglot==23.12.2
518 |     # via ibis-framework
519 | stack-data==0.6.3
520 |     # via ipython
521 | starlette==0.37.2
522 |     # via fastapi
523 | strawberry-graphql==0.234.2
524 |     # via kedro-viz
525 | tenacity==8.2.3
526 |     # via plotly
527 | terminado==0.18.1
528 |     # via
529 |     #   jupyter-server
530 |     #   jupyter-server-terminals
531 | text-unidecode==1.3
532 |     # via python-slugify
533 | threadpoolctl==3.5.0
534 |     # via scikit-learn
535 | tinycss2==1.3.0
536 |     # via nbconvert
537 | toml==0.10.2
538 |     # via kedro
539 | toolz==0.12.1
540 |     # via ibis-framework
541 | toposort==1.10
542 |     # via kedro-viz
543 | tornado==6.4
544 |     # via
545 |     #   ipykernel
546 |     #   jupyter-client
547 |     #   jupyter-server
548 |     #   jupyterlab
549 |     #   terminado
550 | traitlets==5.14.3
551 |     # via
552 |     #   comm
553 |     #   ipykernel
554 |     #   ipython
555 |     #   jupyter-client
556 |     #   jupyter-core
557 |     #   jupyter-events
558 |     #   jupyter-server
559 |     #   jupyterlab
560 |     #   matplotlib-inline
561 |     #   nbclient
562 |     #   nbconvert
563 |     #   nbformat
564 | typer==0.12.3
565 |     # via fastapi-cli
566 | types-python-dateutil==2.9.0.20240316
567 |     # via arrow
568 | typing-extensions==4.11.0
569 |     # via
570 |     #   fastapi
571 |     #   ibis-framework
572 |     #   ipython
573 |     #   pydantic
574 |     #   pydantic-core
575 |     #   sqlalchemy
576 |     #   strawberry-graphql
577 |     #   typer
578 | tzdata==2024.1
579 |     # via pandas
580 | ujson==5.10.0
581 |     # via fastapi
582 | uri-template==1.3.0
583 |     # via jsonschema
584 | urllib3==2.2.1
585 |     # via requests
586 | uvicorn==0.29.0
587 |     # via
588 |     #   fastapi
589 |     #   kedro-viz
590 | uvloop==0.19.0
591 |     # via uvicorn
592 | watchfiles==0.22.0
593 |     # via uvicorn
594 | watchgod==0.8.2
595 |     # via kedro-viz
596 | wcwidth==0.2.13
597 |     # via prompt-toolkit
598 | webcolors==1.13
599 |     # via jsonschema
600 | webencodings==0.5.1
601 |     # via
602 |     #   bleach
603 |     #   tinycss2
604 | websocket-client==1.8.0
605 |     # via jupyter-server
606 | websockets==12.0
607 |     # via uvicorn
608 | xxhash==3.4.1
609 |     # via pins
610 | yarl==1.9.4
611 |     # via aiohttp
612 | zipp==3.18.1
613 |     # via importlib-metadata
614 | 


--------------------------------------------------------------------------------
/03 - First Steps with Kedro.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "editable": true,
  7 |     "slideshow": {
  8 |      "slide_type": ""
  9 |     },
 10 |     "tags": []
 11 |    },
 12 |    "source": [
 13 |     "# First Steps with Kedro\n",
 14 |     "\n",
 15 |     "<img src=\"static/kedro-horizontal-color-on-light.png\" width=\"400\" alt=\"Kedro\">\n",
 16 |     "\n",
 17 |     "**Goal**: Create a classifier that predicts whether a flight will be delayed or not, using the [nycflights13 data](https://github.com/hadley/nycflights13)."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {
 23 |     "editable": true,
 24 |     "slideshow": {
 25 |      "slide_type": "notes"
 26 |     },
 27 |     "tags": []
 28 |    },
 29 |    "source": [
 30 |     "To see the end result,\n",
 31 |     "\n",
 32 |     "```\n",
 33 |     "$ cd demo/delay-prediction\n",
 34 |     "$ kedro viz run\n",
 35 |     "```\n",
 36 |     "\n",
 37 |     "<img src=\"static/kedro-final-pipeline.png\" width=\"600\" alt=\"Kedro final pipeline\">"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "import ibis\n",
 47 |     "\n",
 48 |     "ibis.options.interactive = True"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "## The `DataCatalog`\n",
 56 |     "\n",
 57 |     "Kedro’s [Data Catalog](https://docs.kedro.org/en/latest/data/) is a registry of all data sources available for use by the project. It offers a separate place to declare details of the datasets your projects use. Kedro provides built-in datasets for different file types and file systems so you don’t have to write any of the logic for reading or writing data.\n",
 58 |     "\n",
 59 |     "Kedro offers a range of datasets, including CSV, Excel, Parquet, Feather, HDF5, JSON, Pickle, SQL Tables, SQL Queries, Spark DataFrames, and more. They are supported with the APIs of pandas, spark, networkx, matplotlib, yaml, and beyond. It relies on fsspec to read and save data from a variety of data stores including local file systems, network file systems, cloud object stores, and Hadoop. You can pass arguments in to load and save operations, and use versioning and credentials for data access.\n",
 60 |     "\n",
 61 |     "To start using the Data Catalog, create an instance of the `DataCatalog` class with a dictionary configuration as follows:"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "from kedro.io import DataCatalog"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "catalog = DataCatalog.from_config(\n",
 80 |     "    {\n",
 81 |     "        \"flights\": {\n",
 82 |     "            \"type\": \"ibis.TableDataset\",\n",
 83 |     "            \"table_name\": \"flights\",\n",
 84 |     "            \"connection\": {\n",
 85 |     "                \"backend\": \"duckdb\",\n",
 86 |     "                \"database\": \"nycflights13.ddb\",\n",
 87 |     "                \"read_only\": True,\n",
 88 |     "            },\n",
 89 |     "        }\n",
 90 |     "    }\n",
 91 |     ")"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "Each entry in the dictionary represents a **dataset**, and each dataset has a **type** as well as some extra properties. Datasets are Python classes that take care of all the I/O needs in Kedro. In this case, we're using `kedro_datasets.ibis.TableDataset`, you can read [its full documentation](https://docs.kedro.org/projects/kedro-datasets/en/kedro-datasets-3.0.1/api/kedro_datasets.ibis.TableDataset.html) online.\n",
 99 |     "\n",
100 |     "After the catalog is created, `catalog.list()` will yield a list of the available dataset names, which you can load using the `catalog.load(<dataset_name>)` method:"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "catalog.list()"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "flights = catalog.load(\"flights\")"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "Notice that the resulting object is the exact same Ibis table we were using in the previous tutorial!"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": [
134 |     "type(flights)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "flights"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "## The `OmegaConfigLoader`\n",
151 |     "\n",
152 |     "Instead of creating the Data Catalog by hand like this, Kedro usually stores configuration in YAML files. To load them, Kedro offers a [configuration loader](https://docs.kedro.org/en/latest/configuration/configuration_basics.html) based on the [Omegaconf](https://omegaconf.readthedocs.io/) library called the `OmegaConfigLoader`. This adds several interesting features, such as\n",
153 |     "\n",
154 |     "- Consolidating different configuration files into one\n",
155 |     "- Substitution, templating\n",
156 |     "- [Resolvers](https://omegaconf.readthedocs.io/en/2.3_branch/custom_resolvers.html)\n",
157 |     "- And [much more](https://docs.kedro.org/en/latest/configuration/advanced_configuration.html)\n",
158 |     "\n",
159 |     "To start using it, first dump the catalog configuration to a `catalog.yml` file, and then use `OmegaConfigLoader` as follows:"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "%%writefile catalog.yml\n",
169 |     "flights:\n",
170 |     "  type: ibis.TableDataset\n",
171 |     "  table_name: flights\n",
172 |     "  connection:\n",
173 |     "    backend: duckdb\n",
174 |     "    database: nycflights13.ddb\n",
175 |     "    read_only: true"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "from kedro.config import OmegaConfigLoader\n",
185 |     "\n",
186 |     "config_loader = OmegaConfigLoader(\n",
187 |     "    conf_source=\".\",  # Directory where configuration files are located\n",
188 |     "    config_patterns={\"catalog\": [\"catalog.yml\"]},  # For simplicity for this demo\n",
189 |     ")"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "catalog_config = config_loader.get(\"catalog\")\n",
199 |     "catalog_config"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "As you can see, `config_loader.get(\"catalog\")` gets you the same dictionary we crafted by hand earlier.\n",
207 |     "\n",
208 |     "However, hardcoding the database path like that seems like an invitation to trouble. Let's declare a variable `_root` inside the YAML file using Omegaconf syntax and load the catalog config again:"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "%%writefile catalog.yml\n",
218 |     "_root: /workspaces/kedro-ibis-tutorial\n",
219 |     "\n",
220 |     "flights:\n",
221 |     "  type: ibis.TableDataset\n",
222 |     "  table_name: flights\n",
223 |     "  connection:\n",
224 |     "    backend: duckdb\n",
225 |     "    database: \"${_root}/nycflights13.ddb\"\n",
226 |     "    read_only: true"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "catalog_config = config_loader.get(\"catalog\")\n",
236 |     "catalog_config"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "catalog = DataCatalog.from_config(catalog_config)"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "catalog.load(\"flights\")"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "## Nodes and pipelines\n",
262 |     "\n",
263 |     "Now comes the interesting part. Kedro structures the computation on Directed Acyclic Graphs (DAGs), which are created by instantiating `Pipeline` objects with a list of `Node`s. By linking the inputs and outpus of each node, Kedro is then able to perform a topological sort and produce a graph.\n",
264 |     "\n",
265 |     "Let's start creating a trivial pipeline with 1 node. That 1 node will be a preprocessing function that will manipulate the `dep_time`, `arr_delay`, and `air_time` columns."
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": null,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": [
274 |     "def preprocess_flights(table):\n",
275 |     "    return table.mutate(\n",
276 |     "        dep_time=(\n",
277 |     "            table.dep_time.lpad(4, \"0\").substr(0, 2)\n",
278 |     "            + \":\"\n",
279 |     "            + table.dep_time.substr(-2, 2)\n",
280 |     "            + \":00\"\n",
281 |     "        ).try_cast(\"time\"),\n",
282 |     "        arr_delay=table.arr_delay.try_cast(int),\n",
283 |     "        air_time=table.air_time.try_cast(int),\n",
284 |     "    )"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {},
291 |    "outputs": [],
292 |    "source": [
293 |     "flights.select(\"year\", \"month\", \"day\", \"dep_time\")"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "preprocess_flights(flights).select(\"year\", \"month\", \"day\", \"dep_time\")"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "Notice that this is a plain Python function, receiving an Ibis table and returning another Ibis table.\n",
310 |     "\n",
311 |     "Now, let's wrap it using the `node` convenience function from Kedro:"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "from kedro.pipeline import node\n",
321 |     "\n",
322 |     "n0 = node(func=preprocess_flights, inputs=\"flights\", outputs=\"preprocessed_flights\")\n",
323 |     "n0"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "markdown",
328 |    "metadata": {},
329 |    "source": [
330 |     "Conceptually, a `Node` is a wrapper around a Python function that defines a single step in a pipeline. It has inputs and outputs, which are the names of the Data Catalog datasets that the function will receive and return, respectively. Therefore, you could execute it as follows:\n",
331 |     "\n",
332 |     "```python\n",
333 |     "n0.func(\n",
334 |     "    *[catalog.load(input_dataset) for input_dataset in n0.inputs],\n",
335 |     ")\n",
336 |     "```\n",
337 |     "\n",
338 |     "Let's not do that though; Kedro will take care of it.\n",
339 |     "\n",
340 |     "The next step is to assemble the pipeline. In this case, it will only have 1 node:"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": [
349 |     "from kedro.pipeline import pipeline\n",
350 |     "\n",
351 |     "pipe = pipeline([n0])\n",
352 |     "pipe"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "markdown",
357 |    "metadata": {},
358 |    "source": [
359 |     "And finally, you can now execute the pipeline. For the purposes of this tutorial, you can use Kedro's `SequentialRunner` directly:"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": null,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "from kedro.runner import SequentialRunner\n",
369 |     "\n",
370 |     "outputs = SequentialRunner().run(pipe, catalog=catalog)"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "markdown",
375 |    "metadata": {},
376 |    "source": [
377 |     "The output of the `.run(...)` method will be \"Any node outputs that cannot be processed by the `DataCatalog`\". Since `preprocessed_flights` is not declared in the Data Catalog, it's right there in the dictionary:"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "outputs.keys()"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": null,
392 |    "metadata": {},
393 |    "outputs": [],
394 |    "source": [
395 |     "outputs[\"preprocessed_flights\"]"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "## Exercises\n",
403 |     "\n",
404 |     "### Exercise 1\n",
405 |     "\n",
406 |     "Complete the `catalog.yml` so that `weather` is included as well.\n",
407 |     "\n",
408 |     "_Extra points_ if you factor the connection details in a variable."
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": null,
414 |    "metadata": {},
415 |    "outputs": [],
416 |    "source": [
417 |     "%load solutions/nb03_ex01_catalog.yml"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "markdown",
422 |    "metadata": {},
423 |    "source": [
424 |     "### Exercise 2\n",
425 |     "\n",
426 |     "Complete the data processing pipeline by defining a `create_model_input_table` function that combines the preprocessed flights and weather data:\n",
427 |     "\n",
428 |     "```python\n",
429 |     "def create_model_input_table(flights, weather) -> ir.Table:\n",
430 |     "    ...\n",
431 |     "```\n",
432 |     "\n",
433 |     "_Hint_: See the `join` explanation in the Ibis notebook.\n",
434 |     "\n",
435 |     "Then, recreate the pipeline so that it has two nodes.\n",
436 |     "\n",
437 |     "_Extra points_ if your node drops the null values of the resulting table and selects only a subset of the columns."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": null,
443 |    "metadata": {},
444 |    "outputs": [],
445 |    "source": [
446 |     "%load solutions/nb03_ex02.py"
447 |    ]
448 |   }
449 |  ],
450 |  "metadata": {
451 |   "kernelspec": {
452 |    "display_name": "Python 3 (ipykernel)",
453 |    "language": "python",
454 |    "name": "python3"
455 |   },
456 |   "language_info": {
457 |    "codemirror_mode": {
458 |     "name": "ipython",
459 |     "version": 3
460 |    },
461 |    "file_extension": ".py",
462 |    "mimetype": "text/x-python",
463 |    "name": "python",
464 |    "nbconvert_exporter": "python",
465 |    "pygments_lexer": "ipython3",
466 |    "version": "3.11.9"
467 |   }
468 |  },
469 |  "nbformat": 4,
470 |  "nbformat_minor": 4
471 | }
472 | 


--------------------------------------------------------------------------------
/01 - Getting Started with Ibis.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |   "cells": [
   3 |     {
   4 |       "cell_type": "markdown",
   5 |       "metadata": {},
   6 |       "source": [
   7 |         "# Getting started with Ibis\n",
   8 |         "\n",
   9 |         "In the [previous notebook](./00%20-%20Welcome.ipynb), we created a DuckDB database file with the [nycflights13 data](https://github.com/hadley/nycflights13). DuckDB is fast and runs locally, so it's handy for lots of use cases, including tutorials. Let's begin by importing Ibis and connecting to the database."
  10 |       ]
  11 |     },
  12 |     {
  13 |       "cell_type": "code",
  14 |       "execution_count": null,
  15 |       "metadata": {},
  16 |       "outputs": [],
  17 |       "source": [
  18 |         "import ibis\n",
  19 |         "\n",
  20 |         "con = ibis.duckdb.connect(\"nycflights13.ddb\", read_only=True)"
  21 |       ]
  22 |     },
  23 |     {
  24 |       "cell_type": "markdown",
  25 |       "metadata": {},
  26 |       "source": [
  27 |         "**Note**: When you connect to a DuckDB database file, DuckDB creates a WAL file to prevent data corruption. If you see a `nycflights13.ddb.wal` file, you can safely ignore it. It will get cleaned up automatically.\n",
  28 |         "\n",
  29 |         "Now we have a connection, we can start by looking around. Are there any tables in this database?"
  30 |       ]
  31 |     },
  32 |     {
  33 |       "cell_type": "code",
  34 |       "execution_count": null,
  35 |       "metadata": {},
  36 |       "outputs": [],
  37 |       "source": [
  38 |         "con.list_tables()"
  39 |       ]
  40 |     },
  41 |     {
  42 |       "cell_type": "markdown",
  43 |       "metadata": {},
  44 |       "source": [
  45 |         "Two, in fact! Let's take a look at the `flights` table first."
  46 |       ]
  47 |     },
  48 |     {
  49 |       "cell_type": "code",
  50 |       "execution_count": null,
  51 |       "metadata": {},
  52 |       "outputs": [],
  53 |       "source": [
  54 |         "flights = con.table(\"flights\")"
  55 |       ]
  56 |     },
  57 |     {
  58 |       "cell_type": "markdown",
  59 |       "metadata": {},
  60 |       "source": [
  61 |         "By default, you'll get a printable representation of the table schema, showing the name and data type of each column."
  62 |       ]
  63 |     },
  64 |     {
  65 |       "cell_type": "code",
  66 |       "execution_count": null,
  67 |       "metadata": {},
  68 |       "outputs": [],
  69 |       "source": [
  70 |         "flights"
  71 |       ]
  72 |     },
  73 |     {
  74 |       "cell_type": "markdown",
  75 |       "metadata": {},
  76 |       "source": [
  77 |         "If we call the `head` method to peek at the data, you'll notice that we don't actually see data (yet); what's going on?"
  78 |       ]
  79 |     },
  80 |     {
  81 |       "cell_type": "code",
  82 |       "execution_count": null,
  83 |       "metadata": {},
  84 |       "outputs": [],
  85 |       "source": [
  86 |         "flights.head()"
  87 |       ]
  88 |     },
  89 |     {
  90 |       "cell_type": "markdown",
  91 |       "metadata": {},
  92 |       "source": [
  93 |         "Ibis has a deferred execution model. It builds up expressions based on what you ask it to do, and then executes those expressions on request.\n",
  94 |         "\n",
  95 |         "In this case, our query isn't too involved; we want to see the first few rows of the `flights` table. We can do that by asking for the results of this query as a `pandas.DataFrame`:"
  96 |       ]
  97 |     },
  98 |     {
  99 |       "cell_type": "code",
 100 |       "execution_count": null,
 101 |       "metadata": {},
 102 |       "outputs": [],
 103 |       "source": [
 104 |         "flights.head().to_pandas()"
 105 |       ]
 106 |     },
 107 |     {
 108 |       "cell_type": "markdown",
 109 |       "metadata": {},
 110 |       "source": [
 111 |         "Or a `pyarrow.Table`:"
 112 |       ]
 113 |     },
 114 |     {
 115 |       "cell_type": "code",
 116 |       "execution_count": null,
 117 |       "metadata": {},
 118 |       "outputs": [],
 119 |       "source": [
 120 |         "flights.head().to_pyarrow()"
 121 |       ]
 122 |     },
 123 |     {
 124 |       "cell_type": "markdown",
 125 |       "metadata": {},
 126 |       "source": [
 127 |         "Or a `polars.DataFrame`:"
 128 |       ]
 129 |     },
 130 |     {
 131 |       "cell_type": "code",
 132 |       "execution_count": null,
 133 |       "metadata": {},
 134 |       "outputs": [],
 135 |       "source": [
 136 |         "flights.head().to_polars()"
 137 |       ]
 138 |     },
 139 |     {
 140 |       "cell_type": "markdown",
 141 |       "metadata": {},
 142 |       "source": [
 143 |         "We'll get into more detail about what Ibis is doing a bit later on. For now, the important point is that Ibis is deferred.\n",
 144 |         "\n",
 145 |         "## Interactive mode\n",
 146 |         "\n",
 147 |         "Remember when we said Ibis is deferred? Sometimes you want eager execution so you can explore a dataset. For the rest of this notebook, we'll turn on interactive mode, where Ibis will eagerly execute as much of the query as it needs to in order to show you the first 10 rows of the result."
 148 |       ]
 149 |     },
 150 |     {
 151 |       "cell_type": "code",
 152 |       "execution_count": null,
 153 |       "metadata": {},
 154 |       "outputs": [],
 155 |       "source": [
 156 |         "ibis.options.interactive = True"
 157 |       ]
 158 |     },
 159 |     {
 160 |       "cell_type": "markdown",
 161 |       "metadata": {},
 162 |       "source": [
 163 |         "In interactive mode, we use `rich` to render the output inline:"
 164 |       ]
 165 |     },
 166 |     {
 167 |       "cell_type": "code",
 168 |       "execution_count": null,
 169 |       "metadata": {},
 170 |       "outputs": [],
 171 |       "source": [
 172 |         "flights.head()"
 173 |       ]
 174 |     },
 175 |     {
 176 |       "cell_type": "markdown",
 177 |       "metadata": {},
 178 |       "source": [
 179 |         "## Tables and columns\n",
 180 |         "\n",
 181 |         "`flights` is a table!  A table is a collection of one or more columns, each with a specific datatype."
 182 |       ]
 183 |     },
 184 |     {
 185 |       "cell_type": "code",
 186 |       "execution_count": null,
 187 |       "metadata": {},
 188 |       "outputs": [],
 189 |       "source": [
 190 |         "flights"
 191 |       ]
 192 |     },
 193 |     {
 194 |       "cell_type": "code",
 195 |       "execution_count": null,
 196 |       "metadata": {},
 197 |       "outputs": [],
 198 |       "source": [
 199 |         "type(flights)"
 200 |       ]
 201 |     },
 202 |     {
 203 |       "cell_type": "markdown",
 204 |       "metadata": {},
 205 |       "source": [
 206 |         "We can look at a single column of that table using the column name as an attribute:"
 207 |       ]
 208 |     },
 209 |     {
 210 |       "cell_type": "code",
 211 |       "execution_count": null,
 212 |       "metadata": {},
 213 |       "outputs": [],
 214 |       "source": [
 215 |         "flights.carrier"
 216 |       ]
 217 |     },
 218 |     {
 219 |       "cell_type": "markdown",
 220 |       "metadata": {},
 221 |       "source": [
 222 |         "What kind of column is `carrier`? It's a `StringColumn`!"
 223 |       ]
 224 |     },
 225 |     {
 226 |       "cell_type": "code",
 227 |       "execution_count": null,
 228 |       "metadata": {},
 229 |       "outputs": [],
 230 |       "source": [
 231 |         "type(flights.carrier)"
 232 |       ]
 233 |     },
 234 |     {
 235 |       "cell_type": "markdown",
 236 |       "metadata": {},
 237 |       "source": [
 238 |         "## Ibis \"verbs\", or, stuff you can do to a table\n",
 239 |         "\n",
 240 |         "The rest of this notebook covers some of the general methods you can use to alter the output of a particular table.\n",
 241 |         "\n",
 242 |         "We'll cover, in order, `filter`, `select`, `drop`, `mutate`, `order_by`, `aggregate`, and `group_by`. Time to dive in!\n",
 243 |         "\n",
 244 |         "## Filter\n",
 245 |         "\n",
 246 |         "A filter allows you to view a subset of the rows in a table, based on some condition.\n",
 247 |         "\n",
 248 |         "For instance, we might want to only view data for JetBlue flights:"
 249 |       ]
 250 |     },
 251 |     {
 252 |       "cell_type": "code",
 253 |       "execution_count": null,
 254 |       "metadata": {},
 255 |       "outputs": [],
 256 |       "source": [
 257 |         "flights.filter(flights.carrier == \"B6\")"
 258 |       ]
 259 |     },
 260 |     {
 261 |       "cell_type": "markdown",
 262 |       "metadata": {},
 263 |       "source": [
 264 |         "You can also combine multiple filters, across multiple columns.\n",
 265 |         "\n",
 266 |         "We can subset the data down to JetBlue flights from JFK:"
 267 |       ]
 268 |     },
 269 |     {
 270 |       "cell_type": "code",
 271 |       "execution_count": null,
 272 |       "metadata": {
 273 |         "scrolled": true
 274 |       },
 275 |       "outputs": [],
 276 |       "source": [
 277 |         "expr = flights.filter((flights.carrier == \"B6\") & (flights.origin == \"JFK\"))\n",
 278 |         "expr"
 279 |       ]
 280 |     },
 281 |     {
 282 |       "cell_type": "markdown",
 283 |       "metadata": {},
 284 |       "source": [
 285 |         "Above, we combined two filters using `&`. You can also pass them in as individual arguments:"
 286 |       ]
 287 |     },
 288 |     {
 289 |       "cell_type": "code",
 290 |       "execution_count": null,
 291 |       "metadata": {},
 292 |       "outputs": [],
 293 |       "source": [
 294 |         "expr = flights.filter(\n",
 295 |         "    flights.carrier == \"B6\",\n",
 296 |         "    flights.origin == \"JFK\",\n",
 297 |         ")\n",
 298 |         "expr"
 299 |       ]
 300 |     },
 301 |     {
 302 |       "cell_type": "code",
 303 |       "execution_count": null,
 304 |       "metadata": {},
 305 |       "outputs": [],
 306 |       "source": [
 307 |         "ibis.to_sql(expr)"
 308 |       ]
 309 |     },
 310 |     {
 311 |       "cell_type": "markdown",
 312 |       "metadata": {},
 313 |       "source": [
 314 |         "## Select\n",
 315 |         "\n",
 316 |         "Filter filters, Select selects (there's a pattern here).\n",
 317 |         "If you only want a subset of the columns in the original table, you can select\n",
 318 |         "those columns explicitly.\n",
 319 |         "\n",
 320 |         "You can refer to the columns using strings:"
 321 |       ]
 322 |     },
 323 |     {
 324 |       "cell_type": "code",
 325 |       "execution_count": null,
 326 |       "metadata": {},
 327 |       "outputs": [],
 328 |       "source": [
 329 |         "flights.select(\"carrier\", \"origin\", \"dest\")"
 330 |       ]
 331 |     },
 332 |     {
 333 |       "cell_type": "markdown",
 334 |       "metadata": {},
 335 |       "source": [
 336 |         "Or you can use explicit references to the `Column` objects:"
 337 |       ]
 338 |     },
 339 |     {
 340 |       "cell_type": "code",
 341 |       "execution_count": null,
 342 |       "metadata": {},
 343 |       "outputs": [],
 344 |       "source": [
 345 |         "flights.select(flights.carrier, flights.origin, flights.dest)"
 346 |       ]
 347 |     },
 348 |     {
 349 |       "cell_type": "markdown",
 350 |       "metadata": {},
 351 |       "source": [
 352 |         "Or you can mix and match:"
 353 |       ]
 354 |     },
 355 |     {
 356 |       "cell_type": "code",
 357 |       "execution_count": null,
 358 |       "metadata": {},
 359 |       "outputs": [],
 360 |       "source": [
 361 |         "flights.select(\"carrier\", \"origin\", flights.dest)"
 362 |       ]
 363 |     },
 364 |     {
 365 |       "cell_type": "markdown",
 366 |       "metadata": {},
 367 |       "source": [
 368 |         "## Drop\n",
 369 |         "\n",
 370 |         "Drop is nearly the same as Select, but rather than explicitly choosing the columns to display, we explicitly choose the columns to _not_ display.\n",
 371 |         "\n",
 372 |         "And as with `select`, you can specify the columns as strings:"
 373 |       ]
 374 |     },
 375 |     {
 376 |       "cell_type": "code",
 377 |       "execution_count": null,
 378 |       "metadata": {},
 379 |       "outputs": [],
 380 |       "source": [
 381 |         "flights.drop(\"flight\", \"tailnum\")"
 382 |       ]
 383 |     },
 384 |     {
 385 |       "cell_type": "markdown",
 386 |       "metadata": {},
 387 |       "source": [
 388 |         "Or you can use explicit references to the `Column` objects:"
 389 |       ]
 390 |     },
 391 |     {
 392 |       "cell_type": "code",
 393 |       "execution_count": null,
 394 |       "metadata": {},
 395 |       "outputs": [],
 396 |       "source": [
 397 |         "flights.drop(flights.flight, flights.tailnum)"
 398 |       ]
 399 |     },
 400 |     {
 401 |       "cell_type": "markdown",
 402 |       "metadata": {},
 403 |       "source": [
 404 |         "Or you can mix and match:"
 405 |       ]
 406 |     },
 407 |     {
 408 |       "cell_type": "code",
 409 |       "execution_count": null,
 410 |       "metadata": {},
 411 |       "outputs": [],
 412 |       "source": [
 413 |         "flights.drop(\"flight\", flights.tailnum)"
 414 |       ]
 415 |     },
 416 |     {
 417 |       "cell_type": "markdown",
 418 |       "metadata": {},
 419 |       "source": [
 420 |         "## Mutate\n",
 421 |         "\n",
 422 |         "Everything we've seen so far has been subtractive—removing rows or columns. What about _adding_ columns?\n",
 423 |         "\n",
 424 |         "That's what `mutate` is for! You can create a new column as a function of other existing columns (for example, converting units):"
 425 |       ]
 426 |     },
 427 |     {
 428 |       "cell_type": "code",
 429 |       "execution_count": null,
 430 |       "metadata": {},
 431 |       "outputs": [],
 432 |       "source": [
 433 |         "flights.mutate(distance_km=flights.distance * 1.609)"
 434 |       ]
 435 |     },
 436 |     {
 437 |       "cell_type": "markdown",
 438 |       "metadata": {},
 439 |       "source": [
 440 |         "Or you can create a new column and populate it with some literal value:"
 441 |       ]
 442 |     },
 443 |     {
 444 |       "cell_type": "code",
 445 |       "execution_count": null,
 446 |       "metadata": {},
 447 |       "outputs": [],
 448 |       "source": [
 449 |         "flights.mutate(my_favorite_number=ibis.literal(41))"
 450 |       ]
 451 |     },
 452 |     {
 453 |       "cell_type": "markdown",
 454 |       "metadata": {},
 455 |       "source": [
 456 |         "## On immutability\n",
 457 |         "\n",
 458 |         "We've filtered, selected, dropped, and mutated this `flights` table quite a bit."
 459 |       ]
 460 |     },
 461 |     {
 462 |       "cell_type": "code",
 463 |       "execution_count": null,
 464 |       "metadata": {},
 465 |       "outputs": [],
 466 |       "source": [
 467 |         "flights"
 468 |       ]
 469 |     },
 470 |     {
 471 |       "cell_type": "markdown",
 472 |       "metadata": {},
 473 |       "source": [
 474 |         "And yet, notice that none of our changes persist—the base table for our query isn't altered. The query (or expression) is a recipe of things to do with the base table (`flights`).\n",
 475 |         "\n",
 476 |         "If you want to keep an expression around, you can assign it to a variable:"
 477 |       ]
 478 |     },
 479 |     {
 480 |       "cell_type": "code",
 481 |       "execution_count": null,
 482 |       "metadata": {},
 483 |       "outputs": [],
 484 |       "source": [
 485 |         "expr = flights.select(\"carrier\", \"origin\")\n",
 486 |         "expr"
 487 |       ]
 488 |     },
 489 |     {
 490 |       "cell_type": "markdown",
 491 |       "metadata": {},
 492 |       "source": [
 493 |         "**Note**: Every time you execute an expression (via interactive mode, or `to_pandas`, or similar), the entire expression gets executed, starting from the base table.  DuckDB is very fast and this dataset is very small, so the delay is unnoticeable, but for very large datasets, it might become more pronounced.  There is functionality to `cache` intermediate results that isn't covered in this tutorial, but you can [read more about it in the docs](https://ibis-project.org/reference/expression-tables.html#ibis.expr.types.relations.Table.cache).\n",
 494 |         "\n",
 495 |         "## Method chaining\n",
 496 |         "\n",
 497 |         "You can build up complicated queries by chaining together Ibis methods. The output of many Ibis methods is a table (just like `flights`!) and we can continue calling table methods until we're satisfied. Or until we end up with something that _isn't_ a table. More on that later."
 498 |       ]
 499 |     },
 500 |     {
 501 |       "cell_type": "code",
 502 |       "execution_count": null,
 503 |       "metadata": {},
 504 |       "outputs": [],
 505 |       "source": [
 506 |         "flights.select(\"carrier\", \"origin\", \"dest\").drop(\"carrier\")"
 507 |       ]
 508 |     },
 509 |     {
 510 |       "cell_type": "markdown",
 511 |       "metadata": {},
 512 |       "source": [
 513 |         "Not the most complicated (or useful) query, but we'll see more soon.\n",
 514 |         "\n",
 515 |         "### Exercise 1\n",
 516 |         "\n",
 517 |         "Convert the `distance` column from miles to kilometers. For an approximate result, multiply by 1.609.\n",
 518 |         "\n",
 519 |         "Two ways you might accomplish this:\n",
 520 |         "\n",
 521 |         "- Chaining `.mutate` to create the new column and `.drop` to drop the original imperial column\n",
 522 |         "- Using a single `.select` to create the new column as well as select the remaining columns\n",
 523 |         "\n",
 524 |         "Try both ways below! How do they compare?"
 525 |       ]
 526 |     },
 527 |     {
 528 |       "cell_type": "code",
 529 |       "execution_count": null,
 530 |       "metadata": {},
 531 |       "outputs": [],
 532 |       "source": [
 533 |         "# Convert the imperial units to metric, and drop the imperial columns.\n",
 534 |         "# Try this using a `.mutate` and `.drop` call.\n",
 535 |         "flights_metric_mutate_drop = flights"
 536 |       ]
 537 |     },
 538 |     {
 539 |       "cell_type": "code",
 540 |       "execution_count": null,
 541 |       "metadata": {},
 542 |       "outputs": [],
 543 |       "source": [
 544 |         "# Convert the imperial units to metric, and drop the imperial columns.\n",
 545 |         "# Try this using a single `.select` call.\n",
 546 |         "flights_metric_select = flights"
 547 |       ]
 548 |     },
 549 |     {
 550 |       "cell_type": "markdown",
 551 |       "metadata": {},
 552 |       "source": [
 553 |         "#### Solutions"
 554 |       ]
 555 |     },
 556 |     {
 557 |       "cell_type": "code",
 558 |       "execution_count": null,
 559 |       "metadata": {},
 560 |       "outputs": [],
 561 |       "source": [
 562 |         "%load solutions/nb01_ex01_mutate_drop.py"
 563 |       ]
 564 |     },
 565 |     {
 566 |       "cell_type": "code",
 567 |       "execution_count": null,
 568 |       "metadata": {},
 569 |       "outputs": [],
 570 |       "source": [
 571 |         "%load solutions/nb01_ex01_select.py"
 572 |       ]
 573 |     },
 574 |     {
 575 |       "cell_type": "markdown",
 576 |       "metadata": {},
 577 |       "source": [
 578 |         "#### Does it matter which method you choose?\n",
 579 |         "\n",
 580 |         "In this case, no. Sometimes, there might be a small difference in the generated SQL, but they will be semantically equivalent."
 581 |       ]
 582 |     },
 583 |     {
 584 |       "cell_type": "code",
 585 |       "execution_count": null,
 586 |       "metadata": {},
 587 |       "outputs": [],
 588 |       "source": [
 589 |         "ibis.to_sql(flights_metric_mutate_drop)"
 590 |       ]
 591 |     },
 592 |     {
 593 |       "cell_type": "code",
 594 |       "execution_count": null,
 595 |       "metadata": {},
 596 |       "outputs": [],
 597 |       "source": [
 598 |         "ibis.to_sql(flights_metric_select)"
 599 |       ]
 600 |     },
 601 |     {
 602 |       "cell_type": "markdown",
 603 |       "metadata": {},
 604 |       "source": [
 605 |         "In practice, small differences in the generated SQL don't make a difference. Any modern SQL execution engine will optimize variations to the same set of operations, and there will be no measurable performance difference.\n",
 606 |         "\n",
 607 |         "## Order by\n",
 608 |         "\n",
 609 |         "Want to order your data by a given column or columns?  Use `order_by`!\n",
 610 |         "\n",
 611 |         "The default ordering direction is ascending:"
 612 |       ]
 613 |     },
 614 |     {
 615 |       "cell_type": "code",
 616 |       "execution_count": null,
 617 |       "metadata": {},
 618 |       "outputs": [],
 619 |       "source": [
 620 |         "flights.order_by(flights.distance)"
 621 |       ]
 622 |     },
 623 |     {
 624 |       "cell_type": "markdown",
 625 |       "metadata": {},
 626 |       "source": [
 627 |         "We can ask Ibis to sort in descending order, too."
 628 |       ]
 629 |     },
 630 |     {
 631 |       "cell_type": "code",
 632 |       "execution_count": null,
 633 |       "metadata": {
 634 |         "scrolled": true
 635 |       },
 636 |       "outputs": [],
 637 |       "source": [
 638 |         "flights.order_by(flights.distance.desc())"
 639 |       ]
 640 |     },
 641 |     {
 642 |       "cell_type": "markdown",
 643 |       "metadata": {},
 644 |       "source": [
 645 |         "Let's select out a subset of the columns to keep this a bit tidier."
 646 |       ]
 647 |     },
 648 |     {
 649 |       "cell_type": "code",
 650 |       "execution_count": null,
 651 |       "metadata": {},
 652 |       "outputs": [],
 653 |       "source": [
 654 |         "flights.order_by(flights.distance.desc()).select(\n",
 655 |         "    \"carrier\", \"origin\", \"dest\", \"distance\"\n",
 656 |         ")"
 657 |       ]
 658 |     },
 659 |     {
 660 |       "cell_type": "markdown",
 661 |       "metadata": {},
 662 |       "source": [
 663 |         "You can also call `ibis.desc` on the column name to set the order direction:"
 664 |       ]
 665 |     },
 666 |     {
 667 |       "cell_type": "code",
 668 |       "execution_count": null,
 669 |       "metadata": {},
 670 |       "outputs": [],
 671 |       "source": [
 672 |         "flights.order_by(ibis.desc(\"distance\")).select(\"carrier\", \"origin\", \"dest\", \"distance\")"
 673 |       ]
 674 |     },
 675 |     {
 676 |       "cell_type": "markdown",
 677 |       "metadata": {},
 678 |       "source": [
 679 |         "## Aggregate\n",
 680 |         "\n",
 681 |         "Ibis has several aggregate functions available to help summarize data.  All the old favorites are there: `mean`, `max`, `min`, `count`, `sum`...\n",
 682 |         "\n",
 683 |         "You can aggregate a column by calling the method on that column:"
 684 |       ]
 685 |     },
 686 |     {
 687 |       "cell_type": "code",
 688 |       "execution_count": null,
 689 |       "metadata": {},
 690 |       "outputs": [],
 691 |       "source": [
 692 |         "flights.distance.mean()"
 693 |       ]
 694 |     },
 695 |     {
 696 |       "cell_type": "markdown",
 697 |       "metadata": {},
 698 |       "source": [
 699 |         "Or you can compute multiple aggregates using the `aggregate` method (also\n",
 700 |         "available as `agg` for faster typing):"
 701 |       ]
 702 |     },
 703 |     {
 704 |       "cell_type": "code",
 705 |       "execution_count": null,
 706 |       "metadata": {},
 707 |       "outputs": [],
 708 |       "source": [
 709 |         "flights.agg([flights.distance.mean(), flights.air_time.min()])"
 710 |       ]
 711 |     },
 712 |     {
 713 |       "cell_type": "markdown",
 714 |       "metadata": {},
 715 |       "source": [
 716 |         "If you don't like the column names Ibis generates for you, choose your own!"
 717 |       ]
 718 |     },
 719 |     {
 720 |       "cell_type": "code",
 721 |       "execution_count": null,
 722 |       "metadata": {},
 723 |       "outputs": [],
 724 |       "source": [
 725 |         "flights.agg(\n",
 726 |         "    average_distance=flights.distance.mean(),\n",
 727 |         "    shortest_air_time=flights.air_time.min(),\n",
 728 |         ")"
 729 |       ]
 730 |     },
 731 |     {
 732 |       "cell_type": "markdown",
 733 |       "metadata": {},
 734 |       "source": [
 735 |         "But aggregates really shine when paired with a `group_by`!\n",
 736 |         "\n",
 737 |         "## Group by\n",
 738 |         "\n",
 739 |         "`group_by` creates groupings of rows that have the same value for one or more columns.\n",
 740 |         "\n",
 741 |         "But it doesn't do much on its own—you can pair it with `agg` to get a result."
 742 |       ]
 743 |     },
 744 |     {
 745 |       "cell_type": "code",
 746 |       "execution_count": null,
 747 |       "metadata": {},
 748 |       "outputs": [],
 749 |       "source": [
 750 |         "flights.group_by(\"carrier\").agg()"
 751 |       ]
 752 |     },
 753 |     {
 754 |       "cell_type": "markdown",
 755 |       "metadata": {},
 756 |       "source": [
 757 |         "Without any aggregate function specified, we get the distinct values of the grouped column.\n",
 758 |         "\n",
 759 |         "We can add a second column to the `group_by` to get the distinct pairs across both columns:"
 760 |       ]
 761 |     },
 762 |     {
 763 |       "cell_type": "code",
 764 |       "execution_count": null,
 765 |       "metadata": {},
 766 |       "outputs": [],
 767 |       "source": [
 768 |         "flights.group_by([\"carrier\", \"origin\"]).agg()"
 769 |       ]
 770 |     },
 771 |     {
 772 |       "cell_type": "markdown",
 773 |       "metadata": {},
 774 |       "source": [
 775 |         "Now, if we add an aggregation function to that, we start to really open things up."
 776 |       ]
 777 |     },
 778 |     {
 779 |       "cell_type": "code",
 780 |       "execution_count": null,
 781 |       "metadata": {},
 782 |       "outputs": [],
 783 |       "source": [
 784 |         "flights.group_by([\"carrier\", \"origin\"]).agg(flights.distance.mean())"
 785 |       ]
 786 |     },
 787 |     {
 788 |       "cell_type": "markdown",
 789 |       "metadata": {},
 790 |       "source": [
 791 |         "By adding that `mean` to the `aggregate`, we now have a concise way to calculate aggregates over each of the distinct groups in the `group_by`. And we can calculate as many aggregates as we need."
 792 |       ]
 793 |     },
 794 |     {
 795 |       "cell_type": "code",
 796 |       "execution_count": null,
 797 |       "metadata": {},
 798 |       "outputs": [],
 799 |       "source": [
 800 |         "flights.group_by([\"carrier\", \"origin\"]).agg(\n",
 801 |         "    [flights.distance.mean(), flights.air_time.min()]\n",
 802 |         ")"
 803 |       ]
 804 |     },
 805 |     {
 806 |       "cell_type": "markdown",
 807 |       "metadata": {},
 808 |       "source": [
 809 |         "If we need more specific groups, we can add to the `group_by`."
 810 |       ]
 811 |     },
 812 |     {
 813 |       "cell_type": "code",
 814 |       "execution_count": null,
 815 |       "metadata": {},
 816 |       "outputs": [],
 817 |       "source": [
 818 |         "flights.group_by([\"carrier\", \"origin\", \"dest\"]).agg(\n",
 819 |         "    [flights.distance.mean(), flights.air_time.min()]\n",
 820 |         ")"
 821 |       ]
 822 |     },
 823 |     {
 824 |       "cell_type": "markdown",
 825 |       "metadata": {},
 826 |       "source": [
 827 |         "## Cast\n",
 828 |         "\n",
 829 |         "Sometimes when you parse data, _especially_ from CSVs, the types get a bit messed up. Or you might be loading in a `parquet` file where everything is defined as a `string`. We can clean that up pretty quickly.\n",
 830 |         "\n",
 831 |         "You can cast from floats to ints:"
 832 |       ]
 833 |     },
 834 |     {
 835 |       "cell_type": "code",
 836 |       "execution_count": null,
 837 |       "metadata": {},
 838 |       "outputs": [],
 839 |       "source": [
 840 |         "(flights.distance * 1.609).cast(\"int32\")"
 841 |       ]
 842 |     },
 843 |     {
 844 |       "cell_type": "markdown",
 845 |       "metadata": {},
 846 |       "source": [
 847 |         "And from ints to floats:"
 848 |       ]
 849 |     },
 850 |     {
 851 |       "cell_type": "code",
 852 |       "execution_count": null,
 853 |       "metadata": {},
 854 |       "outputs": [],
 855 |       "source": [
 856 |         "flights.year.cast(\"float64\")  # this is a terrible idea"
 857 |       ]
 858 |     },
 859 |     {
 860 |       "cell_type": "markdown",
 861 |       "metadata": {},
 862 |       "source": [
 863 |         "You can cast numeric columns to strings:"
 864 |       ]
 865 |     },
 866 |     {
 867 |       "cell_type": "code",
 868 |       "execution_count": null,
 869 |       "metadata": {},
 870 |       "outputs": [],
 871 |       "source": [
 872 |         "flights.year.cast(\"str\")  # or \"string\""
 873 |       ]
 874 |     },
 875 |     {
 876 |       "cell_type": "markdown",
 877 |       "metadata": {},
 878 |       "source": [
 879 |         "And numeric strings to numbers:"
 880 |       ]
 881 |     },
 882 |     {
 883 |       "cell_type": "code",
 884 |       "execution_count": null,
 885 |       "metadata": {},
 886 |       "outputs": [],
 887 |       "source": [
 888 |         "flights.year.cast(\"str\").cast(\"int64\")"
 889 |       ]
 890 |     },
 891 |     {
 892 |       "cell_type": "markdown",
 893 |       "metadata": {},
 894 |       "source": [
 895 |         "But Ibis will yell if you try to cast a non-numeric string to a number:"
 896 |       ]
 897 |     },
 898 |     {
 899 |       "cell_type": "code",
 900 |       "execution_count": null,
 901 |       "metadata": {},
 902 |       "outputs": [],
 903 |       "source": [
 904 |         "flights.carrier.cast(\"int32\")"
 905 |       ]
 906 |     },
 907 |     {
 908 |       "cell_type": "markdown",
 909 |       "metadata": {},
 910 |       "source": [
 911 |         "If we know that a column _should_ have a particular data type, but don't want a few bad apples (rows) to spoil the bunch, `try_cast` will fall back to `NULL` or `NaN` for values where the cast fails:"
 912 |       ]
 913 |     },
 914 |     {
 915 |       "cell_type": "code",
 916 |       "execution_count": null,
 917 |       "metadata": {},
 918 |       "outputs": [],
 919 |       "source": [
 920 |         "flights.arr_delay.try_cast(int)"
 921 |       ]
 922 |     },
 923 |     {
 924 |       "cell_type": "markdown",
 925 |       "metadata": {},
 926 |       "source": [
 927 |         "## Drop NA\n",
 928 |         "\n",
 929 |         "Does what it says on the box—drop the `NULL`s from a dataset."
 930 |       ]
 931 |     },
 932 |     {
 933 |       "cell_type": "code",
 934 |       "execution_count": null,
 935 |       "metadata": {},
 936 |       "outputs": [],
 937 |       "source": [
 938 |         "flights.dropna()"
 939 |       ]
 940 |     },
 941 |     {
 942 |       "cell_type": "markdown",
 943 |       "metadata": {},
 944 |       "source": [
 945 |         "## Exercises\n",
 946 |         "\n",
 947 |         "Time to use what we've learned to answer some flight questions.\n",
 948 |         "\n",
 949 |         "### Exercise 2\n",
 950 |         "\n",
 951 |         "Which airlines (`carrier`) had the longest average arrival delays (`arr_delay`) in June 2013?\n",
 952 |         "\n",
 953 |         "#### Solution\n",
 954 |         "\n",
 955 |         "Note that there are several ways these queries could be written—it's fine if your solution doesn't look like ours, as long as the results are the same."
 956 |       ]
 957 |     },
 958 |     {
 959 |       "cell_type": "code",
 960 |       "execution_count": null,
 961 |       "metadata": {},
 962 |       "outputs": [],
 963 |       "source": [
 964 |         "%load solutions/nb01_ex02.py"
 965 |       ]
 966 |     },
 967 |     {
 968 |       "cell_type": "markdown",
 969 |       "metadata": {},
 970 |       "source": [
 971 |         "### Exercise 3\n",
 972 |         "\n",
 973 |         "Which NYC airport has the lowest percentage of outbound flights arriving 30 or more minutes late?\n",
 974 |         "\n",
 975 |         "#### Solution"
 976 |       ]
 977 |     },
 978 |     {
 979 |       "cell_type": "code",
 980 |       "execution_count": null,
 981 |       "metadata": {},
 982 |       "outputs": [],
 983 |       "source": [
 984 |         "%load solutions/nb01_ex03.py"
 985 |       ]
 986 |     },
 987 |     {
 988 |       "cell_type": "markdown",
 989 |       "metadata": {},
 990 |       "source": [
 991 |         "## A brief digression on the SQL Ibis generates\n",
 992 |         "\n",
 993 |         "Maybe you've heard that SQL has a standard?  This is true, and also misleading. The SQL standard is more of a suggestion, and there are myriad SQL _dialects_.\n",
 994 |         "\n",
 995 |         "Ibis compiles expressions into the appropriate SQL dialect for the backend you are using. In this case, we started with a DuckDB table, so we get DuckDB SQL:"
 996 |       ]
 997 |     },
 998 |     {
 999 |       "cell_type": "code",
1000 |       "execution_count": null,
1001 |       "metadata": {},
1002 |       "outputs": [],
1003 |       "source": [
1004 |         "ibis.to_sql(flights_metric_mutate_drop)"
1005 |       ]
1006 |     },
1007 |     {
1008 |       "cell_type": "markdown",
1009 |       "metadata": {},
1010 |       "source": [
1011 |         "But if you want to use a _different_ dialect, you can pass the dialect name:"
1012 |       ]
1013 |     },
1014 |     {
1015 |       "cell_type": "code",
1016 |       "execution_count": null,
1017 |       "metadata": {},
1018 |       "outputs": [],
1019 |       "source": [
1020 |         "ibis.to_sql(flights_metric_mutate_drop, dialect=\"postgres\")"
1021 |       ]
1022 |     },
1023 |     {
1024 |       "cell_type": "markdown",
1025 |       "metadata": {},
1026 |       "source": [
1027 |         "## Join\n",
1028 |         "\n",
1029 |         "No dataframe library is complete without joins! Ibis supports several kinds of joins between table expressions: `inner_join`, `cross_join`, `left_join`, `outer_join`, `semi_join`, and `anti_join`. The `join` table method is, by default, the same as `inner_join`.\n",
1030 |         "\n",
1031 |         "Remember the other table in our database?"
1032 |       ]
1033 |     },
1034 |     {
1035 |       "cell_type": "code",
1036 |       "execution_count": null,
1037 |       "metadata": {},
1038 |       "outputs": [],
1039 |       "source": [
1040 |         "weather = con.table(\"weather\")\n",
1041 |         "weather"
1042 |       ]
1043 |     },
1044 |     {
1045 |       "cell_type": "markdown",
1046 |       "metadata": {},
1047 |       "source": [
1048 |         "We can join the two tables on the `origin` column:"
1049 |       ]
1050 |     },
1051 |     {
1052 |       "cell_type": "code",
1053 |       "execution_count": null,
1054 |       "metadata": {},
1055 |       "outputs": [],
1056 |       "source": [
1057 |         "flights.join(weather, \"origin\")"
1058 |       ]
1059 |     },
1060 |     {
1061 |       "cell_type": "markdown",
1062 |       "metadata": {},
1063 |       "source": [
1064 |         "Of course, we should only join on the weather at the time corresponding to each flight:"
1065 |       ]
1066 |     },
1067 |     {
1068 |       "cell_type": "code",
1069 |       "execution_count": null,
1070 |       "metadata": {},
1071 |       "outputs": [],
1072 |       "source": [
1073 |         "flights.join(weather, [\"origin\", \"time_hour\"])"
1074 |       ]
1075 |     },
1076 |     {
1077 |       "cell_type": "markdown",
1078 |       "metadata": {},
1079 |       "source": [
1080 |         "The `on` condition can also be specified as an expression, which is particularly useful if you have columns with different names or non-equi-join logic."
1081 |       ]
1082 |     },
1083 |     {
1084 |       "cell_type": "code",
1085 |       "execution_count": null,
1086 |       "metadata": {},
1087 |       "outputs": [],
1088 |       "source": [
1089 |         "renamed = weather.rename(location=\"origin\")\n",
1090 |         "flights.join(\n",
1091 |         "    renamed,\n",
1092 |         "    (flights.origin != renamed.location) & (flights.time_hour == renamed.time_hour),\n",
1093 |         ")"
1094 |       ]
1095 |     },
1096 |     {
1097 |       "cell_type": "markdown",
1098 |       "metadata": {},
1099 |       "source": [
1100 |         "The Ibis `join` syntax is quite expressive, so we won't cover all the variations now; for more examples, read the [docs](https://ibis-project.org/reference/expression-tables#ibis.expr.types.relations.Table.join).\n",
1101 |         "\n",
1102 |         "Before we move on, let's terminate the DuckDB connection for good measure. If you skip this step, you may run into an error later on in the tutorial:\n",
1103 |         "\n",
1104 |         "    IO Error: Could not set lock on file \"/workspaces/kedro-ibis-tutorial/nycflights13.ddb\": Conflicting lock is held in /usr/local/bin/python3.11 (PID 1234). However, you would be able to open this database in read-only mode, e.g. by using the -readonly parameter in the CLI. See also https://duckdb.org/docs/connect/concurrency"
1105 |       ]
1106 |     },
1107 |     {
1108 |       "cell_type": "code",
1109 |       "execution_count": null,
1110 |       "metadata": {},
1111 |       "outputs": [],
1112 |       "source": [
1113 |         "con.disconnect()"
1114 |       ]
1115 |     }
1116 |   ],
1117 |   "metadata": {
1118 |     "kernelspec": {
1119 |       "display_name": "Python 3 (ipykernel)",
1120 |       "language": "python",
1121 |       "name": "python3"
1122 |     },
1123 |     "language_info": {
1124 |       "codemirror_mode": {
1125 |         "name": "ipython",
1126 |         "version": 3
1127 |       },
1128 |       "file_extension": ".py",
1129 |       "mimetype": "text/x-python",
1130 |       "name": "python",
1131 |       "nbconvert_exporter": "python",
1132 |       "pygments_lexer": "ipython3",
1133 |       "version": "3.11.9"
1134 |     }
1135 |   },
1136 |   "nbformat": 4,
1137 |   "nbformat_minor": 4
1138 | }
1139 | 


--------------------------------------------------------------------------------