├── tests ├── __init__.py ├── tpch │ ├── __init__.py │ ├── test_optimization.py │ ├── README.md │ ├── plotting.py │ ├── generate_answers.py │ ├── utils.py │ ├── visualize.ipynb │ ├── test_dask.py │ ├── test_correctness.py │ └── test_duckdb.py ├── benchmarks │ ├── __init__.py │ ├── test_csv.py │ ├── test_custom.py │ ├── test_zarr.py │ ├── test_futures.py │ ├── test_spill.py │ ├── test_dataframe.py │ ├── test_join.py │ ├── test_work_stealing.py │ ├── test_rechunk.py │ ├── test_parquet.py │ ├── test_h2o.py │ └── test_xarray.py ├── runtime │ ├── __init__.py │ ├── test_cluster_creation.py │ ├── test_build.py │ └── test_xgboost.py ├── stability │ ├── __init__.py │ ├── test_install_plugins.py │ ├── test_array.py │ └── test_deadlock.py ├── workflows │ ├── __init__.py │ ├── test_uber_lyft.py │ ├── test_xgboost_optuna.py │ ├── test_embarrassingly_parallel.py │ ├── test_snowflake.py │ └── test_from_csv_to_parquet.py ├── geospatial │ ├── workloads │ │ ├── __init__.py │ │ ├── rechunking.py │ │ ├── zonal_average.py │ │ ├── regridding.py │ │ ├── atmospheric_circulation.py │ │ ├── cloud_optimize.py │ │ ├── satellite_filtering.py │ │ └── climatology.py │ ├── utils.py │ ├── test_cloud_optimize.py │ ├── test_rechunking.py │ ├── test_regridding.py │ ├── test_atmospheric_circulation.py │ ├── test_satellite_filtering.py │ ├── test_zonal_average.py │ └── test_climatology.py └── test_utils_test.py ├── alembic ├── README ├── versions │ ├── 2d2405ad763b_drop_tpc_h_data.py │ ├── 1095dfdfc4ae_add_column_for_memray_profiles_url.py │ ├── aa1fc9fdc665_add_column_for_py_spy_profiles_url.py │ ├── 25053f75e09f_add_dask_expr_version_tracking_migration.py │ ├── 912c8e30690a_remove_tests_in_test_shuffle_py.py │ ├── 924e9b1430e1_spark_test_bankruptcy.py │ ├── 149d2048065b_add_default_parameter_to_historical_.py │ ├── a97d9375430f_default_parameter_for_test_dataframe_py_.py │ ├── fa79471ffa8c_declare_bankruptcy_for_test_futures_py.py │ ├── 2764a4f5582b_declare_bankruptcy_for_cluster_startup_.py │ ├── 9813b7160e69_parametrize_test_large_map.py │ ├── 9d6f8ea24ee1_move_h2o_tests.py │ ├── f459b2c61eaf_remove_non_upstream_historical_data.py │ ├── 59c5cc87c066_drop_outdated_rechunking_data.py │ ├── 87cbf883c2be_update_tpch_refactor_from_1094.py │ ├── b0e8d5f3295d_update_test_tpch_tpch_test_dask_from_.py │ ├── 2381a77e8487_zarr.py │ ├── d58983739401_default_parameter_for_test_rechunk_in_.py │ ├── a9363331e323_clean_h2o_tests_with_removed_shuffle_.py │ ├── e11cd1aaed38_add_cluster_spec_to_db.py │ ├── a8785a7b3cae_add_entry_to_database_for_cluster_name_.py │ ├── 778e617a2886_merge_xarray_reduction_with_quadratic_.py │ ├── 24749594f367_add_prometheus_metrics.py │ ├── 1c2fe9d527e4_expand_rechunk_parameters.py │ ├── 4ee0e23d96da_compressible_variant_of_tests.py │ ├── 967e298408ed_test_spill.py │ ├── 78c6e00fee88_remove_task_based_shuffle.py │ ├── c38b9d85915e_default_parameter_for_shuffling_tests.py │ ├── 7d7844fca7cf_initial_table.py │ └── 00d5844fd364_add_tpch_run_table.py ├── script.py.mako └── env.py ├── ci ├── condarc ├── environment-geospatial.yml ├── environment-git-tip.yml ├── environment-snowflake.yml ├── environment-dashboard.yml ├── environment-tpch-nondask.yml ├── environment-test.yml ├── scripts │ ├── dask_config_to_env.py │ ├── combine-dbs.sh │ └── discover_ab_environments.py └── environment.yml ├── AB_environments ├── AB_baseline.cluster.yaml ├── AB_baseline.dask.yaml ├── AB_sample.dask.yaml ├── AB_sample.cluster.yaml ├── make_envs.py ├── AB_baseline.conda.yaml ├── config.yaml └── AB_sample.conda.yaml ├── .github └── 
workflows │ ├── lint.yml │ ├── geospatial.yml │ └── tpch.yml ├── .pre-commit-config.yaml ├── setup.cfg ├── LICENSE ├── plugins.py ├── .gitignore ├── cluster_kwargs.yaml ├── alembic.ini ├── benchmark_schema.py └── detect_regressions.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tpch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stability/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /ci/condarc: -------------------------------------------------------------------------------- 1 | auto_activate_base: false 2 | remote_backoff_factor: 20 3 | remote_connect_timeout_secs: 20.0 4 | remote_max_retries: 10 5 | remote_read_timeout_secs: 60.0 6 | -------------------------------------------------------------------------------- /ci/environment-geospatial.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml. 2 | # Add dependencies exclusively needed to run geospatial tests. 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | - memray ==1.13.4 7 | - pip: 8 | - git+https://github.com/pydata/xarray 9 | -------------------------------------------------------------------------------- /AB_environments/AB_baseline.cluster.yaml: -------------------------------------------------------------------------------- 1 | # Special environment file for A/B testing, used to define cluster creation options for 2 | # the baseline environment. 3 | # Change contents, but do not rename. 4 | 5 | # Overrides ../cluster_kwargs.yaml. 6 | # Leave empty if you don't want to override anything. 7 | -------------------------------------------------------------------------------- /ci/environment-git-tip.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml, which upgrades dask to the git tip. 
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - pip
6 |   - pip:
7 |     - git+https://github.com/dask/dask
8 |     - git+https://github.com/dask/distributed
9 |     - git+https://github.com/dask/zict
10 | 
--------------------------------------------------------------------------------
/AB_environments/AB_baseline.dask.yaml:
--------------------------------------------------------------------------------
1 | # Special environment file for A/B testing, used to define dask config options
2 | # (overriding the built-in config) for the baseline environment.
3 | # Change contents, but do not rename.
4 | # Leave empty if you don't want to override anything.
5 | dask:
6 |   dataframe:
7 |     query-planning: True
8 | 
--------------------------------------------------------------------------------
/ci/environment-snowflake.yml:
--------------------------------------------------------------------------------
1 | # This is an addition to ci/environment.yml.
2 | # Add dask-snowflake and downgrade some pinned dependencies.
3 | channels:
4 |   - conda-forge
5 | dependencies:
6 |   - pip
7 |   - snowflake-connector-python ==3.12.2
8 |   - snowflake-sqlalchemy ==1.6.1
9 |   - pip:
10 |     - git+https://github.com/coiled/dask-snowflake
11 | 
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Linting
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 | 
9 | jobs:
10 |   pre-commit:
11 |     name: pre-commit hooks
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 |       - uses: actions/setup-python@v5
16 |       - uses: pre-commit/action@v3.0.1
17 | 
--------------------------------------------------------------------------------
/tests/geospatial/utils.py:
--------------------------------------------------------------------------------
1 | import xarray as xr
2 | 
3 | 
4 | def load_era5() -> xr.Dataset:
5 |     return xr.open_zarr(
6 |         "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
7 |         chunks={
8 |             "longitude": "auto",
9 |             "latitude": "auto",
10 |             "levels": "auto",
11 |             "time": "auto",
12 |         },
13 |     )
14 | 
--------------------------------------------------------------------------------
/AB_environments/AB_sample.dask.yaml:
--------------------------------------------------------------------------------
1 | # Sample dask config file for A/B testing.
2 | # Change contents/delete/rename as needed.
3 | 
4 | # Every A/B environment *must* present these three files:
5 | # - AB_<name>.conda.yaml
6 | # - AB_<name>.dask.yaml
7 | # - AB_<name>.cluster.yaml
8 | 
9 | # Leave empty if you don't want to override anything.
10 | 
11 | # distributed:
12 | #   scheduler:
13 | #     worker-saturation: 1.2
14 | dask:
15 |   dataframe:
16 |     query-planning: True
17 | 
--------------------------------------------------------------------------------
/AB_environments/AB_sample.cluster.yaml:
--------------------------------------------------------------------------------
1 | # Sample cluster creation options file for A/B testing.
2 | # Change contents/delete/rename as needed.
3 | 
4 | # Every A/B environment *must* present these three files:
5 | # - AB_<name>.conda.yaml
6 | # - AB_<name>.dask.yaml
7 | # - AB_<name>.cluster.yaml
8 | 
9 | # Overrides ../cluster_kwargs.yaml.
10 | # Leave empty if you don't want to override anything.
11 | 12 | # small_cluster: 13 | # n_workers: 5 14 | # worker_vm_types: [m6i.xlarge] # 4CPU, 16GiB 15 | -------------------------------------------------------------------------------- /ci/environment-dashboard.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python =3.10 5 | - sqlalchemy =2.0 6 | - altair =5.0 7 | - bokeh =3.2 8 | - panel =1.2 9 | - pandas =2.0 10 | - tabulate =0.9 11 | 12 | # These imports are only needed to parse the source code of the tests and embed it in 13 | # the dashboard 14 | - coiled 15 | - conda 16 | - dask 17 | - dask-ml 18 | - distributed 19 | - filelock 20 | - optuna 21 | - pytest 22 | - s3fs 23 | - xarray 24 | - xgboost 25 | - zarr 26 | -------------------------------------------------------------------------------- /alembic/versions/2d2405ad763b_drop_tpc_h_data.py: -------------------------------------------------------------------------------- 1 | """Drop TPC-H data 2 | 3 | Revision ID: 2d2405ad763b 4 | Revises: 59c5cc87c066 5 | Create Date: 2024-08-15 13:54:45.251458 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '2d2405ad763b' 14 | down_revision = '59c5cc87c066' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute("delete from tpch_run") 21 | 22 | 23 | def downgrade() -> None: 24 | pass 25 | -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /ci/environment-tpch-nondask.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml. 2 | # Add dependencies exclusively needed to run TPCH tests on dask competitors. 
3 | channels: 4 | - conda-forge 5 | dependencies: 6 | # PySpark 7 | # See https://spark.apache.org/docs/latest/api/python/getting_started/install.html#dependencies 8 | - pyspark ==3.4.1 # FIXME https://github.com/coiled/benchmarks/issues/1221 9 | - openjdk ~=11.0 # Do not upgrade 10 | - grpcio ==1.61.1 11 | - grpcio-status ==1.60.1 # FIXME https://github.com/coiled/benchmarks/issues/1221 12 | - protobuf ==4.25.2 13 | 14 | # Other TPCH tests 15 | - polars ==0.20.13 16 | -------------------------------------------------------------------------------- /alembic/versions/1095dfdfc4ae_add_column_for_memray_profiles_url.py: -------------------------------------------------------------------------------- 1 | """Add column for Memray profiles url 2 | 3 | Revision ID: 1095dfdfc4ae 4 | Revises: 2d2405ad763b 5 | Create Date: 2024-10-23 11:11:15.238042 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1095dfdfc4ae' 14 | down_revision = '2d2405ad763b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('memray_profiles_url', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column("test_run", "memray_profiles_url") 25 | -------------------------------------------------------------------------------- /alembic/versions/aa1fc9fdc665_add_column_for_py_spy_profiles_url.py: -------------------------------------------------------------------------------- 1 | """Add column for py-spy profiles url 2 | 3 | Revision ID: aa1fc9fdc665 4 | Revises: 1095dfdfc4ae 5 | Create Date: 2024-10-23 16:11:24.794416 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'aa1fc9fdc665' 14 | down_revision = '1095dfdfc4ae' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('py_spy_profiles_url', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column("test_run", "py_spy_profiles_url") 25 | -------------------------------------------------------------------------------- /ci/environment-test.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to either ci/environment.yml or AB_environments/AB_*.conda.yaml, 2 | # adding dependencies specific to this test suite. 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | # Testing dependencies 7 | - alembic 8 | - altair 9 | - conda 10 | - filelock 11 | - jinja2 12 | - packaging 13 | - pytest 14 | - pytest-timeout 15 | - pytest-xdist 16 | - python-dotenv 17 | - pyyaml 18 | # TPC-H correctness test and DuckDB implementation 19 | # Can add duckdb back to conda install after: 20 | # https://github.com/coiled/benchmarks/issues/1418 21 | # python-duckdb ==0.10.0 22 | - pip 23 | - pip: 24 | - duckdb==0.10.0 -------------------------------------------------------------------------------- /alembic/versions/25053f75e09f_add_dask_expr_version_tracking_migration.py: -------------------------------------------------------------------------------- 1 | """Add dask-expr version tracking migration 2 | 3 | Revision ID: 25053f75e09f 4 | Revises: 24749594f367 5 | Create Date: 2024-02-26 08:04:47.704600 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '25053f75e09f' 14 | down_revision = '24749594f367' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('dask_expr_version', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column('test_run', 'dask_expr_version') 25 | -------------------------------------------------------------------------------- /tests/geospatial/test_cloud_optimize.py: -------------------------------------------------------------------------------- 1 | from tests.geospatial.workloads.cloud_optimize import cloud_optimize 2 | 3 | 4 | def test_cloud_optimize( 5 | scale, 6 | s3, 7 | s3_url, 8 | setup_benchmark, 9 | cluster_kwargs={ 10 | "workspace": "dask-benchmarks", 11 | "region": "us-west-2", 12 | }, 13 | scale_kwargs={ 14 | "small": {"n_workers": 10}, 15 | "medium": {"n_workers": 100}, 16 | "large": {"n_workers": 200}, 17 | }, 18 | ): 19 | with setup_benchmark( 20 | **scale_kwargs[scale], **cluster_kwargs 21 | ) as benchmark: # noqa: F841 22 | benchmark(cloud_optimize, scale, fs=s3, storage_url=s3_url) 23 | -------------------------------------------------------------------------------- /alembic/versions/912c8e30690a_remove_tests_in_test_shuffle_py.py: -------------------------------------------------------------------------------- 1 | """Remove tests in test_shuffle.py 2 | 3 | Revision ID: 912c8e30690a 4 | Revises: c38b9d85915e 5 | Create Date: 2023-01-03 20:21:06.704816 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '912c8e30690a' 14 | down_revision = 'c38b9d85915e' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(""" 21 | delete from test_run 22 | where originalname in ('test_shuffle_simple', 'test_shuffle_parquet'); 23 | """) 24 | 25 | 26 | def downgrade() -> None: 27 | pass 28 | -------------------------------------------------------------------------------- /alembic/versions/924e9b1430e1_spark_test_bankruptcy.py: -------------------------------------------------------------------------------- 1 | """spark test bankruptcy 2 | 3 | Revision ID: 924e9b1430e1 4 | Revises: 7d7844fca7cf 5 | Create Date: 2022-09-12 09:49:32.494687 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = "924e9b1430e1" 13 | down_revision = "7d7844fca7cf" 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | delete from test_run 22 | where originalname = 'test_read_spark_generated_data' 23 | and path = 'benchmarks/test_parquet.py'; 24 | """ 25 | ) 26 | 27 | 28 | def downgrade() -> None: 29 | pass 30 | -------------------------------------------------------------------------------- /alembic/versions/149d2048065b_add_default_parameter_to_historical_.py: -------------------------------------------------------------------------------- 1 | """Add default parameter to historical test_basic_sum 2 | 3 | Revision ID: 149d2048065b 4 | Revises: a8785a7b3cae 5 | Create Date: 2022-10-18 15:18:00.603726 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '149d2048065b' 12 | down_revision = 'a8785a7b3cae' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade() -> None: 18 | op.execute( 19 | """ 20 | update test_run 21 | set name = 'test_basic_sum[fast-thin]' 22 | where name == 'test_basic_sum'; 23 | """ 24 | ) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/a97d9375430f_default_parameter_for_test_dataframe_py_.py: -------------------------------------------------------------------------------- 1 | """Default parameter for test_dataframe.py::test_shuffle 2 | 3 | Revision ID: a97d9375430f 4 | Revises: 967e298408ed 5 | Create Date: 2023-01-03 19:36:30.469391 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a97d9375430f' 14 | down_revision = '967e298408ed' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(f""" 21 | update test_run 22 | set name = 'test_shuffle[tasks]' 23 | where name == 'test_shuffle'; 24 | """) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/fa79471ffa8c_declare_bankruptcy_for_test_futures_py.py: -------------------------------------------------------------------------------- 1 | """Declare bankruptcy for test_futures.py 2 | 3 | Revision ID: fa79471ffa8c 4 | Revises: 149d2048065b 5 | Create Date: 2022-10-19 16:21:21.871309 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'fa79471ffa8c' 12 | down_revision = '149d2048065b' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade() -> None: 18 | op.execute( 19 | """ 20 | delete from test_run 21 | where originalname in ('test_single_future', 'test_memory_efficient') 22 | and path = 'benchmarks/test_futures.py'; 23 | """ 24 | ) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/2764a4f5582b_declare_bankruptcy_for_cluster_startup_.py: -------------------------------------------------------------------------------- 1 | """Declare bankruptcy for cluster startup time 2 | 3 | Revision ID: 2764a4f5582b 4 | Revises: 924e9b1430e1 5 | Create Date: 2022-09-14 11:45:46.024184 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "2764a4f5582b" 14 | down_revision = "924e9b1430e1" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where originalname = 'test_default_cluster_spinup_time' 24 | and path = 'benchmarks/test_coiled.py'; 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | pass 31 | -------------------------------------------------------------------------------- /alembic/versions/9813b7160e69_parametrize_test_large_map.py: -------------------------------------------------------------------------------- 1 | """Parametrize test_large_map 2 | 3 | Revision ID: 9813b7160e69 4 | Revises: f459b2c61eaf 5 | Create Date: 2023-07-05 11:04:08.510205 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = '9813b7160e69' 13 | down_revision = 'f459b2c61eaf' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | update test_run 22 | set name = 'test_large_map[rootish]' 23 | where name == 'test_large_map'; 24 | """ 25 | ) 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/9d6f8ea24ee1_move_h2o_tests.py: -------------------------------------------------------------------------------- 1 | """Move h2o tests 2 | 3 | Revision ID: 9d6f8ea24ee1 4 | Revises: a97d9375430f 5 | Create Date: 2023-01-13 14:29:22.118276 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '9d6f8ea24ee1' 13 | down_revision = 'a97d9375430f' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | update test_run 22 | set path = 'benchmarks/test_h2o.py' 23 | where path = 'benchmarks/h2o/test_h2o_benchmarks.py'; 24 | """ 25 | ) 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | pass 31 | # ### end Alembic commands ### 32 | -------------------------------------------------------------------------------- /tests/benchmarks/test_csv.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pandas as pd 3 | 4 | from ..utils_test import run_up_to_nthreads 5 | 6 | 7 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 8 | def test_csv_basic(small_client): 9 | ddf = dd.read_csv( 10 | "s3://coiled-runtime-ci/nyc-tlc/yellow_tripdata_2019_csv/yellow_tripdata_2019-*.csv", 11 | dtype={ 12 | "payment_type": "UInt8", 13 | "VendorID": "UInt8", 14 | "passenger_count": "UInt8", 15 | "RatecodeID": "UInt8", 16 | }, 17 | blocksize="16 MiB", 18 | ).persist() 19 | 20 | result = ddf.groupby("passenger_count").tip_amount.mean().compute() 21 | 22 | assert isinstance(result, pd.Series) 23 | assert not result.empty 24 | -------------------------------------------------------------------------------- /alembic/versions/f459b2c61eaf_remove_non_upstream_historical_data.py: -------------------------------------------------------------------------------- 1 | """Remove non-upstream historical data 2 | 3 | Revision ID: f459b2c61eaf 4 | Revises: 4ee0e23d96da 5 | Create Date: 2023-05-23 10:39:13.056358 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = 'f459b2c61eaf' 13 | down_revision = '4ee0e23d96da' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | delete from test_run 22 | where ( 23 | coiled_runtime_version <> 'upstream' 24 | and coiled_runtime_version not like 'AB_%' 25 | ) 26 | or python_version like '3.8%'; 27 | """ 28 | ) 29 | 30 | 31 | def downgrade() -> None: 32 | pass 33 | -------------------------------------------------------------------------------- /tests/runtime/test_cluster_creation.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | from coiled import Cluster 4 | 5 | 6 | def test_default_cluster_spinup_time( 7 | benchmark_time, github_cluster_tags, get_cluster_info 8 | ): 9 | """Note: this test must be kept in a separate module from the tests that use the 10 | small_cluster fixture (which has the scope=module) or its child small_client. 11 | This prevents having the small_cluster sitting idle for 5+ minutes while this test 12 | is running. 13 | """ 14 | with benchmark_time: 15 | with Cluster( 16 | name=f"test_default_cluster_spinup_time-{uuid.uuid4().hex[:8]}", 17 | n_workers=1, 18 | tags=github_cluster_tags, 19 | ) as cluster: 20 | with get_cluster_info(cluster): 21 | pass 22 | -------------------------------------------------------------------------------- /alembic/versions/59c5cc87c066_drop_outdated_rechunking_data.py: -------------------------------------------------------------------------------- 1 | """Drop outdated rechunking data 2 | 3 | Revision ID: 59c5cc87c066 4 | Revises: e11cd1aaed38 5 | Create Date: 2024-08-16 15:16:27.114045 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '59c5cc87c066' 14 | down_revision = 'e11cd1aaed38' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where originalname in ( 24 | 'test_adjacent_groups', 25 | 'test_heal_oversplit', 26 | 'test_swap_axes', 27 | 'test_tiles_to_rows' 28 | ) 29 | """ 30 | ) 31 | 32 | 33 | def downgrade() -> None: 34 | pass 35 | -------------------------------------------------------------------------------- /tests/geospatial/test_rechunking.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.rechunking import era5_rechunking 4 | 5 | 6 | def test_era5_rechunking( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | }, 14 | scale_kwargs={ 15 | "small": {"n_workers": 10}, 16 | "medium": {"n_workers": 100}, 17 | "large": {"n_workers": 100}, 18 | }, 19 | ): 20 | with setup_benchmark( 21 | **scale_kwargs[scale], **cluster_kwargs 22 | ) as benchmark: # noqa: F841 23 | benchmark( 24 | era5_rechunking, 25 | scale=scale, 26 | storage_url=gcs_url, 27 | storage_options={"token": CoiledShippedCredentials()}, 28 | ) 29 | -------------------------------------------------------------------------------- /tests/geospatial/test_regridding.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.regridding import xesmf 4 | 5 | 6 | def test_xesmf( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | "wait_for_workers": True, 14 | }, 15 | scale_kwargs={ 16 | "small": {"n_workers": 10}, 17 | "medium": {"n_workers": 10}, 18 | "large": {"n_workers": 10}, 19 | }, 20 | ): 21 | with setup_benchmark( 22 | **scale_kwargs[scale], **cluster_kwargs 23 | ) as benchmark: # noqa: F841 24 | benchmark( 25 | xesmf, 26 | scale=scale, 27 | storage_url=gcs_url, 28 | storage_options={"token": CoiledShippedCredentials()}, 29 | ) 30 | -------------------------------------------------------------------------------- /tests/stability/test_install_plugins.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from distributed import PipInstall 5 | 6 | from ..utils_test import wait 7 | 8 | 9 | @pytest.mark.parametrize("restart_workers", [True, False]) 10 | def test_private_pip_install(small_client, restart_workers): 11 | small_client.cluster.send_private_envs( 12 | {"PYTHON_STUB_TOKEN": os.environ["PYTHON_STUB_PAT"]} 13 | ) 14 | 15 | plugin = PipInstall( 16 | packages=[ 17 | "python_stub@git+https://${PYTHON_STUB_TOKEN}@github.com/coiled/python-stub.git" 18 | ], 19 | restart_workers=restart_workers, 20 | ) 21 | small_client.register_plugin(plugin) 22 | 23 | def test(x): 24 | from python_stub import stub 25 | 26 | return stub.echo(x) 27 | 28 | fut = small_client.submit(test, "Hello, world!") 29 | wait(fut, small_client, 5 * 60) 30 | -------------------------------------------------------------------------------- /alembic/versions/87cbf883c2be_update_tpch_refactor_from_1094.py: -------------------------------------------------------------------------------- 1 | """Update tpch refactor from #1094 2 | 3 | Revision ID: 87cbf883c2be 4 | 
Revises: b0e8d5f3295d 5 | Create Date: 2023-10-18 20:31:17.848799 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "87cbf883c2be" 14 | down_revision = "b0e8d5f3295d" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = substr(path, length('benchmarks/') + 1) 24 | where path like 'benchmarks/tpch/%'; 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | op.execute( 31 | """ 32 | update test_run 33 | set path = 'benchmarks/' || path 34 | where path like 'tpch/%'; 35 | """ 36 | ) 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-yaml 6 | exclude: recipe/meta.yaml 7 | - repo: https://github.com/pycqa/isort 8 | rev: 5.12.0 9 | hooks: 10 | - id: isort 11 | language_version: python3 12 | - repo: https://github.com/asottile/pyupgrade 13 | rev: v3.11.1 14 | hooks: 15 | - id: pyupgrade 16 | args: 17 | - --py39-plus 18 | - repo: https://github.com/psf/black 19 | rev: 23.9.1 20 | hooks: 21 | - id: black 22 | language_version: python3 23 | args: 24 | - --target-version=py39 25 | exclude: ^alembic/versions/ 26 | - repo: https://github.com/pycqa/flake8 27 | rev: 6.1.0 28 | hooks: 29 | - id: flake8 30 | language_version: python3 31 | -------------------------------------------------------------------------------- /alembic/versions/b0e8d5f3295d_update_test_tpch_tpch_test_dask_from_.py: -------------------------------------------------------------------------------- 1 | """Update test_tpch -> tpch/test_dask from #1044 2 | 3 | Revision ID: b0e8d5f3295d 4 | Revises: 78c6e00fee88 5 | Create Date: 2023-10-18 20:14:47.476804 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'b0e8d5f3295d' 14 | down_revision = '78c6e00fee88' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = 'benchmarks/tpch/test_dask.py' 24 | where path = 'benchmarks/test_tpch.py' 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | op.execute( 31 | """ 32 | update test_run 33 | set path = 'benchmarks/test_tpch.py' 34 | where path = 'benchmarks/tpch/test_dask.py' 35 | """ 36 | ) 37 | -------------------------------------------------------------------------------- /tests/geospatial/test_atmospheric_circulation.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.atmospheric_circulation import atmospheric_circulation 4 | 5 | 6 | def test_atmospheric_circulation( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | }, 14 | scale_kwargs={ 15 | "small": {"n_workers": 10}, 16 | "medium": {"n_workers": 100}, 17 | "large": {"n_workers": 100}, 18 | }, 19 | ): 20 | with setup_benchmark( 21 | **scale_kwargs[scale], **cluster_kwargs 22 | ) as benchmark: # noqa: F841 23 | benchmark( 24 | atmospheric_circulation, 25 | scale=scale, 26 | storage_url=gcs_url, 27 | storage_options={"token": CoiledShippedCredentials()}, 28 | ) 29 | -------------------------------------------------------------------------------- /alembic/versions/2381a77e8487_zarr.py: -------------------------------------------------------------------------------- 1 | """zarr 2 | 3 | Revision ID: 2381a77e8487 4 | Revises: d58983739401 5 | Create Date: 2023-03-13 14:57:02.474967 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '2381a77e8487' 14 | down_revision = 'd58983739401' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = 'benchmarks/test_zarr.py' 24 | where path = 'benchmarks/test_array.py' 25 | and originalname in ( 26 | 'test_filter_then_average', 27 | 'test_access_slices', 28 | 'test_sum_residuals' 29 | ) 30 | """ 31 | ) 32 | 33 | 34 | def downgrade() -> None: 35 | # ### commands auto generated by Alembic - please adjust! 
### 36 | pass 37 | # ### end Alembic commands ### 38 | -------------------------------------------------------------------------------- /tests/benchmarks/test_custom.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from dask import delayed 5 | from dask.utils import parse_bytes 6 | 7 | from ..utils_test import wait 8 | 9 | 10 | def test_jobqueue(small_client): 11 | # Just using dask to run lots of embarrassingly-parallel CPU-bound tasks as fast as possible 12 | nthreads = sum( 13 | w["nthreads"] for w in small_client.scheduler_info()["workers"].values() 14 | ) 15 | max_runtime = 120 16 | max_sleep = 3 17 | n_tasks = round(max_runtime / max_sleep * nthreads) 18 | 19 | @delayed(pure=True) 20 | def task(i: int) -> int: 21 | stuff = "x" * parse_bytes("400MiB") 22 | time.sleep(random.uniform(0, max_sleep)) 23 | del stuff 24 | return i 25 | 26 | tasks = [task(i) for i in range(n_tasks)] 27 | result = delayed(sum)(tasks) # just so we have a single object 28 | 29 | wait( 30 | result, 31 | small_client, 32 | max_runtime * 1.15, 33 | ) 34 | -------------------------------------------------------------------------------- /AB_environments/make_envs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple utility to automate creation of A/B environment files""" 3 | import argparse 4 | import os 5 | import shutil 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser( 10 | description="Create A/B environment files as copies of AB_baseline" 11 | ) 12 | parser.add_argument("name", nargs="+") 13 | names = parser.parse_args().name 14 | 15 | os.chdir(os.path.dirname(__file__)) 16 | for name in names: 17 | if not name.startswith("AB_"): 18 | name = "AB_" + name 19 | for suffix in ("cluster.yaml", "conda.yaml", "dask.yaml", "requirements.in"): 20 | fname = f"{name}.{suffix}" 21 | if os.path.exists(fname): 22 | print(f"{fname} already exists") 23 | else: 24 | print(f"Creating {fname} as a copy of baseline") 25 | shutil.copy(f"AB_baseline.{suffix}", fname) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /tests/geospatial/test_satellite_filtering.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from tests.geospatial.workloads.satellite_filtering import satellite_filtering 6 | 7 | 8 | def test_satellite_filtering( 9 | az_url, 10 | scale, 11 | setup_benchmark, 12 | cluster_kwargs={ 13 | "workspace": "dask-benchmarks-azure", 14 | "region": "westeurope", 15 | }, 16 | scale_kwargs={ 17 | "small": {"n_workers": 10}, 18 | "large": {"n_workers": 100}, 19 | }, 20 | ): 21 | if scale not in scale_kwargs.keys(): 22 | pytest.skip(reason=f"{scale=} not implemented") 23 | with setup_benchmark( 24 | **scale_kwargs[scale], 25 | env={ 26 | "AZURE_STORAGE_ACCOUNT_NAME": os.environ["AZURE_STORAGE_ACCOUNT_NAME"], 27 | "AZURE_STORAGE_SAS_TOKEN": os.environ["AZURE_STORAGE_SAS_TOKEN"], 28 | }, 29 | **cluster_kwargs, 30 | ) as benchmark: # noqa: F841 31 | benchmark(satellite_filtering, scale=scale, storage_url=az_url) 32 | -------------------------------------------------------------------------------- /alembic/versions/d58983739401_default_parameter_for_test_rechunk_in_.py: -------------------------------------------------------------------------------- 1 | """Default parameter for test_rechunk_in_memory 2 | 3 | 
Revision ID: d58983739401 4 | Revises: 9d6f8ea24ee1 5 | Create Date: 2023-03-07 11:20:28.558141 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'd58983739401' 14 | down_revision = '9d6f8ea24ee1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(f""" 21 | update test_run 22 | set name = 'test_rechunk_in_memory[tasks]', 23 | path = 'benchmarks/test_array.py' 24 | where name == 'test_rechunk_in_memory' 25 | and python_version like '3.9%'; 26 | """) 27 | op.execute( 28 | """ 29 | delete from test_run 30 | where name == 'test_rechunk_in_memory' 31 | and python_version not like '3.9%'; 32 | """ 33 | ) 34 | 35 | 36 | def downgrade() -> None: 37 | pass 38 | -------------------------------------------------------------------------------- /tests/stability/test_array.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import dask.array as da 4 | import pytest 5 | 6 | from ..utils_test import cluster_memory, scaled_array_shape, wait 7 | 8 | pytestmark = pytest.mark.stability 9 | 10 | pytest.importorskip("scipy") 11 | 12 | 13 | @pytest.mark.skipif( 14 | sys.platform.startswith("win"), reason="scaled_array_shape fails on windows" 15 | ) 16 | def test_ols(small_client): 17 | chunksize = int(1e6) 18 | memory = cluster_memory(small_client) 19 | target_nbytes = memory * 0.50 20 | target_shape = scaled_array_shape(target_nbytes, ("x", 100)) 21 | num_samples, num_coeffs = target_shape[0], target_shape[-1] 22 | rng = da.random.default_rng() 23 | beta = rng.normal(size=(num_coeffs,)) 24 | X = rng.normal(size=(num_samples, num_coeffs), chunks=(chunksize, -1)) 25 | y = X @ beta + rng.normal(size=(num_samples,), chunks=(chunksize,)) 26 | beta_hat = da.linalg.solve(X.T @ X, X.T @ y) # normal eq'n 27 | y_hat = X @ beta_hat 28 | wait(y_hat, small_client, 20 * 60) 29 | -------------------------------------------------------------------------------- /tests/geospatial/test_zonal_average.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example was adapted from https://github.com/dcherian/dask-demo/blob/main/nwm-aws.ipynb 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.geospatial.workloads.zonal_average import nwm 8 | 9 | 10 | def test_nwm( 11 | scale, 12 | benchmark_type, 13 | setup_benchmark, 14 | cluster_kwargs={ 15 | "workspace": "dask-benchmarks", 16 | "region": "us-east-1", 17 | }, 18 | scale_kwargs={ 19 | "small": {"n_workers": 10}, 20 | "large": {"n_workers": 200, "scheduler_memory": "32 GiB"}, 21 | }, 22 | ): 23 | if benchmark_type == "submission": 24 | pytest.skip( 25 | reason="FIXME: Submission requires pre-computations, but no workers were requested." 
26 | ) 27 | if scale not in scale_kwargs.keys(): 28 | pytest.skip(reason=f"{scale=} not implemented") 29 | with setup_benchmark( 30 | **scale_kwargs[scale], **cluster_kwargs 31 | ) as benchmark: # noqa: F841 32 | benchmark(nwm, scale=scale) 33 | -------------------------------------------------------------------------------- /alembic/versions/a9363331e323_clean_h2o_tests_with_removed_shuffle_.py: -------------------------------------------------------------------------------- 1 | """Clean h2o tests with removed shuffle param 2 | 3 | Revision ID: a9363331e323 4 | Revises: 912c8e30690a 5 | Create Date: 2023-01-03 19:56:22.838577 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a9363331e323' 14 | down_revision = '912c8e30690a' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | for i in [1, 2, 3, 4, 5, 7]: 21 | test = f"test_q{i}" 22 | for ddf_param in ("0.5 GB (csv)", "0.5 GB (parquet)", "5 GB (parquet)"): 23 | op.execute(f""" 24 | update test_run 25 | set name = '{test}[{ddf_param}]' 26 | where name == '{test}[{ddf_param}-tasks]'; 27 | """) 28 | op.execute(f""" 29 | delete from test_run 30 | where name == '{test}[{ddf_param}-p2p]'; 31 | """) 32 | 33 | 34 | def downgrade() -> None: 35 | pass 36 | -------------------------------------------------------------------------------- /tests/runtime/test_build.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import shlex 5 | import subprocess 6 | 7 | import coiled 8 | from packaging.version import Version 9 | 10 | 11 | def test_latest_coiled(): 12 | # Ensure that the conda environment installs the latest version of `coiled` 13 | # FIXME this test can glitch if you install coiled from pip 14 | v_installed = Version(coiled.__version__) 15 | 16 | # Get latest `coiled` release version from conda-forge 17 | output = subprocess.check_output( 18 | shlex.split("conda search --override-channels --json -c conda-forge coiled") 19 | ) 20 | result = json.loads(output) 21 | v_latest = Version(result["coiled"][-1]["version"]) 22 | # conda can lag behind a few days from pip; allow for the next version too 23 | v_allowed = { 24 | v_latest, 25 | Version(f"{v_latest.major}.{v_latest.minor}.{v_latest.micro + 1}"), 26 | Version(f"{v_latest.major}.{v_latest.minor + 1}.0"), 27 | Version(f"{v_latest.major + 1}.0.0"), 28 | } 29 | assert v_installed in v_allowed 30 | -------------------------------------------------------------------------------- /alembic/versions/e11cd1aaed38_add_cluster_spec_to_db.py: -------------------------------------------------------------------------------- 1 | """Add cluster spec to db 2 | 3 | Revision ID: e11cd1aaed38 4 | Revises: 00d5844fd364 5 | Create Date: 2024-04-15 10:32:18.323088 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'e11cd1aaed38' 14 | down_revision = '00d5844fd364' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.add_column('tpch_run', sa.Column('n_workers', sa.Integer(), nullable=True)) 22 | op.add_column('tpch_run', sa.Column('worker_vm_type', sa.String(), nullable=True)) 23 | op.add_column('tpch_run', sa.Column('cluster_disk_size', sa.Integer(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.drop_column('tpch_run', 'cluster_disk_size') 30 | op.drop_column('tpch_run', 'worker_vm_type') 31 | op.drop_column('tpch_run', 'n_workers') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /ci/scripts/dask_config_to_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Read a dask config file and print it out in the format `ENV=VALUE\nENV=VALUE ...` 3 | This script is a work-around to not being able to upload dask config files to 4 | `conda env create`. 5 | """ 6 | from __future__ import annotations 7 | 8 | import sys 9 | from collections.abc import Iterator 10 | 11 | import yaml 12 | 13 | 14 | def main(fname: str) -> None: 15 | with open(fname) as fh: 16 | cfg = yaml.safe_load(fh) 17 | if cfg: 18 | print("\n".join(traverse(cfg, []))) 19 | 20 | 21 | def traverse(node: dict | list | str | float | None, path: list[str]) -> Iterator[str]: 22 | if isinstance(node, dict): 23 | for k, v in node.items(): 24 | k = k.upper().replace("-", "_") 25 | yield from traverse(v, path + [k]) 26 | return 27 | 28 | if not path: 29 | raise ValueError("The top-level element must be a dict") 30 | if isinstance(node, str) and " " in node: 31 | raise ValueError("Unsupported character: whitespace") 32 | 33 | yield "DASK_" + "__".join(path) + f"={node}" 34 | 35 | 36 | if __name__ == "__main__": 37 | main(sys.argv[1]) 38 | -------------------------------------------------------------------------------- /alembic/versions/a8785a7b3cae_add_entry_to_database_for_cluster_name_.py: -------------------------------------------------------------------------------- 1 | """Add entry to database for cluster name/id/details_url 2 | 3 | Revision ID: a8785a7b3cae 4 | Revises: 2764a4f5582b 5 | Create Date: 2022-10-06 14:15:33.618367 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a8785a7b3cae' 14 | down_revision = '2764a4f5582b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('test_run', sa.Column('cluster_name', sa.String(), nullable=True)) 22 | op.add_column('test_run', sa.Column('cluster_id', sa.Integer(), nullable=True)) 23 | op.add_column('test_run', sa.Column('cluster_details_url', sa.String(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! 
###
29 |     op.drop_column('test_run', 'cluster_details_url')
30 |     op.drop_column('test_run', 'cluster_id')
31 |     op.drop_column('test_run', 'cluster_name')
32 |     # ### end Alembic commands ###
--------------------------------------------------------------------------------
/tests/workflows/test_uber_lyft.py:
--------------------------------------------------------------------------------
1 | import dask.dataframe as dd
2 | import pytest
3 | 
4 | pytestmark = pytest.mark.workflows
5 | 
6 | 
7 | @pytest.mark.client("uber_lyft")
8 | def test_exploratory_analysis(client):
9 |     """Run some exploratory aggs on the dataset"""
10 | 
11 |     # NYC taxi Uber/Lyft dataset
12 |     df = dd.read_parquet(
13 |         "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True}
14 |     )
15 | 
16 |     # Preprocessing:
17 |     # - Add a column to indicate company, instead of license number
18 |     # - Add a column to indicate if a tip was given
19 |     taxi_companies = {
20 |         "HV0002": "Juno",
21 |         "HV0003": "Uber",
22 |         "HV0004": "Via",
23 |         "HV0005": "Lyft",
24 |     }
25 |     df["company"] = df.hvfhs_license_num.replace(taxi_companies)
26 |     df["tipped"] = df.tips > 0
27 | 
28 |     # Persist so we only read once
29 |     df = df.persist()
30 | 
31 |     # How many riders tip?
32 |     df.tipped.mean().compute()
33 |     # How many riders tip for each company?
34 |     df.groupby("company").tipped.value_counts().compute()
35 |     # What are those as percentages?
36 |     df.groupby("company").tipped.mean().compute()
--------------------------------------------------------------------------------
/ci/scripts/combine-dbs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euxo pipefail
3 | 
4 | DB_NAME=${DB_NAME:-'benchmark.db'}
5 | 
6 | alembic upgrade head
7 | 
8 | # Delete old records and vacuum to reduce on-disk size
9 | sqlite3 "$DB_NAME" <<EOF
10 | delete from test_run where not (start > date('now', '-90 days'));
11 | VACUUM;
12 | EOF
13 | # Merge in the individual job dbs into our working copy
14 | for FILE in $(find . -name "*.db")
15 | do
16 |     # Skip the output DB if we see it
17 |     if [ ${FILE##*/} == $DB_NAME ]; then
18 |         echo "Skipping $FILE"
19 |         continue
20 |     fi
21 |     echo "Processing $FILE"
22 |     DB_NAME=$FILE alembic upgrade head
23 |     # Copy the individual table into the primary one. We make an intermediate
24 |     # temp table so that we can null out the primary keys and reset the
25 |     # autoincrementing
26 |     for tab in "tpch_run" "test_run"
27 |     do
28 |         sqlite3 "$FILE" <<EOF
[...]
--------------------------------------------------------------------------------
/alembic/versions/778e617a2886_merge_xarray_reduction_with_quadratic_.py:
--------------------------------------------------------------------------------
[...]
19 | def upgrade() -> None:
20 |     # ### commands auto generated by Alembic - please adjust! ###
21 |     op.execute(
22 |         f"""
23 |         update test_run
24 |         set name = 'test_quadratic_mean[array]',
25 |             path = 'benchmarks/test_array.py'
26 |         where name == 'test_quadratic_mean';
27 |         """
28 |     )
29 |     for backend in ["array", "dataframe"]:
30 |         op.execute(
31 |             f"""
32 |             delete from test_run
33 |             where name == 'test_xarray_reduction[{backend}]';
34 |             """
35 |         )
36 |     # ### end Alembic commands ###
37 | 
38 | 
39 | def downgrade() -> None:
40 |     # ### commands auto generated by Alembic - please adjust! ###
41 |     pass
42 |     # ### end Alembic commands ###
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude = alembic/versions/*
4 | ignore =
5 |     # Extra space in brackets
6 |     E20
7 |     # Line break before binary operator
8 |     W503
9 |     # Line break after binary operator
10 |     W504
11 | 
12 | [isort]
13 | skip = alembic
14 | profile = black
15 | 
16 | [tool:pytest]
17 | addopts = -v -rsxfE --durations=0 --color=yes --strict-markers --strict-config --dist loadscope
18 | markers =
19 |     stability: stability tests; not meant to measure performance
20 |     workflows: workflow tests; expensive to run. Disabled in PRs.
21 |     shuffle_p2p: p2p shuffle engine
22 |     shuffle_tasks: legacy tasks-based shuffle engine
23 |     tpch_dask: dask implementation of the TPCH test suite
24 |     tpch_nondask: competitors' (not dask) implementation of the TPCH test suite
25 |     tpch_correctness: verify correctness of the dask implementation of the TPCH test suite
26 | 
27 | # pytest-timeout settings
28 | # 'thread' kills off the whole test suite. 'signal' only kills the offending test.
29 | # However, 'signal' doesn't work on Windows (due to lack of SIGALRM).
30 | # The 'tests' CI script modifies this config file on the fly for Windows clients.
31 | timeout_method = signal
32 | timeout = 3600
33 | 
--------------------------------------------------------------------------------
/tests/geospatial/workloads/rechunking.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Literal
2 | 
3 | import xarray as xr
4 | from dask.delayed import Delayed
5 | 
6 | 
7 | def era5_rechunking(
8 |     scale: Literal["small", "medium", "large"],
9 |     storage_url: str,
10 |     storage_options: dict[str, Any],
11 | ) -> Delayed:
12 |     ds = xr.open_zarr(
13 |         "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
14 |     ).drop_encoding()
15 | 
16 |     if scale == "small":
17 |         # 101.83 GiB (small)
18 |         time_range = slice("2020-01-01", "2023-01-01")
19 |         variables = ["sea_surface_temperature"]
20 |     elif scale == "medium":
21 |         # 2.12 TiB (medium)
22 |         time_range = slice(None)
23 |         variables = ["sea_surface_temperature"]
24 |     else:
25 |         # 4.24 TiB (large)
26 |         # This currently doesn't complete successfully.
27 |         time_range = slice(None)
28 |         variables = ["sea_surface_temperature", "snow_depth"]
29 |     subset = ds[variables].sel(time=time_range)
30 | 
31 |     # Rechunk
32 |     result = subset.chunk({"time": -1, "longitude": "auto", "latitude": "auto"})
33 | 
34 |     # Write result to cloud storage
35 |     return result.to_zarr(storage_url, storage_options=storage_options, compute=False)
36 | 
--------------------------------------------------------------------------------
/alembic/versions/24749594f367_add_prometheus_metrics.py:
--------------------------------------------------------------------------------
1 | """Add prometheus metrics
2 | 
3 | Revision ID: 24749594f367
4 | Revises: 1c2fe9d527e4
5 | Create Date: 2023-10-16 18:43:35.402355
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '24749594f367'
14 | down_revision = '1c2fe9d527e4'
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade() -> None:
20 |     # ### commands auto generated by Alembic - please adjust!
### 21 | op.add_column('test_run', sa.Column('scheduler_cpu_avg', sa.Float(), nullable=True)) 22 | op.add_column('test_run', sa.Column('scheduler_memory_max', sa.Float(), nullable=True)) 23 | op.add_column('test_run', sa.Column('worker_max_tick', sa.Float(), nullable=True)) 24 | op.add_column('test_run', sa.Column('scheduler_max_tick', sa.Float(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('test_run', 'scheduler_max_tick') 31 | op.drop_column('test_run', 'worker_max_tick') 32 | op.drop_column('test_run', 'scheduler_memory_max') 33 | op.drop_column('test_run', 'scheduler_cpu_avg') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /alembic/versions/1c2fe9d527e4_expand_rechunk_parameters.py: -------------------------------------------------------------------------------- 1 | """Expand rechunk parameters 2 | 3 | Revision ID: 1c2fe9d527e4 4 | Revises: 87cbf883c2be 5 | Create Date: 2023-10-25 16:26:23.813378 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1c2fe9d527e4' 14 | down_revision = '87cbf883c2be' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | for test in ("test_swap_axes", "test_tiles_to_rows"): 21 | for multiplier in (0.1, 1): 22 | op.execute( 23 | f""" 24 | update test_run 25 | set name = '{test}[{multiplier}-128 MiB-p2p-disk]' 26 | where name == '{test}[{multiplier}-128 MiB-p2p]' 27 | and path == 'benchmarks/test_rechunk.py'; 28 | """ 29 | ) 30 | 31 | for test in ("test_rechunk_in_memory", "test_rechunk_striping", "test_rechunk_swap_axes"): 32 | op.execute( 33 | f""" 34 | delete from test_run 35 | where originalname == '{test}' 36 | and path == 'benchmarks/test_array.py'; 37 | """ 38 | ) 39 | 40 | 41 | def downgrade() -> None: 42 | # ### commands auto generated by Alembic - please adjust! 
### 43 | pass 44 | # ### end Alembic commands ### 45 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/zonal_average.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import flox.xarray 4 | import fsspec 5 | import numpy as np 6 | import rioxarray 7 | import xarray as xr 8 | 9 | 10 | def nwm( 11 | scale: Literal["small", "medium", "large"], 12 | ) -> xr.DataArray: 13 | ds = xr.open_zarr( 14 | "s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr", consolidated=True 15 | ) 16 | 17 | if scale == "small": 18 | # 6.03 TiB 19 | time_range = slice("2020-01-01", "2020-12-31") 20 | else: 21 | # 252.30 TiB 22 | time_range = slice("1979-02-01", "2020-12-31") 23 | subset = ds.zwattablrt.sel(time=time_range) 24 | 25 | counties = rioxarray.open_rasterio( 26 | "s3://nwm-250m-us-counties/Counties_on_250m_grid.tif", 27 | chunks="auto", 28 | opener=fsspec.open, 29 | ).squeeze() 30 | 31 | # Remove any small floating point error in coordinate locations 32 | _, counties_aligned = xr.align(subset, counties, join="override") 33 | counties_aligned = counties_aligned.persist() 34 | 35 | county_id = np.unique(counties_aligned.data).compute() 36 | county_id = county_id[county_id != 0] 37 | county_mean = flox.xarray.xarray_reduce( 38 | subset, 39 | counties_aligned.rename("county"), 40 | func="mean", 41 | expected_groups=(county_id,), 42 | ) 43 | return county_mean 44 | -------------------------------------------------------------------------------- /alembic/versions/4ee0e23d96da_compressible_variant_of_tests.py: -------------------------------------------------------------------------------- 1 | """compressible variant of tests 2 | 3 | Revision ID: 4ee0e23d96da 4 | Revises: 2381a77e8487 5 | Create Date: 2023-03-14 16:13:23.809226 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '4ee0e23d96da' 13 | down_revision = '2381a77e8487' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | for name in ( 20 | "test_anom_mean", 21 | "test_vorticity", 22 | "test_double_diff", 23 | "test_dot_product", 24 | "test_map_overlap_sample", 25 | ): 26 | op.execute( 27 | f""" 28 | update test_run 29 | set name = '{name}[uncompressible]', 30 | path = 'benchmarks/test_array.py' 31 | where name == '{name}'; 32 | """ 33 | ) 34 | op.execute( 35 | """ 36 | delete from test_run 37 | where path = 'benchmarks/test_spill.py' 38 | and name in ( 39 | 'test_dot_product_spill[compressible]', 40 | 'test_spilling[compressible-keep]', 41 | 'test_spilling[compressible-release]' 42 | ) 43 | """ 44 | ) 45 | 46 | 47 | def downgrade() -> None: 48 | # ### commands auto generated by Alembic - please adjust! 
### 49 | pass 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /tests/runtime/test_xgboost.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pytest 3 | 4 | dask_ml = pytest.importorskip("dask_ml") 5 | dxgb = pytest.importorskip("xgboost.dask") 6 | 7 | 8 | def test_xgboost_distributed_training(small_client): 9 | ddf = dd.read_parquet( 10 | "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet", 11 | storage_options={"anon": True}, 12 | ) 13 | ddf = ddf.partitions[0:30] 14 | ddf = ddf.persist() 15 | 16 | # Create the train-test split 17 | X, y = ddf.iloc[:, :-1], ddf["target"] 18 | X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split( 19 | X, y, test_size=0.3, shuffle=True, random_state=21 20 | ) 21 | 22 | # Create the XGBoost DMatrix for our training and testing splits 23 | dtrain = dxgb.DaskDMatrix(small_client, X_train, y_train) 24 | dtest = dxgb.DaskDMatrix(small_client, X_test, y_test) 25 | 26 | # Set model parameters (XGBoost defaults) 27 | params = { 28 | "max_depth": 6, 29 | "gamma": 0, 30 | "eta": 0.3, 31 | "min_child_weight": 30, 32 | "objective": "reg:squarederror", 33 | "grow_policy": "depthwise", 34 | } 35 | output = dxgb.train( 36 | small_client, params, dtrain, num_boost_round=5, evals=[(dtrain, "train")] 37 | ) 38 | 39 | # make predictions 40 | y_pred = dxgb.predict(small_client, output, dtest) 41 | assert y_pred.shape[0] == y_test.shape[0].compute() 42 | -------------------------------------------------------------------------------- /alembic/versions/967e298408ed_test_spill.py: -------------------------------------------------------------------------------- 1 | """test_spill 2 | 3 | Revision ID: 967e298408ed 4 | Revises: a9363331e323 5 | Create Date: 2023-01-09 17:05:13.568510 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '967e298408ed' 14 | down_revision = 'a9363331e323' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set name = 'test_spilling[uncompressible-release]', 24 | path = 'benchmarks/test_spill.py' 25 | where name == 'test_spilling[False]' 26 | and python_version like '3.9%'; 27 | """ 28 | ) 29 | op.execute( 30 | """ 31 | update test_run 32 | set name = 'test_spilling[uncompressible-keep]', 33 | path = 'benchmarks/test_spill.py' 34 | where name == 'test_spilling[True]' 35 | and python_version like '3.9%'; 36 | """ 37 | ) 38 | op.execute( 39 | """ 40 | delete from test_run 41 | where originalname = 'test_spilling' 42 | and python_version not like '3.9%'; 43 | """ 44 | ) 45 | op.execute( 46 | """ 47 | delete from test_run 48 | where originalname = 'test_tensordot_stress'; 49 | """ 50 | ) 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! 
### 55 | pass 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /alembic/versions/78c6e00fee88_remove_task_based_shuffle.py: -------------------------------------------------------------------------------- 1 | """Remove task based shuffle 2 | 3 | Revision ID: 78c6e00fee88 4 | Revises: 778e617a2886 5 | Create Date: 2023-10-19 15:26:04.281985 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "78c6e00fee88" 14 | down_revision = "778e617a2886" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where ( 24 | originalname in ('test_shuffle', 'test_cluster_reconnect') 25 | or name like 'test_join_big[%tasks%]' 26 | or name like 'test_set_index[%tasks%]' 27 | ) 28 | """ 29 | ) 30 | op.execute( 31 | """ 32 | update test_run 33 | set name = 'test_join_big[1]' 34 | where name == 'test_join_big[1-p2p]'; 35 | """ 36 | ) 37 | op.execute( 38 | """ 39 | update test_run 40 | set name = 'test_join_big[0.1]' 41 | where name == 'test_join_big[0.1-p2p]'; 42 | """ 43 | ) 44 | for b in [True, False]: 45 | for factor in [0.1, 1]: 46 | op.execute( 47 | f""" 48 | update test_run 49 | set name = 'test_set_index[{factor}-{b}]' 50 | where name == 'test_set_index[{factor}-p2p-{b}]'; 51 | """ 52 | ) 53 | 54 | 55 | def downgrade() -> None: 56 | pass 57 | -------------------------------------------------------------------------------- /tests/tpch/test_optimization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . import dask_queries 4 | 5 | pytestmark = pytest.mark.tpch_dask 6 | 7 | 8 | @pytest.fixture( 9 | params=[ 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | ], 33 | ) 34 | def query(request): 35 | return request.param 36 | 37 | 38 | def test_optimization(query, dataset_path, fs, client, scale): 39 | func = getattr(dask_queries, f"query_{query:02d}") 40 | result = func(dataset_path, fs, scale) 41 | # We need to inject .repartition(npartitions=1) which .compute() does under the hood 42 | result.repartition(npartitions=1).optimize() 43 | 44 | 45 | @pytest.mark.skip( 46 | reason="This test does not work. See FIXME and https://github.com/dask/distributed/issues/8833." 47 | ) 48 | def test_delay_computation_start(query, dataset_path, fs, client, scale): 49 | func = getattr(dask_queries, f"query_{query:02d}") 50 | result = func(dataset_path, fs, scale).optimize() 51 | # FIXME: Client.compute unblocks only until the graph is serialized and put onto 52 | # the comm buffer. It should wait until update_graph finishes, i.e. graph is 53 | # submitted, parsed, and the tasks have been added onto the scheduler. 54 | client.compute(result) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Coiled 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /tests/stability/test_deadlock.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import dask 4 | import distributed 5 | import pytest 6 | from coiled import Cluster 7 | from distributed import Client, wait 8 | from packaging.version import Version 9 | 10 | pytestmark = pytest.mark.stability 11 | 12 | 13 | @pytest.mark.skipif( 14 | Version(distributed.__version__) < Version("2022.4.2"), 15 | reason="https://github.com/dask/distributed/issues/6110", 16 | ) 17 | def test_repeated_merge_spill( 18 | benchmark_all, 19 | cluster_kwargs, 20 | dask_env_variables, 21 | github_cluster_tags, 22 | ): 23 | with Cluster( 24 | name=f"test_repeated_merge_spill-{uuid.uuid4().hex[:8]}", 25 | environ=dask_env_variables, 26 | tags=github_cluster_tags, 27 | **cluster_kwargs["test_repeated_merge_spill"], 28 | ) as cluster: 29 | with Client(cluster) as client: 30 | with benchmark_all(client): 31 | ddf = dask.datasets.timeseries( 32 | "2020", 33 | "2025", 34 | partition_freq="2w", 35 | ) 36 | ddf2 = dask.datasets.timeseries( 37 | "2020", 38 | "2023", 39 | partition_freq="2w", 40 | ) 41 | 42 | for _ in range(10): 43 | client.restart() 44 | fs = client.compute((ddf.x + ddf.y).mean()) 45 | 46 | wait(fs, timeout=2 * 60) 47 | del fs 48 | 49 | ddf3 = ddf.merge(ddf2) 50 | fs = client.compute((ddf3.x + ddf3.y).mean()) 51 | 52 | wait(fs, timeout=2 * 60) 53 | del fs 54 | -------------------------------------------------------------------------------- /alembic/versions/c38b9d85915e_default_parameter_for_shuffling_tests.py: -------------------------------------------------------------------------------- 1 | """Default parameter for shuffling tests 2 | 3 | Revision ID: c38b9d85915e 4 | Revises: fa79471ffa8c 5 | Create Date: 2022-12-23 09:05:57.440944 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'c38b9d85915e' 14 | down_revision = 'fa79471ffa8c' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def h2o_update_query(test: str, ddf: str) -> str: 20 | return f""" 21 | update test_run 22 | set name = '{test}[{ddf}-tasks]' 23 | where name == '{test}[{ddf}]'; 24 | """ 25 | 26 | def rename_h2o_tests() -> None: 27 | for i in range(1, 10): 28 | test = f"test_q{i}" 29 | for ddf_param in ("0.5 GB (csv)", "0.5 GB (parquet)", "5 GB (parquet)"): 30 | op.execute(f""" 31 | update test_run 32 | set name = '{test}[{ddf_param}-tasks]' 33 | where name == '{test}[{ddf_param}]'; 34 | """) 35 | 36 | def rename_join_tests() -> None: 37 | for test in ("test_join_big", "test_join_big_small"): 38 | op.execute(f""" 39 | update test_run 40 | set name = '{test}[0.1-tasks]' 41 | where name == '{test}[0.1]'; 42 | """) 43 | 44 | def rename_shuffle_tests() -> None: 45 | for test in ("test_shuffle_parquet", "test_shuffle_simple"): 46 | op.execute(f""" 47 | update test_run 48 | set name = '{test}[tasks]' 49 | where name == '{test}'; 50 | """) 51 | 52 | def upgrade() -> None: 53 | rename_h2o_tests() 54 | rename_join_tests() 55 | rename_shuffle_tests() 56 | 57 | 58 | def downgrade() -> None: 59 | pass 60 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/regridding.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import numpy as np 4 | import xarray as xr 5 | import xesmf as xe 6 | from dask.delayed import Delayed 7 | 8 | 9 | def xesmf( 10 | scale: Literal["small", "medium", "large"], 11 | storage_url: str, 12 | storage_options: dict[str, Any], 13 | ) -> Delayed: 14 | ds = xr.open_zarr( 15 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr", 16 | ) 17 | # Fixed time range and variable as the interesting part of this benchmark scales with the 18 | # regridding matrix 19 | ds = ds[["sea_surface_temperature"]].sel(time=slice("2020-01-01", "2021-12-31")) 20 | if scale == "small": 21 | # Regridding from a resolution of 0.25 degress to 1 degrees 22 | # results in 4 MiB weight matrix 23 | output_resolution = 1 24 | elif scale == "medium": 25 | # Regridding from a resolution of 0.25 degrees to 0.2 degrees 26 | # results in 100 MiB weight matrix 27 | output_resolution = 0.2 28 | else: 29 | # Regridding from a resolution of 0.25 degrees to 0.05 degrees 30 | # results in 1.55 GiB weight matrix 31 | output_resolution = 0.05 32 | 33 | out_grid = xr.Dataset( 34 | { 35 | "latitude": ( 36 | ["latitude"], 37 | np.arange(90, -90 - output_resolution, -output_resolution), 38 | {"units": "degrees_north"}, 39 | ), 40 | "longitude": ( 41 | ["longitude"], 42 | np.arange(0, 360, output_resolution), 43 | {"units": "degrees_east"}, 44 | ), 45 | } 46 | ) 47 | regridder = xe.Regridder(ds, out_grid, "bilinear", periodic=True) 48 | regridded = regridder(ds, keep_attrs=True) 49 | 50 | result = regridded.chunk(time="auto") 51 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 52 | -------------------------------------------------------------------------------- /ci/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | ######################################################## 5 | # PLEASE READ: 6 | # When modifying this file, you must also align to match 7 | # - AB_environments/AB_baseline.conda.yaml 8 | # - 
AB_environments/AB_sample.conda.yaml 9 | ######################################################## 10 | 11 | - python >=3.10 12 | - pip 13 | - coiled >=0.2.54 14 | - numpy ==2.0.2 15 | - pandas ==2.2.3 16 | - dask ==2024.11.2 17 | - distributed ==2024.11.2 18 | - dask-labextension ==7.0.0 19 | - dask-ml ==2024.4.4 20 | - fsspec ==2024.10.0 21 | - s3fs ==2024.10.0 22 | - gcsfs ==2024.10.0 23 | - pyarrow ==18.1.0 24 | - jupyterlab ==4.3.1 25 | - lz4 ==4.3.3 26 | - ipywidgets ==8.1.5 27 | - numba ==0.60.0 28 | - scikit-learn ==1.5.2 29 | - ipycytoscape ==1.3.3 30 | - click ==8.1.7 31 | - xarray ==2024.11.0 32 | - flox ==0.9.15 33 | - zarr ==2.18.3 34 | - cftime ==1.6.4 35 | - msgpack-python 36 | - cloudpickle ==3.1.0 37 | - tornado ==6.4.2 38 | - toolz ==1.0.0 39 | - zict ==3.0.0 40 | - xgboost ==3.0.2 41 | - optuna ==4.1.0 42 | - optuna-integration ==4.1.0 43 | - scipy ==1.14.1 44 | - sqlalchemy ==2.0.36 45 | - pynvml ==11.5.3 46 | - bokeh ==3.6.1 47 | - gilknocker ==0.4.1 48 | - openssl >1.1.0g 49 | - rasterio >=1.4.0 50 | - rioxarray ==0.17.0 51 | - h5netcdf ==1.4.1 52 | - xesmf ==0.8.7 53 | - bottleneck ==1.4.2 54 | - geojson ==3.1.0 55 | - planetary-computer ==1.0.0 56 | - pystac-client ==0.8.5 57 | - odc-stac ==0.3.10 58 | - adlfs ==2024.7.0 59 | # https://github.com/coiled/benchmarks/issues/1616 60 | - cryptography ==43.0.3 61 | - pyopenssl ==24.2.1 62 | 63 | ######################################################## 64 | # PLEASE READ: 65 | # When modifying this file, you must also align to match 66 | # - AB_environments/AB_baseline.conda.yaml 67 | # - AB_environments/AB_sample.conda.yaml 68 | ######################################################## 69 | -------------------------------------------------------------------------------- /tests/geospatial/test_climatology.py: -------------------------------------------------------------------------------- 1 | """This benchmark is a port of the climatology computation implemented in 2 | https://github.com/google-research/weatherbench2/blob/47d72575cf5e99383a09bed19ba989b718d5fe30/scripts/compute_climatology.py 3 | with the parameters 4 | 5 | FREQUENCY = "hourly" 6 | HOUR_INTERVAL = 6 7 | WINDOW_SIZE = 61 8 | STATISTICS = ["mean"] 9 | METHOD = "explicit" 10 | """ 11 | 12 | from coiled.credentials.google import CoiledShippedCredentials 13 | 14 | from tests.geospatial.workloads.climatology import highlevel_api, rechunk_map_blocks 15 | 16 | 17 | def test_rechunk_map_blocks( 18 | gcs_url, 19 | scale, 20 | setup_benchmark, 21 | cluster_kwargs={ 22 | "workspace": "dask-benchmarks-gcp", 23 | "region": "us-central1", 24 | }, 25 | scale_kwargs={ 26 | "small": {"n_workers": 10}, 27 | "medium": {"n_workers": 100}, 28 | "large": {"n_workers": 100}, 29 | }, 30 | ): 31 | with setup_benchmark( 32 | **scale_kwargs[scale], **cluster_kwargs 33 | ) as benchmark: # noqa: F841 34 | benchmark( 35 | rechunk_map_blocks, 36 | scale=scale, 37 | storage_url=gcs_url, 38 | storage_options={"token": CoiledShippedCredentials()}, 39 | ) 40 | 41 | 42 | def test_highlevel_api( 43 | gcs_url, 44 | scale, 45 | setup_benchmark, 46 | cluster_kwargs={ 47 | "workspace": "dask-benchmarks-gcp", 48 | "region": "us-central1", 49 | "idle_timeout": "1h", 50 | }, 51 | scale_kwargs={ 52 | "small": {"n_workers": 10}, 53 | "medium": {"n_workers": 100}, 54 | "large": {"n_workers": 100}, 55 | }, 56 | ): 57 | with setup_benchmark( 58 | **scale_kwargs[scale], **cluster_kwargs 59 | ) as benchmark: # noqa: F841 60 | benchmark( 61 | highlevel_api, 62 | scale=scale, 63 | storage_url=gcs_url, 64 | 
storage_options={"token": CoiledShippedCredentials()}, 65 | ) 66 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/atmospheric_circulation.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import xarray as xr 4 | from dask.delayed import Delayed 5 | 6 | 7 | def atmospheric_circulation( 8 | scale: Literal["small", "medium", "large"], 9 | storage_url: str, 10 | storage_options: dict[str, Any], 11 | ) -> Delayed: 12 | ds = xr.open_zarr( 13 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr", 14 | chunks={}, 15 | ) 16 | if scale == "small": 17 | # 852.56 GiB (small) 18 | time_range = slice("2020-01-01", "2020-02-01") 19 | elif scale == "medium": 20 | # 28.54 TiB (medium) 21 | time_range = slice("2020-01-01", "2023-01-01") 22 | else: 23 | # 608.42 TiB (large) 24 | time_range = slice(None) 25 | ds = ds.sel(time=time_range) 26 | 27 | ds = ds[ 28 | [ 29 | "u_component_of_wind", 30 | "v_component_of_wind", 31 | "temperature", 32 | "vertical_velocity", 33 | ] 34 | ].rename( 35 | { 36 | "u_component_of_wind": "U", 37 | "v_component_of_wind": "V", 38 | "temperature": "T", 39 | "vertical_velocity": "W", 40 | } 41 | ) 42 | 43 | zonal_means = ds.mean("longitude") 44 | anomaly = ds - zonal_means 45 | 46 | anomaly["uv"] = anomaly.U * anomaly.V 47 | anomaly["vt"] = anomaly.V * anomaly.T 48 | anomaly["uw"] = anomaly.U * anomaly.W 49 | 50 | temdiags = zonal_means.merge(anomaly[["uv", "vt", "uw"]].mean("longitude")) 51 | 52 | # This is incredibly slow, takes a while for flox to construct the graph 53 | daily = temdiags.resample(time="D").mean() 54 | 55 | # # Users often rework things via a rechunk to make this a blockwise problem 56 | # daily = ( 57 | # temdiags.chunk(time=24) 58 | # .resample(time="D") 59 | # .mean() 60 | # ) 61 | 62 | return daily.to_zarr(storage_url, storage_options=storage_options, compute=False) 63 | -------------------------------------------------------------------------------- /plugins.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of useful plugins for monitoring clusters. 3 | """ 4 | import sys 5 | from collections import defaultdict 6 | 7 | import cloudpickle 8 | from distributed.diagnostics import SchedulerPlugin 9 | 10 | # Tell cloudpickle we want to register objects in this module by value, 11 | # so we can send them to the scheduler without the files existing there. 
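# As a rough usage sketch only (the real wiring lives in this repository's test
# fixtures; the client calls below are standard distributed APIs, shown purely
# as an illustration of how the Durations plugin defined below is driven):
#
#     client.register_scheduler_plugin(Durations())
#     client.sync(client.scheduler.start_tracking_durations)
#     ...  # run the workload being measured
#     durations = client.sync(client.scheduler.get_durations)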
12 | cloudpickle.register_pickle_by_value(sys.modules[__name__]) 13 | 14 | 15 | class Durations(SchedulerPlugin): 16 | def __init__(self): 17 | """Initialize the plugin""" 18 | self.durations = defaultdict(float) 19 | self.scheduler = None 20 | self._tracking = False 21 | # Big hack to trigger cloudpickle serialization for distributed < 2022.7.0 22 | # https://github.com/dask/distributed/pull/6466 23 | self.__main__ = "__main__" 24 | 25 | def start(self, scheduler): 26 | """Called on scheduler start as well as on registration time""" 27 | self.scheduler = scheduler 28 | scheduler.handlers["get_durations"] = self.get_durations 29 | scheduler.handlers["start_tracking_durations"] = self.start_tracking 30 | scheduler.handlers["stop_tracking_durations"] = self.stop_tracking 31 | 32 | def start_tracking(self, comm): 33 | self._tracking = True 34 | self.durations.clear() 35 | 36 | def stop_tracking(self, comm): 37 | self._tracking = False 38 | 39 | def transition(self, key, start, finish, *args, **kwargs): 40 | """On key transition to memory, update the duration data""" 41 | if not self._tracking: 42 | return 43 | 44 | if start == "processing" and finish == "memory": 45 | startstops = kwargs.get("startstops") 46 | if not startstops: 47 | return 48 | 49 | for ss in startstops: 50 | self.durations[ss["action"]] += max(ss["stop"] - ss["start"], 0) 51 | 52 | async def get_durations(self, comm): 53 | return dict(self.durations) 54 | 55 | def restart(self, scheduler): 56 | self.durations.clear() 57 | -------------------------------------------------------------------------------- /tests/workflows/test_xgboost_optuna.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dask.distributed import wait 3 | 4 | pytestmark = pytest.mark.workflows 5 | 6 | optuna = pytest.importorskip("optuna") 7 | xgb = pytest.importorskip("xgboost") 8 | pytest.importorskip("sklearn") 9 | 10 | from optuna.samplers import RandomSampler # noqa: E402 11 | from optuna_integration import DaskStorage # noqa: E402 12 | from sklearn.datasets import fetch_covtype # noqa: E402 13 | from sklearn.model_selection import KFold, cross_val_score # noqa: E402 14 | from sklearn.preprocessing import LabelEncoder # noqa: E402 15 | 16 | 17 | @pytest.mark.client("xgboost_optuna") 18 | def test_hpo(client): 19 | # We use a random sampler with a seed to get deterministic results. 20 | # This is just for benchmarking purposes. 
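    # DaskStorage proxies the Optuna storage through the Dask scheduler (per the
    # optuna-integration documentation), so the `study` object can be shipped to
    # workers via client.submit further below while all trials are still recorded
    # in a single shared study.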
21 | study = optuna.create_study( 22 | direction="maximize", storage=DaskStorage(), sampler=RandomSampler(seed=2) 23 | ) 24 | 25 | def objective(trial): 26 | # Dataset (241.59 MiB) from http://archive.ics.uci.edu/ml/datasets/covertype 27 | X, y = fetch_covtype(return_X_y=True) 28 | 29 | # Format training labels 30 | le = LabelEncoder() 31 | y = le.fit_transform(y) 32 | 33 | # Get hyperparameter values for this trial and model score 34 | params = { 35 | "n_estimators": trial.suggest_int("n_estimators", 2, 10), 36 | "max_depth": trial.suggest_int("max_depth", 2, 10), 37 | "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0, log=True), 38 | "subsample": trial.suggest_float("subsample", 0.2, 1.0), 39 | "n_jobs": 1, # Avoid thread oversubscription 40 | } 41 | model = xgb.XGBClassifier(**params) 42 | cv = KFold(n_splits=3, shuffle=True, random_state=2) 43 | score = cross_val_score(model, X, y, cv=cv) 44 | return score.mean() 45 | 46 | # Run HPO trials on a cluster 47 | n_trials = 200 48 | futures = [ 49 | client.submit(study.optimize, objective, n_trials=1, pure=False) 50 | for _ in range(n_trials) 51 | ] 52 | wait(futures) 53 | assert len(study.trials) >= n_trials 54 | -------------------------------------------------------------------------------- /AB_environments/AB_baseline.conda.yaml: -------------------------------------------------------------------------------- 1 | # Special environment file for A/B testing, used to define the conda environment for the 2 | # baseline environment. 3 | # Change contents, but do not rename. 4 | 5 | channels: 6 | - conda-forge 7 | dependencies: 8 | - python =3.10 # Single '=' means latest patch version available 9 | - memray ==1.13.4 10 | # Copy-paste from ci/environment.yml 11 | - pip 12 | - coiled >=0.2.54 13 | - numpy ==2.0.2 14 | - pandas ==2.2.3 15 | - dask ==2024.11.2 16 | - distributed ==2024.11.2 17 | - dask-labextension ==7.0.0 18 | - dask-ml ==2024.4.4 19 | - fsspec ==2024.10.0 20 | - s3fs ==2024.10.0 21 | - gcsfs ==2024.10.0 22 | - pyarrow ==18.1.0 23 | - jupyterlab ==4.3.1 24 | - lz4 ==4.3.3 25 | - ipywidgets ==8.1.5 26 | - numba ==0.60.0 27 | - scikit-learn ==1.5.2 28 | - ipycytoscape ==1.3.3 29 | - click ==8.1.7 30 | - xarray ==2024.11.0 31 | - flox ==0.9.15 32 | - zarr ==2.18.3 33 | - cftime ==1.6.4 34 | - msgpack-python 35 | - cloudpickle ==3.1.0 36 | - tornado ==6.4.2 37 | - toolz ==1.0.0 38 | - zict ==3.0.0 39 | - xgboost ==3.0.2 40 | - optuna ==4.1.0 41 | - optuna-integration ==4.1.0 42 | - scipy ==1.14.1 43 | - sqlalchemy ==2.0.36 44 | - pynvml ==11.5.3 45 | - bokeh ==3.6.1 46 | - gilknocker ==0.4.1 47 | - openssl >1.1.0g 48 | - rasterio >=1.4.0 49 | - rioxarray ==0.17.0 50 | - h5netcdf ==1.4.1 51 | - xesmf ==0.8.7 52 | - bottleneck ==1.4.2 53 | - geojson ==3.1.0 54 | - planetary-computer ==1.0.0 55 | - pystac-client ==0.8.5 56 | - odc-stac ==0.3.10 57 | - adlfs ==2024.7.0 58 | # https://github.com/coiled/benchmarks/issues/1616 59 | - cryptography ==43.0.3 60 | - pyopenssl ==24.2.1 61 | # End copy-paste 62 | 63 | - pip: 64 | # Make sure you install dask and distributed either both from pip or both from 65 | # conda. You may alternatively point to your own git fork (but make sure you 66 | # sync'ed tags!) 67 | # Read README.md for troubleshooting. 
68 | # - git+https://github.com/dask/dask@191d39177009d2cce25b818878118e35329b6db3 69 | # - git+https://github.com/dask/distributed@0304fb6e665e36abf9e3086173cccd36e29ae84d 70 | -------------------------------------------------------------------------------- /tests/benchmarks/test_zarr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask.array as da 4 | import pytest 5 | 6 | from ..utils_test import run_up_to_nthreads, wait 7 | 8 | pytest.importorskip("zarr") 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def zarr_dataset(): 13 | # shape = (2000, 2000, 2000) 14 | # chunks = (200, 200, 200) 15 | # Compresses to ~42% of its original size (tested on lz4 4.0) 16 | store = ( 17 | "s3://coiled-runtime-ci/synthetic-zarr/synth_random_int_array_2000_cubed.zarr" 18 | ) 19 | return da.from_zarr(store) 20 | 21 | 22 | @pytest.fixture(scope="module") 23 | def cmip6(): 24 | xarray = pytest.importorskip("xarray") 25 | pytest.importorskip("cftime") 26 | 27 | store = "s3://coiled-runtime-ci/CMIP6/CMIP/AS-RCEC/TaiESM1/1pctCO2/r1i1p1f1/Amon/zg/gn/v20200225/" 28 | return xarray.open_dataset(store, engine="zarr", chunks={}) 29 | 30 | 31 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 32 | @pytest.mark.parametrize("threshold", [50, 100, 200, 255]) 33 | def test_filter_then_average(small_client, zarr_dataset, threshold): 34 | """Compute the mean for increasingly sparse boolean filters of an array""" 35 | a = zarr_dataset[zarr_dataset > threshold].mean() 36 | wait(a, small_client, 300) 37 | 38 | 39 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 40 | @pytest.mark.parametrize("N", [700, 75, 1]) 41 | def test_access_slices(small_client, zarr_dataset, N): 42 | """Accessing just a few chunks of a zarr array should be quick""" 43 | a = zarr_dataset[:N, :N, :N] 44 | wait(a, small_client, 300) 45 | 46 | 47 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 48 | def test_sum_residuals(small_client, zarr_dataset): 49 | """Compute reduce, then map, then reduce again""" 50 | a = (zarr_dataset - zarr_dataset.mean(axis=0)).sum() 51 | wait(a, small_client, 300) 52 | 53 | 54 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 55 | def test_select_scalar(small_client, cmip6): 56 | ds = cmip6.isel({"lat": 20, "lon": 40, "plev": 5, "time": 1234}).compute() 57 | assert ds.zg.shape == () 58 | assert ds.zg.size == 1 59 | -------------------------------------------------------------------------------- /AB_environments/config.yaml: -------------------------------------------------------------------------------- 1 | # Number of times to run each test suite. 2 | # Lower values are faster and cheaper but will result in higher variance. 3 | # Setting it to 5 is a good value to get statistically significant results. 4 | # This must remain set to 0 in the main branch, thus completely disabling 5 | # A/B tests, in order to avoid unnecessary runs. 6 | repeat: 0 7 | 8 | # Set to true to automatically create a verbatim copy of AB_baseline and then compare 9 | # the two in the A/B tests. Set to false to save some money if you are already confident 10 | # that the 'repeat' setting is high enough. 11 | test_null_hypothesis: true 12 | 13 | # Test directories, test files, or individual tests to run. 
14 | targets: 15 | # - tests 16 | - tests/benchmarks 17 | # - tests/runtime 18 | # - tests/stability 19 | # - tests/tpch/test_dask.py 20 | # - tests/benchmarks/test_futures.py 21 | # - tests/benchmarks/test_array.py::test_basic_sum 22 | 23 | # pytest markers or marker expressions. See setup.cfg for available ones. 24 | # Leave blank to run all marked and unmarked tests. 25 | markers: not tpch_nondask 26 | # markers: shuffle_p2p 27 | # markers: shuffle_p2p or shuffle_tasks 28 | # markers: not shuffle_tasks 29 | 30 | # Enable specific H2O datasets 31 | h2o_datasets: 32 | # - 0.5 GB (csv) 33 | # - 5 GB (csv) 34 | # - 50 GB (csv) 35 | # - 0.5 GB (parquet) 36 | # - 5 GB (parquet) 37 | # - 50 GB (parquet) 38 | - 5 GB (parquet+pyarrow) 39 | # - 50 GB (parquet+pyarrow) 40 | # - 500 GB (parquet+pyarrow) 41 | 42 | # AWS implements limiters to how many EC2 instances you can spawn in parallel on the 43 | # same AWS account. If such limit is reached, tests will randomly fail when trying to 44 | # create the Coiled clusters, and restarting failed jobs won't fix the problem. 45 | # Additionally, there are problems with Coiled itself triggered by limitations that are 46 | # never actually reached with real paying users. 47 | max_parallel: 48 | # Number of parallel A/B test jobs per branch. 49 | ci_jobs: 5 50 | # Number of parallel test_*.py modules per A/B test job. 51 | # Each module typically spawns one Coiled cluster at a time. 52 | # Set to 1 to disable pytest-xdist. 53 | pytest_workers_per_job: 4 54 | -------------------------------------------------------------------------------- /tests/test_utils_test.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import numpy as np 3 | import pytest 4 | from dask.sizeof import sizeof 5 | from dask.utils import parse_bytes 6 | 7 | from .utils_test import ( 8 | scaled_array_shape, 9 | scaled_array_shape_quadratic, 10 | timeseries_of_size, 11 | ) 12 | 13 | 14 | def test_scaled_array_shape(): 15 | assert scaled_array_shape(1024, (2, "x"), dtype=bool) == (2, 512) 16 | assert scaled_array_shape(1024, (2, "x"), dtype=float) == (2, 64) 17 | assert scaled_array_shape(1024, (2, "x"), dtype=np.float64) == (2, 64) 18 | assert scaled_array_shape(1024, (2, "x")) == (2, 64) 19 | 20 | assert scaled_array_shape(16, ("x", "x"), dtype=bool) == (4, 4) 21 | assert scaled_array_shape(256, ("4x", "x"), dtype=bool) == (32, 8) 22 | assert scaled_array_shape(64, ("x", "x", "x"), dtype=float) == (2, 2, 2) 23 | 24 | assert scaled_array_shape("10kb", ("x", "1kb"), dtype=bool) == (10, 1000) 25 | 26 | 27 | def test_scaled_array_shape_quadratic(): 28 | assert scaled_array_shape("1GB", ("x",)) == (125000000,) 29 | assert scaled_array_shape_quadratic("1GB", "1GB", ("x",)) == (125000000,) 30 | assert scaled_array_shape_quadratic("16GB", "1GB", ("x",)) == (500000000,) 31 | assert scaled_array_shape_quadratic("64MB", "1GB", ("x",)) == (31622776,) 32 | 33 | 34 | def sizeof_df(df): 35 | # Measure the size of each partition separately (each one has overhead of being a separate DataFrame) 36 | # TODO more efficient method than `df.partitions`? Use `dask.get` directly? 
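    # A possible answer to the TODO above (untested sketch, not used here): pull
    # the partitions straight out of the graph with the synchronous scheduler,
    #     parts = dask.get(df.__dask_graph__(), df.__dask_keys__())
    # instead of building one lazy collection per partition via df.partitions.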
37 | parts = dask.compute( 38 | [df.partitions[i] for i in range(df.npartitions)], scheduler="threads" 39 | ) 40 | return sum(map(sizeof, parts)) 41 | 42 | 43 | def test_timeseries_of_size(): 44 | small_parts = timeseries_of_size( 45 | "1mb", freq="1s", partition_freq="100s", dtypes={"x": float} 46 | ) 47 | big_parts = timeseries_of_size( 48 | "1mb", freq="1s", partition_freq="100s", dtypes={i: float for i in range(10)} 49 | ) 50 | assert sizeof_df(small_parts) == pytest.approx(parse_bytes("1mb"), rel=0.1) 51 | assert sizeof_df(big_parts) == pytest.approx(parse_bytes("1mb"), rel=0.1) 52 | assert big_parts.npartitions < small_parts.npartitions 53 | -------------------------------------------------------------------------------- /tests/benchmarks/test_futures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from dask.distributed import as_completed, wait 4 | from distributed.utils_test import inc, slowdec, slowinc 5 | 6 | from ..utils_test import run_up_to_nthreads 7 | 8 | 9 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 10 | def test_single_future(small_client): 11 | """How quickly can we run a simple computation? 12 | Repeat the test a few times to get a more sensible 13 | cumulative measure. 14 | """ 15 | for i in range(100): 16 | small_client.submit(inc, i).result() 17 | 18 | 19 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 20 | @pytest.mark.parametrize("rootish", ["rootish", "non-rootish"]) 21 | def test_large_map(small_client, rootish): 22 | """What's the overhead of map these days?""" 23 | if rootish == "rootish": 24 | futures = small_client.map(inc, range(100_000)) 25 | else: 26 | 27 | def inc_with_deps(i, deps): 28 | return i + 1 29 | 30 | deps = small_client.map(inc, range(5)) 31 | futures = small_client.map(inc_with_deps, range(100_000), deps=deps) 32 | 33 | wait(futures) 34 | 35 | 36 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 37 | def test_large_map_first_work(small_client): 38 | """ 39 | Large maps are fine, but it's pleasant to see work start immediately. 40 | We have a batch_size keyword that should work here but it's not on by default. 41 | Maybe it should be. 42 | """ 43 | futures = small_client.map(inc, range(100_000)) 44 | for _ in as_completed(futures): 45 | return 46 | 47 | 48 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 49 | def test_memory_efficient(small_client): 50 | """ 51 | We hope that we pipeline xs->ys->zs without keeping all of the xs in memory 52 | to start. This may not actually happen today. 53 | """ 54 | xs = small_client.map(np.random.random, [20_000_000] * 100, pure=False) 55 | ys = small_client.map(slowinc, xs, delay=1) 56 | zs = small_client.map(slowdec, ys, delay=1) 57 | 58 | futures = as_completed(zs) 59 | del xs, ys, zs # Don't keep references to intermediate results 60 | 61 | for _ in futures: # pass through all futures, forget them immediately 62 | continue 63 | -------------------------------------------------------------------------------- /AB_environments/AB_sample.conda.yaml: -------------------------------------------------------------------------------- 1 | # Sample conda environment file for A/B testing. 2 | # Change contents/delete/rename as needed. 
3 | 4 | # Every A/B environment *must* present these three files: 5 | # - AB_.conda.yaml 6 | # - AB_.dask.yaml 7 | # - AB_.cluster.yaml 8 | 9 | # You should always start from a copy-paste from AB_baseline.conda.yaml 10 | 11 | channels: 12 | - conda-forge 13 | dependencies: 14 | - python =3.10 # Single '=' means latest patch version available 15 | - memray ==1.13.4 16 | # Copy-paste from ci/environment.yml 17 | - pip 18 | - coiled >=0.2.54 19 | - numpy ==2.0.2 20 | - pandas ==2.2.3 21 | - dask ==2024.11.2 22 | - distributed ==2024.11.2 23 | - dask-labextension ==7.0.0 24 | - dask-ml ==2024.4.4 25 | - fsspec ==2024.10.0 26 | - s3fs ==2024.10.0 27 | - gcsfs ==2024.10.0 28 | - pyarrow ==18.1.0 29 | - jupyterlab ==4.3.1 30 | - lz4 ==4.3.3 31 | - ipywidgets ==8.1.5 32 | - numba ==0.60.0 33 | - scikit-learn ==1.5.2 34 | - ipycytoscape ==1.3.3 35 | - click ==8.1.7 36 | - xarray ==2024.11.0 37 | - flox ==0.9.15 38 | - zarr ==2.18.3 39 | - cftime ==1.6.4 40 | - msgpack-python 41 | - cloudpickle ==3.1.0 42 | - tornado ==6.4.2 43 | - toolz ==1.0.0 44 | - zict ==3.0.0 45 | - xgboost ==3.0.2 46 | - optuna ==4.1.0 47 | - optuna-integration ==4.1.0 48 | - scipy ==1.14.1 49 | - sqlalchemy ==2.0.36 50 | - pynvml ==11.5.3 51 | - bokeh ==3.6.1 52 | - gilknocker ==0.4.1 53 | - openssl >1.1.0g 54 | - rasterio >=1.4.0 55 | - rioxarray ==0.17.0 56 | - h5netcdf ==1.4.1 57 | - xesmf ==0.8.7 58 | - bottleneck ==1.4.2 59 | - geojson ==3.1.0 60 | - planetary-computer ==1.0.0 61 | - pystac-client ==0.8.5 62 | - odc-stac ==0.3.10 63 | - adlfs ==2024.7.0 64 | # https://github.com/coiled/benchmarks/issues/1616 65 | - cryptography ==43.0.3 66 | - pyopenssl ==24.2.1 67 | # End copy-paste 68 | 69 | - pip: 70 | # Make sure you install dask and distributed either both from pip or both from 71 | # conda. You may alternatively point to your own git fork (but make sure you 72 | # sync'ed tags!) 73 | # Read README.md for troubleshooting. 74 | - git+https://github.com/dask/dask@191d39177009d2cce25b818878118e35329b6db3 75 | - git+https://github.com/dask/distributed@0304fb6e665e36abf9e3086173cccd36e29ae84d 76 | -------------------------------------------------------------------------------- /tests/tpch/README.md: -------------------------------------------------------------------------------- 1 | TPC-H Benchmarks 2 | ================ 3 | 4 | This document will help you run the TPC-H benchmarks in this directory. 5 | 6 | Setup 7 | ----- 8 | 9 | Clone this repository 10 | 11 | ``` 12 | git clone git@github.com:coiled/benchmarks 13 | cd benchmarks 14 | ``` 15 | 16 | Follow the environment creation steps in the root directory. Namely the 17 | following: 18 | 19 | ``` 20 | mamba env create -n tpch -f ci/environment.yml 21 | conda activate tpch 22 | mamba env update -f ci/environment-git-tip.yml 23 | mamba env update -f ci/environment-test.yml 24 | mamba install grpcio grpcio-status protobuf -y # if you want Spark 25 | ``` 26 | 27 | Run Dask Benchmarks 28 | ------------------- 29 | 30 | ``` 31 | pytest --benchmark tests/tpch/test_dask.py 32 | ``` 33 | 34 | Configure 35 | --------- 36 | 37 | By default we run Scale 100 (about 100 GB) on the cloud with Coiled. You can 38 | configure this by changing the values for `_local` and `_scale` in the 39 | `conftest.py` file in this directory (they're at the top). 40 | 41 | Local Data Generation 42 | --------------------- 43 | 44 | If you want to run locally, you'll need to generate data. Run the following 45 | from the **root directory** of this repository. 
46 | 47 | ``` 48 | python tests/tpch/generate_data.py --scale 10 49 | ``` 50 | 51 | Run Many Tests 52 | -------------- 53 | 54 | When running on the cloud you can run many tests simultaneously. We recommend 55 | using pytest-xdist for this with the keywords: 56 | 57 | - `-n 4` run four parallel jobs 58 | - `--dist loadscope` split apart by module 59 | 60 | ``` 61 | py.test --benchmark -n 4 --dist loadscope tests/tpch 62 | ``` 63 | 64 | Generate Plots 65 | -------------- 66 | 67 | Timing outputs are dropped into `benchmark.db` in the root of this repository. 68 | You can generate charts analyzing results using either the notebook 69 | `visualize.ipynb` in this directory (recommended) or the `generate-plot.py` 70 | script in this directory. These require `ibis` and `altair` (not installed 71 | above). 72 | 73 | These are both meant to be run from the root directory of this repository. 74 | 75 | These pull out the most recent records for each query/library pairing. If 76 | you're changing scales and want to ensure clean results, you may want to nuke 77 | your `benchmark.db` file between experiments (it's ok, it'll regenerate 78 | automatically). 79 | -------------------------------------------------------------------------------- /alembic/versions/7d7844fca7cf_initial_table.py: -------------------------------------------------------------------------------- 1 | """Initial table 2 | 3 | Revision ID: 7d7844fca7cf 4 | Revises: 5 | Create Date: 2022-07-21 12:32:36.579599 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '7d7844fca7cf' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('test_run', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('session_id', sa.String(), nullable=False), 24 | sa.Column('name', sa.String(), nullable=False), 25 | sa.Column('originalname', sa.String(), nullable=False), 26 | sa.Column('path', sa.String(), nullable=True), 27 | sa.Column('setup_outcome', sa.String(), nullable=True), 28 | sa.Column('call_outcome', sa.String(), nullable=True), 29 | sa.Column('teardown_outcome', sa.String(), nullable=True), 30 | sa.Column('coiled_runtime_version', sa.String(), nullable=True), 31 | sa.Column('coiled_software_name', sa.String(), nullable=True), 32 | sa.Column('dask_version', sa.String(), nullable=True), 33 | sa.Column('distributed_version', sa.String(), nullable=True), 34 | sa.Column('python_version', sa.String(), nullable=True), 35 | sa.Column('platform', sa.String(), nullable=True), 36 | sa.Column('ci_run_url', sa.String(), nullable=True), 37 | sa.Column('start', sa.DateTime(), nullable=True), 38 | sa.Column('end', sa.DateTime(), nullable=True), 39 | sa.Column('duration', sa.Float(), nullable=True), 40 | sa.Column('average_memory', sa.Float(), nullable=True), 41 | sa.Column('peak_memory', sa.Float(), nullable=True), 42 | sa.Column('compute_time', sa.Float(), nullable=True), 43 | sa.Column('disk_spill_time', sa.Float(), nullable=True), 44 | sa.Column('serializing_time', sa.Float(), nullable=True), 45 | sa.Column('transfer_time', sa.Float(), nullable=True), 46 | sa.Column('performance_report_url', sa.String(), nullable=True), 47 | sa.Column('cluster_dump_url', sa.String(), nullable=True), 48 | sa.PrimaryKeyConstraint('id') 49 | ) 50 | # ### end Alembic commands ### 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! ### 55 | op.drop_table('test_run') 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from logging.config import fileConfig 3 | 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from alembic import context 7 | from benchmark_schema import Base 8 | 9 | # this is the Alembic Config object, which provides 10 | # access to the values within the .ini file in use. 11 | config = context.config 12 | 13 | # Interpret the config file for Python logging. 14 | # This line sets up loggers basically. 15 | if config.config_file_name is not None: 16 | fileConfig(config.config_file_name) 17 | 18 | # add your model's MetaData object here 19 | # for 'autogenerate' support 20 | target_metadata = Base.metadata 21 | 22 | # other values from the config, defined by the needs of env.py, 23 | # can be acquired: 24 | # my_important_option = config.get_main_option("my_important_option") 25 | # ... etc. 26 | 27 | # Set the database name from the DB_NAME environment variable 28 | ENGINE_URL = f"sqlite:///{os.environ.get('DB_NAME', 'benchmark.db')}" 29 | config.set_main_option("sqlalchemy.url", ENGINE_URL) 30 | 31 | 32 | def run_migrations_offline() -> None: 33 | """Run migrations in 'offline' mode. 34 | 35 | This configures the context with just a URL 36 | and not an Engine, though an Engine is acceptable 37 | here as well. By skipping the Engine creation 38 | we don't even need a DBAPI to be available. 39 | 40 | Calls to context.execute() here emit the given string to the 41 | script output. 
42 | 43 | """ 44 | url = config.get_main_option("sqlalchemy.url") 45 | context.configure( 46 | url=url, 47 | target_metadata=target_metadata, 48 | literal_binds=True, 49 | dialect_opts={"paramstyle": "named"}, 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online() -> None: 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 61 | 62 | """ 63 | connectable = engine_from_config( 64 | config.get_section(config.config_ini_section), 65 | prefix="sqlalchemy.", 66 | poolclass=pool.NullPool, 67 | ) 68 | 69 | with connectable.connect() as connection: 70 | context.configure(connection=connection, target_metadata=target_metadata) 71 | 72 | with context.begin_transaction(): 73 | context.run_migrations() 74 | 75 | 76 | if context.is_offline_mode(): 77 | run_migrations_offline() 78 | else: 79 | run_migrations_online() 80 | -------------------------------------------------------------------------------- /tests/benchmarks/test_spill.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import pytest 4 | from coiled import Cluster 5 | from dask.distributed import Client, wait 6 | from toolz import merge 7 | 8 | from ..conftest import dump_cluster_kwargs 9 | from ..utils_test import ( 10 | cluster_memory, 11 | print_size_info, 12 | scaled_array_shape, 13 | scaled_array_shape_quadratic, 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def spill_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 19 | kwargs = dict( 20 | name=f"spill-{uuid.uuid4().hex[:8]}", 21 | environ=merge( 22 | dask_env_variables, 23 | { 24 | # Ensure that tasks are not retried on ungraceful worker termination 25 | # caused by out-of-memory issues 26 | "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0", 27 | }, 28 | ), 29 | tags=github_cluster_tags, 30 | **cluster_kwargs["spill_cluster"], 31 | ) 32 | dump_cluster_kwargs(kwargs, "spill") 33 | with Cluster(**kwargs) as cluster: 34 | yield cluster 35 | 36 | 37 | @pytest.fixture 38 | def spill_client(spill_cluster, cluster_kwargs, benchmark_all, wait_for_workers): 39 | n_workers = cluster_kwargs["spill_cluster"]["n_workers"] 40 | with Client(spill_cluster) as client: 41 | spill_cluster.scale(n_workers) 42 | wait_for_workers(client, n_workers, timeout=600) 43 | client.restart() 44 | with benchmark_all(client): 45 | yield client 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "keep_around", [pytest.param(False, id="release"), pytest.param(True, id="keep")] 50 | ) 51 | def test_spilling(spill_client, new_array, keep_around): 52 | memory = cluster_memory(spill_client) # 36 GiB 53 | shape = scaled_array_shape(memory * 1.79, ("x", "x")) # 64 GiB 54 | a = new_array(shape) 55 | print_size_info(memory, memory * 1.79, a) 56 | 57 | a = a.persist() 58 | wait(a) 59 | b = a.sum().persist() 60 | if not keep_around: 61 | del a 62 | assert b.compute() 63 | 64 | 65 | def test_dot_product_spill(spill_client, new_array): 66 | """See also test_array.py::test_dot_product 67 | for a variant that doesn't hit the spill threshold 68 | """ 69 | memory = cluster_memory(spill_client) # 38.33 GiB 70 | shape = scaled_array_shape_quadratic(memory * 0.3, "11.5 GiB", ("x", "x")) 71 | a = new_array(shape) 72 | print_size_info(memory, memory * 0.3, a) 73 | b = (a @ a.T).sum() 74 | assert b.compute() 75 | --------------------------------------------------------------------------------
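# Illustrative sketch (assumptions: helper semantics as exercised in
# tests/test_utils_test.py, and a made-up 36 GiB cluster). It spells out the
# sizing arithmetic the spill tests above rely on: the target array is ~1.79x
# total cluster memory, so the computation is guaranteed to spill to disk.

from dask.utils import format_bytes, parse_bytes

memory = parse_bytes("36 GiB")  # assumed total cluster memory
target = int(memory * 1.79)     # deliberately larger than RAM

# scaled_array_shape solves the free dimension "x" so that a float64 array of
# shape ("x", "x") occupies roughly `target` bytes.
side = int((target / 8) ** 0.5)
print(f"~{side:,} x {side:,} float64 array, {format_bytes(side * side * 8)} total")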
/tests/tpch/plotting.py: -------------------------------------------------------------------------------- 1 | import altair as alt 2 | import pandas as pd 3 | 4 | LIBRARY_COLORS = { 5 | "dask": "#5677a4", 6 | "duckdb": "#e68b39", 7 | "polars": "#d4605b", 8 | "pyspark": "green", 9 | } 10 | 11 | 12 | def from_db(path): 13 | df = pd.read_sql_table(table_name="test_run", con=f"sqlite:///{path}") 14 | 15 | df = df[ 16 | (df.call_outcome == "passed") 17 | & (df.path.str.contains("^tpch/test_(?:dask|duckdb|polars|pyspark)")) 18 | & df.cluster_name 19 | ] 20 | df = df[["path", "name", "duration", "start", "cluster_name"]] 21 | df["library"] = df.path.map(lambda path: path.split("_")[-1].split(".")[0]) 22 | df["query"] = df.name.map(lambda name: int(name.split("_")[-1])) 23 | df["name"] = df.cluster_name.map(lambda name: name.split("-", 3)[-1]) 24 | df["scale"] = df.cluster_name.map(lambda name: int(name.split("-")[2])) 25 | del df["path"] 26 | del df["cluster_name"] 27 | return df 28 | 29 | 30 | def latest(df, n=1): 31 | df = df.sort_values(["query", "library"]) 32 | 33 | def recent(df): 34 | return df.sort_values("start").tail(n) 35 | 36 | df = df.groupby(["library", "query"]).apply(recent).reset_index(drop=True) 37 | del df["start"] 38 | return df 39 | 40 | 41 | def normalize(df): 42 | dask_durations = df[df["library"] == "dask"].set_index("query")["duration"] 43 | data = df.groupby("query").apply( 44 | lambda group: group.assign( 45 | relative_duration=group["duration"] / dask_durations[group.name] 46 | ) 47 | ) 48 | return data.reset_index(drop=True) 49 | 50 | 51 | def subplot(df, column, libraries): 52 | return ( 53 | alt.Chart(df) 54 | .mark_bar() 55 | .encode( 56 | x="query:N", 57 | y=f"{column}:Q", 58 | xOffset="library:N", 59 | color=alt.Color("library").scale( 60 | domain=libraries, 61 | range=[LIBRARY_COLORS[lib] for lib in libraries], 62 | ), 63 | tooltip=["library", column], 64 | ) 65 | ) 66 | 67 | 68 | def plot(df, libraries=None, column="duration"): 69 | if libraries is None: 70 | libraries = ["dask", "duckdb", "polars", "pyspark"] 71 | plot = subplot(df[df["query"] < 12], column=column, libraries=libraries) & subplot( 72 | df[df["query"] >= 12], column=column, libraries=libraries 73 | ) 74 | return plot.properties( 75 | title=f"TPC-H -- scale:{df.scale.iloc[0]} name:{df.name.iloc[0]}" 76 | ).configure_title( 77 | fontSize=20, 78 | ) 79 | -------------------------------------------------------------------------------- /ci/scripts/discover_ab_environments.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import json 5 | import os.path 6 | from typing import TypedDict 7 | 8 | import yaml 9 | 10 | 11 | class JSONOutput(TypedDict): 12 | run_AB: bool 13 | repeat: list[int] 14 | runtime: list[str] 15 | max_parallel: int 16 | pytest_args: list[str] 17 | h2o_datasets: list[str] 18 | 19 | 20 | DO_NOT_RUN: JSONOutput = { 21 | "run_AB": False, 22 | "repeat": [], 23 | "runtime": [], 24 | "max_parallel": 1, 25 | "pytest_args": [], 26 | "h2o_datasets": [], 27 | } 28 | 29 | 30 | def build_json() -> JSONOutput: 31 | with open("AB_environments/config.yaml") as fh: 32 | cfg = yaml.safe_load(fh) 33 | 34 | if not isinstance(cfg.get("repeat"), int) or cfg["repeat"] < 0: 35 | raise ValueError("AB_environments/config.yaml: missing key {repeat: N}") 36 | for target in cfg["targets"]: 37 | target = target.split("::")[0] 38 | if not os.path.exists(target): 39 | raise FileNotFoundError(target) 40 | 41 | if not 
cfg["repeat"] or not cfg["targets"]: 42 | return DO_NOT_RUN 43 | 44 | runtimes = [] 45 | for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")): 46 | env_name = os.path.basename(conda_fname)[: -len(".conda.yaml")] 47 | dask_fname = f"AB_environments/{env_name}.dask.yaml" 48 | # Raise FileNotFoundError if missing 49 | open(dask_fname).close() 50 | runtimes.append(env_name) 51 | 52 | if not runtimes: 53 | return DO_NOT_RUN 54 | 55 | if "AB_baseline" not in runtimes: 56 | # If any A/B environments are defined, AB_baseline is required 57 | raise FileNotFoundError("AB_environments/AB_baseline.conda.yaml") 58 | 59 | if cfg["test_null_hypothesis"]: 60 | runtimes += ["AB_null_hypothesis"] 61 | 62 | pytest_args = [] 63 | if (n := cfg["max_parallel"]["pytest_workers_per_job"]) > 1: 64 | pytest_args.append(f"-n {n} --dist loadscope") 65 | if cfg["markers"]: 66 | pytest_args.append(f"-m '{cfg['markers']}'") 67 | for target in cfg["targets"]: 68 | pytest_args.append(f"'{target}'") 69 | 70 | return { 71 | "run_AB": True, 72 | "repeat": list(range(1, cfg["repeat"] + 1)), 73 | "runtime": runtimes, 74 | "max_parallel": cfg["max_parallel"]["ci_jobs"], 75 | "pytest_args": [" ".join(pytest_args)], 76 | "h2o_datasets": [",".join(cfg["h2o_datasets"])], 77 | } 78 | 79 | 80 | def main() -> None: 81 | print(json.dumps(build_json())) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /alembic/versions/00d5844fd364_add_tpch_run_table.py: -------------------------------------------------------------------------------- 1 | """Add tpch run table 2 | 3 | Revision ID: 00d5844fd364 4 | Revises: 25053f75e09f 5 | Create Date: 2024-04-09 13:41:39.795757 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '00d5844fd364' 14 | down_revision = '25053f75e09f' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('tpch_run', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('session_id', sa.String(), nullable=False), 24 | sa.Column('name', sa.String(), nullable=False), 25 | sa.Column('originalname', sa.String(), nullable=False), 26 | sa.Column('path', sa.String(), nullable=True), 27 | sa.Column('setup_outcome', sa.String(), nullable=True), 28 | sa.Column('call_outcome', sa.String(), nullable=True), 29 | sa.Column('teardown_outcome', sa.String(), nullable=True), 30 | sa.Column('dask_version', sa.String(), nullable=True), 31 | sa.Column('dask_expr_version', sa.String(), nullable=True), 32 | sa.Column('distributed_version', sa.String(), nullable=True), 33 | sa.Column('duckdb_version', sa.String(), nullable=True), 34 | sa.Column('pyspark_version', sa.String(), nullable=True), 35 | sa.Column('polars_version', sa.String(), nullable=True), 36 | sa.Column('python_version', sa.String(), nullable=True), 37 | sa.Column('platform', sa.String(), nullable=True), 38 | sa.Column('ci_run_url', sa.String(), nullable=True), 39 | sa.Column('start', sa.DateTime(), nullable=True), 40 | sa.Column('end', sa.DateTime(), nullable=True), 41 | sa.Column('duration', sa.Float(), nullable=True), 42 | sa.Column('average_memory', sa.Float(), nullable=True), 43 | sa.Column('peak_memory', sa.Float(), nullable=True), 44 | sa.Column('cluster_name', sa.String(), nullable=True), 45 | sa.Column('cluster_id', sa.Integer(), nullable=True), 46 | sa.Column('cluster_details_url', sa.String(), nullable=True), 47 | sa.Column('scale', sa.Integer(), nullable=False), 48 | sa.Column('query', sa.Integer(), nullable=False), 49 | sa.Column('local', sa.Boolean(), nullable=False), 50 | sa.Column('compression', sa.String(), nullable=True), 51 | sa.Column('partition_size', sa.String(), nullable=True), 52 | sa.PrimaryKeyConstraint('id') 53 | ) 54 | # ### end Alembic commands ### 55 | 56 | 57 | def downgrade() -> None: 58 | # ### commands auto generated by Alembic - please adjust! ### 59 | op.drop_table('tpch_run') 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/cloud_optimize.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import xarray as xr 4 | from s3fs import S3FileSystem 5 | 6 | 7 | def cloud_optimize( 8 | scale: Literal["small", "medium", "large"], fs: S3FileSystem, storage_url: str 9 | ): 10 | models = [ 11 | "ACCESS-CM2", 12 | "ACCESS-ESM1-5", 13 | "CMCC-ESM2", 14 | "CNRM-CM6-1", 15 | "CNRM-ESM2-1", 16 | "CanESM5", 17 | "EC-Earth3", 18 | "EC-Earth3-Veg-LR", 19 | "FGOALS-g3", 20 | "GFDL-ESM4", 21 | "GISS-E2-1-G", 22 | "INM-CM4-8", 23 | "INM-CM5-0", 24 | "KACE-1-0-G", 25 | "MIROC-ES2L", 26 | "MPI-ESM1-2-HR", 27 | "MPI-ESM1-2-LR", 28 | "MRI-ESM2-0", 29 | "NorESM2-LM", 30 | "NorESM2-MM", 31 | "TaiESM1", 32 | "UKESM1-0-LL", 33 | ] 34 | variables = [ 35 | "hurs", 36 | "huss", 37 | "pr", 38 | "rlds", 39 | "rsds", 40 | "sfcWind", 41 | "tas", 42 | "tasmax", 43 | "tasmin", 44 | ] 45 | 46 | if scale == "small": 47 | # 130 files (152.83 GiB). One model and one variable. 48 | models = models[:1] 49 | variables = variables[:1] 50 | elif scale == "medium": 51 | # 390 files. Two models and two variables. 52 | # Currently fails after hitting 20 minute idle timeout 53 | # sending large graph to the scheduler. 54 | models = models[:2] 55 | variables = variables[:2] 56 | else: 57 | # 11635 files. All models and variables. 
58 | pass 59 | 60 | # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6 61 | # for dataset details. 62 | files = [] 63 | for model in models: 64 | for variable in variables: 65 | data_dir = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc" 66 | files += [f"s3://{path}" for path in fs.glob(data_dir)] 67 | print(f"Processing {len(files)} NetCDF files") 68 | 69 | # Load input NetCDF data files 70 | # TODO: Reduce explicit settings once https://github.com/pydata/xarray/issues/8778 is completed. 71 | ds = xr.open_mfdataset( 72 | files, 73 | engine="h5netcdf", 74 | combine="nested", 75 | concat_dim="time", 76 | data_vars="minimal", 77 | coords="minimal", 78 | compat="override", 79 | parallel=True, 80 | ) 81 | 82 | # Rechunk from "pancake" to "pencil" format 83 | ds = ds.chunk({"time": -1, "lon": "auto", "lat": "auto"}) 84 | 85 | # Write out to a Zar dataset 86 | return ds.to_zarr(storage_url, compute=False) 87 | -------------------------------------------------------------------------------- /tests/benchmarks/test_dataframe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dask.sizeof import sizeof 3 | from dask.utils import format_bytes 4 | 5 | from ..utils_test import cluster_memory, timeseries_of_size, wait 6 | 7 | 8 | def print_dataframe_info(df): 9 | p = df.partitions[0].compute(scheduler="threads") 10 | partition_size = sizeof(p) 11 | total_size = partition_size * df.npartitions 12 | print( 13 | f"~{len(p) * df.npartitions:,} rows x {len(df.columns)} columns, " 14 | f"{format_bytes(total_size)} total, " 15 | f"{df.npartitions:,} {format_bytes(partition_size)} partitions" 16 | ) 17 | 18 | 19 | def test_dataframe_align(small_client): 20 | memory = cluster_memory(small_client) # 76.66 GiB 21 | 22 | df = timeseries_of_size( 23 | memory // 2, 24 | start="2020-01-01", 25 | freq="600ms", 26 | partition_freq="12h", 27 | dtypes={i: float for i in range(100)}, 28 | ) 29 | print_dataframe_info(df) 30 | # ~50,904,000 rows x 100 columns, 38.31 GiB total, 707 55.48 MiB partitions 31 | 32 | df2 = timeseries_of_size( 33 | memory // 4, 34 | start="2010-01-01", 35 | freq="600ms", 36 | partition_freq="12h", 37 | dtypes={i: float for i in range(100)}, 38 | ) 39 | print_dataframe_info(df2) 40 | # ~25,488,000 rows x 100 columns, 19.18 GiB total, 354 55.48 MiB partitions 41 | 42 | final = (df2 - df).mean() # will be all NaN, just forcing alignment 43 | wait(final, small_client, 10 * 60) 44 | 45 | 46 | @pytest.mark.xfail(reason="https://github.com/coiled/benchmarks/pull/1116") 47 | def test_filter(small_client): 48 | """How fast can we filter a DataFrame?""" 49 | memory = cluster_memory(small_client) 50 | df = timeseries_of_size(memory) 51 | name = df.head(1).name.iloc[0] # Get first name that appears 52 | result = df[df.name == name] 53 | wait(result, small_client, 10 * 60) 54 | 55 | 56 | def test_dataframe_cow_chain(small_client): 57 | memory = cluster_memory(small_client) # 76.66 GiB 58 | 59 | df = timeseries_of_size( 60 | memory // 2, 61 | start="2020-01-01", 62 | freq="600ms", 63 | partition_freq="12h", 64 | dtypes={ 65 | **{i: float for i in range(40)}, 66 | **{i: int for i in range(41, 80)}, 67 | **{i: object for i in range(81, 120)}, 68 | }, 69 | ) 70 | print_dataframe_info(df) 71 | 72 | result = ( 73 | df.rename(columns={1: 1000}) 74 | .replace("x", "xxx") 75 | .fillna({i: 100 for i in range(10, 70)}) 76 | .astype({50: "float"}) 77 | .loc[:, slice(2, 100)] 78 | ) 79 | wait(result, 
small_client, 10 * 60) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # PyCharm project settings 124 | .idea 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # Project-specific files 138 | cluster_kwargs.*.pickle 139 | cluster_kwargs.*.yaml 140 | benchmark.db 141 | static/ 142 | mamba_env_export.yml 143 | tpch-data/ 144 | 145 | # .visualize() output of dask collections 146 | mydask.html 147 | -------------------------------------------------------------------------------- /tests/workflows/test_embarrassingly_parallel.py: -------------------------------------------------------------------------------- 1 | import io 2 | import tarfile 3 | 4 | import pandas as pd 5 | import pytest 6 | from dask.distributed import wait 7 | 8 | pytestmark = pytest.mark.workflows 9 | 10 | 11 | @pytest.mark.client("embarrassingly_parallel") 12 | def test_embarassingly_parallel(client, s3_factory): 13 | # How popular is matplotlib? 
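    # --- Editor's aside (illustrative, not part of the original test) ---
    # The idea: each arXiv PDF tarball is scanned for the literal bytes
    # b"matplotlib", and the per-month fraction of matching PDFs is used as a
    # rough popularity signal over time.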
14 | s3 = s3_factory(requester_pays=True) 15 | directories = s3.ls("s3://arxiv/pdf") 16 | 17 | # We only analyze files from 1991-2022 here in order to have a consistent data volume. 18 | # This is benchmarking purposes only, as this dataset is updated monthly. 19 | years = list(range(91, 100)) + list(range(23)) 20 | directories = [ 21 | d 22 | for d in directories 23 | if d.endswith(".tar") and int(d.split("_")[2][:2]) in years 24 | ] 25 | 26 | def extract(filename: str, fs): 27 | """Extract and process one directory of arXiv data 28 | 29 | Returns 30 | ------- 31 | filename: str 32 | contains_matplotlib: boolean 33 | """ 34 | out = [] 35 | with fs.open(filename) as f: 36 | bytes_ = f.read() 37 | with io.BytesIO() as bio: 38 | bio.write(bytes_) 39 | bio.seek(0) 40 | with tarfile.TarFile(fileobj=bio) as tf: 41 | for member in tf.getmembers(): 42 | if member.isfile() and member.name.endswith(".pdf"): 43 | data = tf.extractfile(member).read() 44 | out.append((member.name, b"matplotlib" in data.lower())) 45 | return out 46 | 47 | futures = client.map(extract, directories, fs=s3) 48 | wait(futures) 49 | # We had one error in one file. Let's just ignore and move on. 50 | good = [future for future in futures if future.status == "finished"] 51 | data = client.gather(good) 52 | 53 | # Convert to Pandas 54 | dfs = [pd.DataFrame(d, columns=["filename", "has_matplotlib"]) for d in data] 55 | df = pd.concat(dfs) 56 | 57 | def filename_to_date(filename): 58 | year = int(filename.split("/")[0][:2]) 59 | month = int(filename.split("/")[0][2:4]) 60 | if year > 80: 61 | year = 1900 + year 62 | else: 63 | year = 2000 + year 64 | 65 | return pd.Timestamp(year=year, month=month, day=1) 66 | 67 | df["date"] = df.filename.map(filename_to_date) 68 | result = df.groupby("date").has_matplotlib.mean() 69 | # Some light validation to ensure results are consistent. 70 | # This is only for benchmarking. 
71 | assert result.idxmin() == pd.Timestamp("1991-07-01") # Earliest timestamp 72 | assert result.idxmax() == pd.Timestamp("2022-10-01") # Row with maximum value 73 | assert result.ne(0).idxmax() == pd.Timestamp("2005-06-01") # First non-zero row 74 | -------------------------------------------------------------------------------- /.github/workflows/geospatial.yml: -------------------------------------------------------------------------------- 1 | name: Geospatial Benchmarks 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | scale: 6 | description: 'Scale' 7 | required: true 8 | default: 'small' 9 | type: choice 10 | options: 11 | - 'small' 12 | - 'medium' 13 | - 'large' 14 | 15 | defaults: 16 | # Required shell entrypoint to have properly activated conda environments 17 | run: 18 | shell: bash -l {0} 19 | 20 | jobs: 21 | geospatial: 22 | name: Geospatial 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up environment 30 | uses: conda-incubator/setup-miniconda@v3 31 | with: 32 | miniforge-version: latest 33 | use-mamba: true 34 | condarc-file: ci/condarc 35 | python-version: "3.10" 36 | environment-file: ci/environment.yml 37 | conda-remove-defaults: "true" 38 | 39 | - name: Add geospatial dependencies 40 | run: mamba env update --file ci/environment-geospatial.yml 41 | 42 | - name: Upgrade dask to git tip 43 | run: mamba env update --file ci/environment-git-tip.yml 44 | 45 | - name: Add test dependencies 46 | run: mamba env update --file ci/environment-test.yml 47 | 48 | - name: Dump environment 49 | run: | 50 | # For debugging 51 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 52 | mamba env export | grep -E -v '^prefix:.*$' 53 | 54 | - name: Google auth 55 | uses: "google-github-actions/auth@v2" 56 | with: 57 | credentials_json: "${{ secrets.GCP_CREDENTIALS }}" 58 | 59 | - name: Run geospatial benchmarks 60 | env: 61 | DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} 62 | AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} 63 | AWS_DEFAULT_REGION: us-east-2 # this is needed for boto for some reason 64 | AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} 65 | AZURE_STORAGE_ACCOUNT_NAME: ${{ secrets.AZURE_STORAGE_ACCOUNT_NAME}} 66 | AZURE_STORAGE_SAS_TOKEN: ${{ secrets.AZURE_STORAGE_SAS_TOKEN}} 67 | COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} 68 | DB_NAME: geospatial_${{ inputs.scale }}.db 69 | MEMRAY_PROFILE: "none" 70 | run: | 71 | pytest --benchmark \ 72 | tests/geospatial -m geo_execution \ 73 | -n 4 --dist loadscope \ 74 | --scale ${{ inputs.scale }} \ 75 | --memray ${{ env.MEMRAY_PROFILE }} \ 76 | 77 | - name: Upload benchmark results 78 | uses: actions/upload-artifact@v4 79 | if: always() 80 | with: 81 | name: geospatial-benchmark 82 | path: | 83 | geospatial_${{ inputs.scale }}.db 84 | mamba_env_export.yml 85 | -------------------------------------------------------------------------------- /cluster_kwargs.yaml: -------------------------------------------------------------------------------- 1 | # Static kwargs passed to coiled.Cluster 2 | # In A/B tests, these can be overridden by AB_environments/AB_.cluster.yaml 3 | 4 | # The override priority is as follows (bottom wins): 5 | # 1. default parameters of coiled.Cluster 6 | # 2. default section of this file 7 | # 3. default section of AB_environments/AB_.cluster.yaml 8 | # 4. specific sections of this file 9 | # 5. 
specific sections of AB_environments/AB_.cluster.yaml 10 | 11 | # The keys 'name', 'environ', and 'tags' must not be used. 12 | 13 | # Settings for all clusters, unless overridden below 14 | default: 15 | package_sync: true 16 | wait_for_workers: true 17 | scheduler_vm_types: [m6i.large] 18 | spot_policy: spot_with_fallback 19 | 20 | # For all tests using the small_client fixture 21 | small_cluster: 22 | n_workers: 10 23 | worker_vm_types: [m6i.large] # 2CPU, 8GiB 24 | 25 | # For tests/benchmarks/test_parquet.py 26 | parquet_cluster: 27 | n_workers: 15 28 | worker_vm_types: [m5.xlarge] # 4 CPU, 16 GiB 29 | 30 | # For tests/benchmarks/test_spill.py 31 | spill_cluster: 32 | n_workers: 5 33 | worker_disk_size: 64 34 | worker_vm_types: [m6i.large] # 2CPU, 8GiB 35 | 36 | # For tests/benchmarks/test_xarray.py 37 | group_reduction_cluster: 38 | n_workers: 20 39 | worker_vm_types: [m6i.xlarge] # 4CPU, 16GiB 40 | region: "us-east-1" # Same region as dataset 41 | 42 | # For tests/workflows/test_embarrassingly_parallel.py 43 | embarrassingly_parallel: 44 | n_workers: 100 45 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 46 | region: "us-east-1" # Same region as dataset 47 | 48 | # For tests/workflows/test_xgboost_optuna.py 49 | xgboost_optuna: 50 | n_workers: 50 51 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 52 | 53 | # For tests/workflows/test_uber_lyft.py 54 | uber_lyft: 55 | n_workers: 20 56 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 57 | 58 | uber_lyft_large: 59 | n_workers: 50 60 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 61 | 62 | # For tests/workflows/test_pytorch_optuna.py 63 | pytorch_optuna: 64 | n_workers: 10 65 | worker_vm_types: [g4dn.xlarge] # 1 GPU, 4 CPU, 16 GiB 66 | worker_options: 67 | # Making workers single-threaded to avoid GPU contention. See discussion in 68 | # https://github.com/coiled/benchmarks/pull/787#discussion_r1177004248 for 69 | # more details.
70 | nthreads: 1 71 | 72 | # For tests/workflows/test_snowflake.py 73 | snowflake: 74 | n_workers: 20 75 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 76 | 77 | 78 | # Specific tests 79 | test_work_stealing_on_scaling_up: 80 | n_workers: 1 81 | worker_vm_types: [t3.medium] 82 | 83 | test_work_stealing_on_straggling_worker: 84 | n_workers: 10 85 | worker_vm_types: [t3.medium] 86 | 87 | test_repeated_merge_spill: 88 | n_workers: 20 89 | worker_vm_types: [m6i.large] 90 | 91 | # For tests/workflows/test_from_csv_to_parquet.py 92 | from_csv_to_parquet: 93 | n_workers: 10 94 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 95 | region: "us-east-1" # Same region as dataset 96 | -------------------------------------------------------------------------------- /tests/tpch/generate_answers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import botocore.session 5 | import click 6 | import coiled 7 | import duckdb 8 | import duckdb_queries 9 | import pyarrow as pa 10 | import pyarrow.parquet as pq 11 | from utils import ( 12 | get_answers_path, 13 | get_bucket_region, 14 | get_dataset_path, 15 | get_single_vm_spec, 16 | ) 17 | 18 | 19 | def generate(scale: int, path: str, local: bool) -> None: 20 | dataset_path = get_dataset_path(local, scale) 21 | use_coiled = False 22 | 23 | if path.startswith("s3"): 24 | use_coiled = True 25 | global REGION 26 | REGION = get_bucket_region(path) 27 | else: 28 | path = pathlib.Path(path) 29 | path.mkdir(parents=True, exist_ok=True) 30 | 31 | def connection(): 32 | con = duckdb.connect() 33 | 34 | if not local: # Setup s3 credentials 35 | session = botocore.session.Session() 36 | creds = session.get_credentials() 37 | con.install_extension("httpfs") 38 | con.load_extension("httpfs") 39 | con.sql( 40 | f""" 41 | SET s3_region='us-east-2'; 42 | SET s3_access_key_id='{creds.access_key}'; 43 | SET s3_secret_access_key='{creds.secret_key}'; 44 | SET s3_session_token='{creds.token}'; 45 | """ 46 | ) 47 | return con 48 | 49 | def generate_answer(query): 50 | table = getattr(duckdb_queries, f"query_{query}")( 51 | connection(), dataset_path, scale 52 | ) 53 | relaxed_schema = table.schema 54 | for i, field in enumerate(table.schema): 55 | if pa.types.is_decimal(field.type): 56 | relaxed_schema = relaxed_schema.set(i, field.with_type(pa.float64())) 57 | elif pa.types.is_date(field.type): 58 | relaxed_schema = relaxed_schema.set( 59 | i, field.with_type(pa.timestamp("ms")) 60 | ) 61 | table = table.cast(relaxed_schema) 62 | pq.write_table(table, os.path.join(str(path), f"answer_{query}.parquet")) 63 | 64 | if use_coiled: 65 | generate_answer = coiled.function( 66 | name=f"tpch-generate-answers-{scale}", **get_single_vm_spec(scale) 67 | )(generate_answer) 68 | for query in range(1, 23): 69 | generate_answer(query) 70 | 71 | print("Finished exporting all answers!") 72 | 73 | 74 | @click.command() 75 | @click.option( 76 | "--scale", default=10, help="Scale factor to use, roughly equal to number of GB" 77 | ) 78 | @click.option( 79 | "--path", 80 | help="Local or S3 base path, will affix '/answers' subdirectory to this path", 81 | ) 82 | @click.option( 83 | "--local", 84 | is_flag=True, 85 | default=False, 86 | help="Whether to generate the answers locally", 87 | ) 88 | def main( 89 | scale: int, 90 | path: str | None, 91 | local: bool, 92 | ): 93 | if path is None: 94 | path = get_answers_path(local, scale) 95 | generate(scale, path, local) 96 | 
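# --- Editor's aside (hypothetical usage sketch, not part of the original file) ---
# Assuming the module is run directly, the click options above translate to
# something like:
#
#   python generate_answers.py --scale 10 --path ./tpch-data --local
#
# With an S3 --path (and no --local), each answer is instead generated on a
# single Coiled VM sized by get_single_vm_spec(scale).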
97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /tests/tpch/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import boto3 4 | 5 | 6 | def get_dataset_path(local, scale): 7 | remote_paths = { 8 | 1: "s3://coiled-runtime-ci/tpc-h/snappy/scale-1/", 9 | 10: "s3://coiled-runtime-ci/tpc-h/snappy/scale-10/", 10 | 100: "s3://coiled-runtime-ci/tpc-h/snappy/scale-100/", 11 | 1000: "s3://coiled-runtime-ci/tpc-h/snappy/scale-1000/", 12 | 10000: "s3://coiled-runtime-ci/tpc-h/snappy/scale-10000/", 13 | } 14 | local_paths = { 15 | 1: "./tpch-data/scale-1/", 16 | 10: "./tpch-data/scale-10/", 17 | 100: "./tpch-data/scale-100/", 18 | } 19 | 20 | if local: 21 | return local_paths[scale] 22 | else: 23 | return remote_paths[scale] 24 | 25 | 26 | def get_answers_path(local, scale): 27 | if local: 28 | return f"./tpch-data/answers/scale-{scale}/" 29 | return f"s3://coiled-runtime-ci/tpc-h/answers/scale-{scale}/" 30 | 31 | 32 | def get_bucket_region(path: str): 33 | if not path.startswith("s3://"): 34 | raise ValueError(f"'{path}' is not an S3 path") 35 | bucket = path.replace("s3://", "").split("/")[0] 36 | resp = boto3.client("s3").get_bucket_location(Bucket=bucket) 37 | # Buckets in region 'us-east-1' results in None, b/c why not. 38 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/get_bucket_location.html#S3.Client.get_bucket_location 39 | return resp["LocationConstraint"] or "us-east-1" 40 | 41 | 42 | def get_cluster_spec(scale: int, shutdown_on_close: bool) -> dict[str, Any]: 43 | everywhere = dict( 44 | idle_timeout="1h", 45 | wait_for_workers=True, 46 | scheduler_vm_types=["m6i.2xlarge"], 47 | shutdown_on_close=shutdown_on_close, 48 | ) 49 | 50 | if scale == 1: 51 | return { 52 | "worker_vm_types": ["m6i.large"], 53 | "n_workers": 4, 54 | **everywhere, 55 | } 56 | if scale == 10: 57 | return { 58 | "worker_vm_types": ["m6i.large"], 59 | "n_workers": 8, 60 | **everywhere, 61 | } 62 | elif scale == 100: 63 | return { 64 | "worker_vm_types": ["m6i.large"], 65 | "n_workers": 16, 66 | **everywhere, 67 | } 68 | elif scale == 1000: 69 | return { 70 | "worker_vm_types": ["m6i.xlarge"], 71 | "n_workers": 32, 72 | "worker_disk_size": 128, 73 | **everywhere, 74 | } 75 | elif scale == 10000: 76 | return { 77 | "worker_vm_types": ["m6i.xlarge"], 78 | "n_workers": 32 * 10, 79 | "worker_disk_size": 100, 80 | **everywhere, 81 | } 82 | 83 | 84 | def get_single_vm_spec(scale): 85 | if scale == 1: 86 | return { 87 | "vm_type": "m6i.2xlarge", 88 | } 89 | if scale == 10: 90 | return { 91 | "vm_type": "m6i.4xlarge", 92 | } 93 | elif scale == 100: 94 | return { 95 | "vm_type": "m6i.8xlarge", 96 | } 97 | elif scale == 1000: 98 | return { 99 | "vm_type": "m6i.32xlarge", 100 | } 101 | elif scale == 10000: 102 | return { 103 | "vm_type": "m6i.32xlarge", 104 | "disk_size": 1000, 105 | } 106 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/satellite_filtering.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Literal 3 | 4 | import fsspec 5 | import geojson 6 | import odc.stac 7 | import planetary_computer 8 | import pystac_client 9 | import xarray as xr 10 | 11 | 12 | def harmonize_to_old(data: xr.Dataset) -> xr.Dataset: 13 | """ 14 | Harmonize new Sentinel-2 data to the old baseline. 
15 | 16 | Parameters 17 | ---------- 18 | data: 19 | A Dataset with various bands as data variables and three dimensions: time, y, x 20 | 21 | Returns 22 | ------- 23 | harmonized: xarray.Dataset 24 | A Dataset with all values harmonized to the old 25 | processing baseline. 26 | """ 27 | cutoff = datetime.datetime(2022, 1, 25) 28 | offset = 1000 29 | bands = [ 30 | "B01", 31 | "B02", 32 | "B03", 33 | "B04", 34 | "B05", 35 | "B06", 36 | "B07", 37 | "B08", 38 | "B8A", 39 | "B09", 40 | "B10", 41 | "B11", 42 | "B12", 43 | ] 44 | 45 | to_process = list(set(bands) & set(list(data.data_vars))) 46 | old = data.sel(time=slice(cutoff))[to_process] 47 | 48 | new = data.sel(time=slice(cutoff, None)).drop_vars(to_process) 49 | 50 | new_harmonized = data.sel(time=slice(cutoff, None))[to_process].clip(offset) 51 | new_harmonized -= offset 52 | 53 | new = xr.merge([new, new_harmonized]) 54 | return xr.concat([old, new], dim="time") 55 | 56 | 57 | def satellite_filtering( 58 | scale: Literal["small", "medium", "large"], 59 | storage_url: str, 60 | ): 61 | catalog = pystac_client.Client.open( 62 | "https://planetarycomputer.microsoft.com/api/stac/v1", 63 | modifier=planetary_computer.sign_inplace, 64 | ) 65 | 66 | # GeoJSON for region of interest is from https://github.com/isellsoap/deutschlandGeoJSON/tree/main/1_deutschland 67 | with fsspec.open( 68 | "https://raw.githubusercontent.com/isellsoap/deutschlandGeoJSON/main/1_deutschland/3_mittel.geo.json" 69 | ) as f: 70 | gj = geojson.load(f) 71 | 72 | # Flatten MultiPolygon to single Polygon 73 | coordinates = [] 74 | for x in gj.features[0]["geometry"]["coordinates"]: 75 | coordinates.extend(x) 76 | area_of_interest = { 77 | "type": "Polygon", 78 | "coordinates": coordinates, 79 | } 80 | 81 | # Get stack items 82 | if scale == "small": 83 | time_of_interest = "2024-01-01/2024-09-01" 84 | else: 85 | time_of_interest = "2015-01-01/2024-09-01" 86 | 87 | search = catalog.search( 88 | collections=["sentinel-2-l2a"], 89 | intersects=area_of_interest, 90 | datetime=time_of_interest, 91 | ) 92 | items = search.item_collection() 93 | 94 | # Construct Xarray Dataset from stack items 95 | ds = odc.stac.load( 96 | items, 97 | chunks={}, 98 | patch_url=planetary_computer.sign, 99 | resolution=40, 100 | crs="EPSG:3857", 101 | groupby="solar_day", 102 | ) 103 | # See https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a#Baseline-Change 104 | ds = harmonize_to_old(ds) 105 | 106 | # Compute humidity index 107 | humidity = (ds.B08 - ds.B11) / (ds.B08 + ds.B11) 108 | 109 | result = humidity.groupby("time.month").mean() 110 | return result.to_zarr(storage_url, compute=False) 111 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 
16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # the output encoding used when revision files 55 | # are written from script.py.mako 56 | # output_encoding = utf-8 57 | 58 | sqlalchemy.url = sqlite:///benchmark.db 59 | 60 | 61 | [post_write_hooks] 62 | # post_write_hooks defines scripts or Python functions that are run 63 | # on newly generated revision scripts. See the documentation for further 64 | # detail and examples 65 | 66 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 67 | # hooks = black 68 | # black.type = console_scripts 69 | # black.entrypoint = black 70 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 71 | 72 | # Logging configuration 73 | [loggers] 74 | keys = root,sqlalchemy,alembic 75 | 76 | [handlers] 77 | keys = console 78 | 79 | [formatters] 80 | keys = generic 81 | 82 | [logger_root] 83 | level = WARN 84 | handlers = console 85 | qualname = 86 | 87 | [logger_sqlalchemy] 88 | level = WARN 89 | handlers = 90 | qualname = sqlalchemy.engine 91 | 92 | [logger_alembic] 93 | level = INFO 94 | handlers = 95 | qualname = alembic 96 | 97 | [handler_console] 98 | class = StreamHandler 99 | args = (sys.stderr,) 100 | level = NOTSET 101 | formatter = generic 102 | 103 | [formatter_generic] 104 | format = %(levelname)-5.5s [%(name)s] %(message)s 105 | datefmt = %H:%M:%S 106 | -------------------------------------------------------------------------------- /tests/tpch/visualize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0cdaec38-4a9e-4a25-b45e-1188903d219d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Plot TPC-H results\n", 9 | "\n", 10 | "This currently assumes that benchmarks have been run and have populated benchmark.db. 
It also assumes that that database has only those results and from only one run (this is usually a bad assumption).\n", 11 | "\n", 12 | "```\n", 13 | "rm benchmark.db\n", 14 | "pytest --benchmark tests/tpch\n", 15 | "```" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "c7ec3d43-3a70-4666-9552-04d82ac42a31", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "\n", 27 | "df = pd.read_sql_table(table_name=\"test_run\", con=\"sqlite:///../../benchmark.db\")\n", 28 | "\n", 29 | "df = df[\n", 30 | " (df.call_outcome == \"passed\")\n", 31 | " & (df.path.str.contains(\"^tpch/test_(?:dask|duckdb|polars|pyspark)\"))\n", 32 | " & df.cluster_name\n", 33 | "]\n", 34 | "df = df[[\"path\", \"name\", \"duration\", \"start\", \"cluster_name\"]]\n", 35 | "\n", 36 | "df[\"library\"] = df.path.map(lambda path: path.split(\"_\")[-1].split(\".\")[0])\n", 37 | "df[\"query\"] = df.name.map(lambda name: int(name.split(\"_\")[-1]))\n", 38 | "df[\"name\"] = df.cluster_name.map(lambda name: name.split(\"-\", 3)[-1])\n", 39 | "df[\"scale\"] = df.cluster_name.map(lambda name: int(name.split(\"-\")[2]))\n", 40 | "del df[\"path\"]\n", 41 | "del df[\"cluster_name\"]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "31fbdc8a-c782-4000-9e23-5488b4d04d14", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "92f926a6-cbeb-4765-b5c1-3e8ea7c71ff6", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = df.sort_values([\"query\", \"library\"])\n", 62 | "\n", 63 | "def recent(df):\n", 64 | " return df.sort_values(\"start\").iloc[-1]\n", 65 | "\n", 66 | "df = df.groupby([\"library\", \"query\"]).apply(recent).reset_index(drop=True)\n", 67 | "del df[\"start\"]\n", 68 | "df.head(10)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "34830787-2364-4541-8cf8-8adffbde9148", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import altair as alt\n", 79 | "\n", 80 | "chart = alt.Chart(df).mark_bar().encode(\n", 81 | " x=\"query:N\",\n", 82 | " y=\"duration:Q\",\n", 83 | " xOffset=\"library:N\",\n", 84 | " color=alt.Color('library').scale(\n", 85 | " domain=[\"dask\", \"duckdb\", \"polars\", \"pyspark\"], \n", 86 | " range=[\"#5677a4\", \"#e68b39\", \"#d4605b\", \"green\"],\n", 87 | " ),\n", 88 | " tooltip=[\"library\", \"duration\"]\n", 89 | ").properties(\n", 90 | " title=f\"TPC-H -- scale:{df.scale.iloc[0]} name:{df.name.iloc[0]}\"\n", 91 | ").configure_title(\n", 92 | " fontSize=20,\n", 93 | "\n", 94 | ")\n", 95 | "chart" 96 | ] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3 (ipykernel)", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.11.7" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 5 120 | } 121 | -------------------------------------------------------------------------------- /tests/benchmarks/test_join.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pytest 3 | 4 | from ..utils_test import cluster_memory, 
run_up_to_nthreads, timeseries_of_size, wait 5 | 6 | 7 | @pytest.mark.shuffle_p2p 8 | @run_up_to_nthreads("small_cluster", 40, reason="Does not finish") 9 | def test_join_big(small_client, memory_multiplier): 10 | memory = cluster_memory(small_client) # 76.66 GiB 11 | 12 | df1_big = timeseries_of_size( 13 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 14 | ) # 66.58 MiB partitions 15 | df1_big["predicate"] = df1_big["0"] * 1e9 16 | df1_big = df1_big.astype({"predicate": "int"}) 17 | 18 | df2_big = timeseries_of_size( 19 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 20 | ) # 66.58 MiB partitions 21 | 22 | # Control cardinality on column to join - this produces cardinality ~ to len(df) 23 | df2_big["predicate"] = df2_big["0"] * 1e9 24 | df2_big = df2_big.astype({"predicate": "int"}) 25 | 26 | join = df1_big.merge(df2_big, on="predicate", how="inner") 27 | # dask.dataframe will drop all columns except the Index for size 28 | # computations, which will optimize itself through merges, e.g. 29 | # shuffling a lot less data than what we want to test 30 | # map_partitions blocks those optimizations 31 | join = join.map_partitions(lambda x: x) 32 | result = join.size 33 | wait(result, small_client, 20 * 60) 34 | 35 | 36 | def test_join_big_small(small_client, memory_multiplier, configure_shuffling): 37 | if memory_multiplier == 0.1: 38 | raise pytest.skip(reason="Too noisy; not adding anything to multiplier=1") 39 | 40 | memory = cluster_memory(small_client) # 76.66 GiB 41 | 42 | df_big = timeseries_of_size( 43 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 44 | ) # 66.58 MiB partitions 45 | 46 | # Control cardinality on column to join - this produces cardinality ~ to len(df) 47 | df_big["predicate"] = df_big["0"] * 1e9 48 | df_big = df_big.astype({"predicate": "int"}) 49 | 50 | df_small = timeseries_of_size( 51 | "100 MB", dtypes={str(i): float for i in range(100)} 52 | ) # make it obviously small 53 | 54 | df_small["predicate"] = df_small["0"] * 1e9 55 | df_small_pd = df_small.astype({"predicate": "int"}).compute() 56 | 57 | join = df_big.merge(df_small_pd, on="predicate", how="inner") 58 | # dask.dataframe will drop all columns except the Index for size 59 | # computations, which will optimize itself through merges, e.g. 60 | # shuffling a lot less data than what we want to test 61 | # map_partitions blocks those optimizations 62 | join = join.map_partitions(lambda x: x) 63 | result = join.size 64 | wait(result, small_client, 20 * 60) 65 | 66 | 67 | @pytest.mark.shuffle_p2p 68 | @pytest.mark.parametrize("persist", [True, False]) 69 | def test_set_index(small_client, persist, memory_multiplier): 70 | memory = cluster_memory(small_client) # 76.66 GiB 71 | 72 | df_big = timeseries_of_size( 73 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 74 | ) # 66.58 MiB partitions 75 | df_big["predicate"] = df_big["0"] * 1e9 76 | df_big = df_big.astype({"predicate": "int"}) 77 | if persist: 78 | df_big = df_big.persist() 79 | df_indexed = df_big.set_index("0") 80 | # dask.dataframe will drop all columns except the Index for size 81 | # computations, which will optimize itself through set_index, e.g. 
82 | # shuffling a lot less data than what we want to test 83 | # map_partitions blocks those optimizations 84 | df_indexed = df_indexed.map_partitions(lambda x: x) 85 | wait(df_indexed.size, small_client, 20 * 60) 86 | 87 | 88 | @pytest.mark.client("uber_lyft_large") 89 | def test_set_index_on_uber_lyft(client, configure_shuffling): 90 | df = dd.read_parquet( 91 | "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True} 92 | ) 93 | result = df.set_index("PULocationID") 94 | wait(result, client, 20 * 60) 95 | -------------------------------------------------------------------------------- /.github/workflows/tpch.yml: -------------------------------------------------------------------------------- 1 | name: TPC-H Benchmarks 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | scale: 6 | description: 'Scale Factor' 7 | required: true 8 | default: 10000 9 | type: choice 10 | options: 11 | - 10000 12 | - 1000 13 | - 100 14 | - 10 15 | - 1 16 | dask: 17 | description: 'Dask' 18 | required: true 19 | default: true 20 | type: boolean 21 | duckdb: 22 | description: 'DuckDB' 23 | required: true 24 | default: true 25 | type: boolean 26 | polars: 27 | description: 'Polars' 28 | required: true 29 | default: false 30 | type: boolean 31 | pyspark: 32 | description: 'PySpark' 33 | required: true 34 | default: true 35 | type: boolean 36 | 37 | defaults: 38 | # Required shell entrypoint to have properly activated conda environments 39 | run: 40 | shell: bash -l {0} 41 | 42 | jobs: 43 | tpch: 44 | name: TPC-H 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v4 50 | 51 | - name: Set up environment 52 | uses: conda-incubator/setup-miniconda@v3 53 | with: 54 | miniforge-version: latest 55 | use-mamba: true 56 | condarc-file: ci/condarc 57 | python-version: "3.10" 58 | environment-file: ci/environment.yml 59 | conda-remove-defaults: "true" 60 | 61 | - name: Add TPC-H non-dask dependencies 62 | run: mamba env update --file ci/environment-tpch-nondask.yml 63 | 64 | - name: Upgrade dask to git tip 65 | run: mamba env update --file ci/environment-git-tip.yml 66 | 67 | - name: Add test dependencies 68 | run: mamba env update --file ci/environment-test.yml 69 | 70 | - name: Dump environment 71 | run: | 72 | # For debugging 73 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 74 | mamba env export | grep -E -v '^prefix:.*$' 75 | 76 | - name: Add Dask to benchmark if enabled 77 | if: ${{ inputs.dask }} 78 | run: | 79 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_dask.py" >> $GITHUB_ENV 80 | 81 | - name: Add DuckDB to benchmark if enabled 82 | if: ${{ inputs.duckdb }} 83 | run: | 84 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_duckdb.py" >> $GITHUB_ENV 85 | 86 | - name: Add Polars to benchmark if enabled 87 | if: ${{ inputs.polars }} 88 | run: | 89 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_polars.py" >> $GITHUB_ENV 90 | 91 | - name: Add PySpark to benchmark if enabled 92 | if: ${{ inputs.pyspark }} 93 | run: | 94 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_pyspark.py" >> $GITHUB_ENV 95 | 96 | - name: Run TPC-H benchmarks (except polars) 97 | env: 98 | DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} 99 | AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} 100 | AWS_DEFAULT_REGION: us-east-2 # this is needed for boto for some reason 101 | AWS_SECRET_ACCESS_KEY: ${{ 
secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} 102 | COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} 103 | DB_NAME: tpch_${{ inputs.scale }}.db 104 | DASK_DATAFRAME__QUERY_PLANNING: True 105 | run: | 106 | pytest --benchmark \ 107 | ${{ env.PYTEST_BENCHMARKS }} \ 108 | -n 4 --dist loadscope \ 109 | --scale ${{ inputs.scale }} \ 110 | 111 | - name: Upload benchmark results 112 | uses: actions/upload-artifact@v4 113 | if: always() 114 | with: 115 | name: tpch-benchmark 116 | path: | 117 | tpch_${{ inputs.scale }}.db 118 | mamba_env_export.yml 119 | -------------------------------------------------------------------------------- /tests/workflows/test_snowflake.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | 4 | import dask.dataframe as dd 5 | import pandas as pd 6 | import pytest 7 | 8 | pytestmark = pytest.mark.workflows 9 | 10 | pytest.skip( 11 | reason="https://github.com/coiled/benchmarks/issues/1341", allow_module_level=True 12 | ) 13 | 14 | pytest.importorskip("dask_snowflake") 15 | pytest.importorskip("sqlalchemy") 16 | 17 | from dask_snowflake import read_snowflake, to_snowflake # noqa: E402 18 | from snowflake.sqlalchemy import URL # noqa: E402 19 | from sqlalchemy import create_engine # noqa: E402 20 | 21 | 22 | @pytest.fixture(scope="module") 23 | def connection_kwargs(): 24 | return { 25 | "user": os.environ["SNOWFLAKE_USER"], 26 | "password": os.environ["SNOWFLAKE_PASSWORD"], 27 | "account": os.environ["SNOWFLAKE_ACCOUNT"], 28 | "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"], 29 | "role": os.environ.get("SNOWFLAKE_ROLE", "public"), 30 | "database": os.environ.get("SNOWFLAKE_DATABASE") or "testdb", 31 | "schema": os.environ.get("SNOWFLAKE_SCHEMA") or "testschema", 32 | } 33 | 34 | 35 | @pytest.fixture 36 | def table(connection_kwargs): 37 | """Connect to snowflake and create table""" 38 | name = f"citibike_tripdata_{uuid.uuid4().hex}" 39 | engine = create_engine(URL(**connection_kwargs)) 40 | engine.execute(f"DROP TABLE IF EXISTS {name}") 41 | engine.execute( 42 | f"""create table if not exists {name} ( 43 | ride_id varchar not null unique, 44 | rideable_type varchar not null, 45 | started_at timestamp not null, 46 | ended_at timestamp not null, 47 | start_station_name varchar not null, 48 | start_station_id smallint not null, 49 | end_station_name varchar not null, 50 | end_station_id smallint not null, 51 | start_lat number, 52 | start_lng number, 53 | end_lat number, 54 | end_lng number, 55 | is_member boolean not null 56 | )""" 57 | ) 58 | yield name 59 | # after the data is written, delete table 60 | engine.execute(f"DROP TABLE IF EXISTS {name}") 61 | 62 | 63 | @pytest.mark.client("snowflake") 64 | def test_etl_into_snowflake(client, connection_kwargs, table): 65 | csv_paths = [ 66 | f"s3://tripdata/{ts.year}{ts.month:02}-*-*.csv.zip" 67 | for ts in pd.date_range(start="2022-01-01", end="2023-03-01", freq="MS") 68 | ] 69 | 70 | # preprocess data 71 | def safe_int(x): 72 | """Some station IDs are not correct integers""" 73 | try: 74 | return int(float(x)) 75 | except Exception: 76 | # if station ID is not an int, return -1 77 | return -1 78 | 79 | ddf = dd.read_csv( 80 | csv_paths, 81 | compression="zip", 82 | blocksize=None, 83 | converters={"start_station_id": safe_int, "end_station_id": safe_int}, 84 | storage_options={"anon": True}, 85 | ) 86 | 87 | # filter out incorrect station IDs 88 | ddf = ddf[(ddf.start_station_id != -1) & (ddf.end_station_id != -1)].reset_index( 89 | drop=True 90 | ) 91 | 
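    # --- Editor's aside (illustrative, not part of the original workflow) ---
    # safe_int() above tolerates float-formatted and malformed station IDs, e.g.
    #   safe_int("6432.08") -> 6432
    #   safe_int("SYS016")  -> -1   (such rows are dropped by the filter above)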
92 | # create boolean is_member and drop member_casual 93 | ddf["is_member"] = ddf.member_casual == "member" 94 | 95 | ddf = ddf.drop(columns="member_casual") 96 | 97 | # repartition to ensure even chunks 98 | ddf = ddf.repartition(partition_size="100Mb") 99 | 100 | # save data to Snowflake 101 | to_snowflake(ddf, name=table, connection_kwargs=connection_kwargs) 102 | 103 | 104 | @pytest.mark.client("snowflake") 105 | def test_read(client, connection_kwargs): 106 | """Read and explore NYC bike dataset from Snowflake""" 107 | table = "citibike_tripdata" # persistent table 108 | 109 | df = read_snowflake( 110 | f"SELECT * FROM {table}", 111 | connection_kwargs=connection_kwargs, 112 | partition_size="100MiB", 113 | ) 114 | df["IS_MEMBER"].mean().compute() 115 | -------------------------------------------------------------------------------- /tests/tpch/test_dask.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from tests.tpch.utils import get_dataset_path 6 | 7 | pytestmark = pytest.mark.tpch_dask 8 | 9 | dd = pytest.importorskip("dask.dataframe") 10 | 11 | 12 | from . import dask_queries # noqa: E402 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def dataset_path(local, scale): 17 | if local: 18 | # FIXME: pyarrow local fs is a bit odd. dask.dataframe should deal with this 19 | return "file://" + os.path.abspath(get_dataset_path(local, scale)) + "/" 20 | else: 21 | return get_dataset_path(local, scale) 22 | 23 | 24 | @pytest.mark.shuffle_p2p 25 | def test_query_01(client, dataset_path, fs, scale): 26 | dask_queries.query_01(dataset_path, fs, scale).compute() 27 | 28 | 29 | @pytest.mark.shuffle_p2p 30 | def test_query_02(client, dataset_path, fs, scale): 31 | dask_queries.query_02(dataset_path, fs, scale).compute() 32 | 33 | 34 | @pytest.mark.shuffle_p2p 35 | def test_query_03(client, dataset_path, fs, scale): 36 | dask_queries.query_03(dataset_path, fs, scale).compute() 37 | 38 | 39 | @pytest.mark.shuffle_p2p 40 | def test_query_04(client, dataset_path, fs, scale): 41 | dask_queries.query_04(dataset_path, fs, scale).compute() 42 | 43 | 44 | @pytest.mark.shuffle_p2p 45 | def test_query_05(client, dataset_path, fs, scale): 46 | dask_queries.query_05(dataset_path, fs, scale).compute() 47 | 48 | 49 | def test_query_06(client, dataset_path, fs, scale): 50 | dask_queries.query_06(dataset_path, fs, scale).compute() 51 | 52 | 53 | @pytest.mark.shuffle_p2p 54 | def test_query_07(client, dataset_path, fs, scale): 55 | dask_queries.query_07(dataset_path, fs, scale).compute() 56 | 57 | 58 | @pytest.mark.shuffle_p2p 59 | def test_query_08(client, dataset_path, fs, scale): 60 | dask_queries.query_08(dataset_path, fs, scale).compute() 61 | 62 | 63 | @pytest.mark.shuffle_p2p 64 | def test_query_09(client, dataset_path, fs, scale): 65 | dask_queries.query_09(dataset_path, fs, scale).compute() 66 | 67 | 68 | @pytest.mark.shuffle_p2p 69 | def test_query_10(client, dataset_path, fs, scale): 70 | dask_queries.query_10(dataset_path, fs, scale).compute() 71 | 72 | 73 | @pytest.mark.shuffle_p2p 74 | def test_query_11(client, dataset_path, fs, scale): 75 | dask_queries.query_11(dataset_path, fs, scale).compute() 76 | 77 | 78 | @pytest.mark.shuffle_p2p 79 | def test_query_12(client, dataset_path, fs, scale): 80 | dask_queries.query_12(dataset_path, fs, scale).compute() 81 | 82 | 83 | @pytest.mark.shuffle_p2p 84 | def test_query_13(client, dataset_path, fs, scale): 85 | dask_queries.query_13(dataset_path, fs, scale).compute() 86 | 
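# --- Editor's aside (sketch only, not part of the original file) ---
# The 22 per-query tests in this module all follow the same pattern; a single
# parametrized test would be roughly equivalent (modulo per-query marks such as
# shuffle_p2p, which test_query_06 omits), e.g.:
#
#   @pytest.mark.parametrize("query", range(1, 23))
#   def test_query(query, client, dataset_path, fs, scale):
#       getattr(dask_queries, f"query_{query:02d}")(dataset_path, fs, scale).compute()
#
# The explicit functions are presumably kept for readable, stable test names.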
87 | 88 | @pytest.mark.shuffle_p2p 89 | def test_query_14(client, dataset_path, fs, scale): 90 | dask_queries.query_14(dataset_path, fs, scale).compute() 91 | 92 | 93 | @pytest.mark.shuffle_p2p 94 | def test_query_15(client, dataset_path, fs, scale): 95 | dask_queries.query_15(dataset_path, fs, scale).compute() 96 | 97 | 98 | @pytest.mark.shuffle_p2p 99 | def test_query_16(client, dataset_path, fs, scale): 100 | dask_queries.query_16(dataset_path, fs, scale).compute() 101 | 102 | 103 | @pytest.mark.shuffle_p2p 104 | def test_query_17(client, dataset_path, fs, scale): 105 | dask_queries.query_17(dataset_path, fs, scale).compute() 106 | 107 | 108 | @pytest.mark.shuffle_p2p 109 | def test_query_18(client, dataset_path, fs, scale): 110 | dask_queries.query_18(dataset_path, fs, scale).compute() 111 | 112 | 113 | @pytest.mark.shuffle_p2p 114 | def test_query_19(client, dataset_path, fs, scale): 115 | dask_queries.query_19(dataset_path, fs, scale).compute() 116 | 117 | 118 | @pytest.mark.shuffle_p2p 119 | def test_query_20(client, dataset_path, fs, scale): 120 | dask_queries.query_20(dataset_path, fs, scale).compute() 121 | 122 | 123 | @pytest.mark.shuffle_p2p 124 | def test_query_21(client, dataset_path, fs, scale): 125 | dask_queries.query_21(dataset_path, fs, scale).compute() 126 | 127 | 128 | @pytest.mark.shuffle_p2p 129 | def test_query_22(client, dataset_path, fs, scale): 130 | dask_queries.query_22(dataset_path, fs, scale).compute() 131 | -------------------------------------------------------------------------------- /tests/workflows/test_from_csv_to_parquet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import dask.dataframe as dd 4 | import pytest 5 | 6 | pytestmark = pytest.mark.workflows 7 | 8 | 9 | SCHEMA = OrderedDict( 10 | [ 11 | ("GlobalEventID", "Int64"), 12 | ("Day", "Int64"), 13 | ("MonthYear", "Int64"), 14 | ("Year", "Int64"), 15 | ("FractionDate", "float64"), 16 | ("Actor1Code", "string[pyarrow]"), 17 | ("Actor1Name", "string[pyarrow]"), 18 | ("Actor1CountryCode", "string[pyarrow]"), 19 | ("Actor1KnownGroupCode", "string[pyarrow]"), 20 | ("Actor1EthnicCode", "string[pyarrow]"), 21 | ("Actor1Religion1Code", "string[pyarrow]"), 22 | ("Actor1Religion2Code", "string[pyarrow]"), 23 | ("Actor1Type1Code", "string[pyarrow]"), 24 | ("Actor1Type2Code", "string[pyarrow]"), 25 | ("Actor1Type3Code", "string[pyarrow]"), 26 | ("Actor2Code", "string[pyarrow]"), 27 | ("Actor2Name", "string[pyarrow]"), 28 | ("Actor2CountryCode", "string[pyarrow]"), 29 | ("Actor2KnownGroupCode", "string[pyarrow]"), 30 | ("Actor2EthnicCode", "string[pyarrow]"), 31 | ("Actor2Religion1Code", "string[pyarrow]"), 32 | ("Actor2Religion2Code", "string[pyarrow]"), 33 | ("Actor2Type1Code", "string[pyarrow]"), 34 | ("Actor2Type2Code", "string[pyarrow]"), 35 | ("Actor2Type3Code", "string[pyarrow]"), 36 | ("IsRootEvent", "Int64"), 37 | ("EventCode", "string[pyarrow]"), 38 | ("EventBaseCode", "string[pyarrow]"), 39 | ("EventRootCode", "string[pyarrow]"), 40 | ("QuadClass", "Int64"), 41 | ("GoldsteinScale", "float64"), 42 | ("NumMentions", "Int64"), 43 | ("NumSources", "Int64"), 44 | ("NumArticles", "Int64"), 45 | ("AvgTone", "float64"), 46 | ("Actor1Geo_Type", "Int64"), 47 | ("Actor1Geo_Fullname", "string[pyarrow]"), 48 | ("Actor1Geo_CountryCode", "string[pyarrow]"), 49 | ("Actor1Geo_ADM1Code", "string[pyarrow]"), 50 | ("Actor1Geo_Lat", "float64"), 51 | ("Actor1Geo_Long", "float64"), 52 | ("Actor1Geo_FeatureID", "string[pyarrow]"), 53 | 
("Actor2Geo_Type", "Int64"), 54 | ("Actor2Geo_Fullname", "string[pyarrow]"), 55 | ("Actor2Geo_CountryCode", "string[pyarrow]"), 56 | ("Actor2Geo_ADM1Code", "string[pyarrow]"), 57 | ("Actor2Geo_Lat", "float64"), 58 | ("Actor2Geo_Long", "float64"), 59 | ("Actor2Geo_FeatureID", "string[pyarrow]"), 60 | ("ActionGeo_Type", "Int64"), 61 | ("ActionGeo_Fullname", "string[pyarrow]"), 62 | ("ActionGeo_CountryCode", "string[pyarrow]"), 63 | ("ActionGeo_ADM1Code", "string[pyarrow]"), 64 | ("ActionGeo_Lat", "float64"), 65 | ("ActionGeo_Long", "float64"), 66 | ("ActionGeo_FeatureID", "string[pyarrow]"), 67 | ("DATEADDED", "Int64"), 68 | ("SOURCEURL", "string[pyarrow]"), 69 | ] 70 | ) 71 | 72 | 73 | @pytest.mark.client("from_csv_to_parquet") 74 | def test_from_csv_to_parquet(client, s3_factory, s3_url): 75 | s3 = s3_factory(anon=True) 76 | files = s3.ls("s3://gdelt-open-data/events/")[:1000] 77 | files = [f"s3://{f}" for f in files] 78 | 79 | df = dd.read_csv( 80 | files, 81 | sep="\t", 82 | names=SCHEMA.keys(), 83 | # 'dtype' and 'converters' cannot overlap 84 | dtype={col: dtype for col, dtype in SCHEMA.items() if dtype != "float64"}, 85 | storage_options=s3.storage_options, 86 | on_bad_lines="skip", 87 | # Some bad files have '#' in float values 88 | converters={ 89 | col: lambda v: float(v.replace("#", "") or "NaN") 90 | for col, dtype in SCHEMA.items() 91 | if dtype == "float64" 92 | }, 93 | ) 94 | 95 | # Now we can safely convert the float columns 96 | df = df.astype({col: dtype for col, dtype in SCHEMA.items() if dtype == "float64"}) 97 | 98 | df = df.map_partitions( 99 | lambda xdf: xdf.drop_duplicates(subset=["SOURCEURL"], keep="first") 100 | ) 101 | df["national_paper"] = df.SOURCEURL.str.contains( 102 | "washingtonpost|nytimes", regex=True 103 | ) 104 | df = df[df["national_paper"]] 105 | df.to_parquet(f"{s3_url}/from-csv-to-parquet/", write_index=False) 106 | -------------------------------------------------------------------------------- /tests/benchmarks/test_work_stealing.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import dask.array as da 4 | import distributed 5 | import numpy as np 6 | import pytest 7 | from coiled import Cluster 8 | from dask import delayed, utils 9 | from distributed import Client 10 | from packaging.version import Version 11 | from tornado.ioloop import PeriodicCallback 12 | 13 | from ..utils_test import run_up_to_nthreads 14 | 15 | 16 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 17 | def test_trivial_workload_should_not_cause_work_stealing(small_client): 18 | root = delayed(lambda n: "x" * n)(utils.parse_bytes("1MiB"), dask_key_name="root") 19 | results = [delayed(lambda *args: None)(root, i) for i in range(10000)] 20 | futs = small_client.compute(results) 21 | small_client.gather(futs) 22 | 23 | 24 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 25 | def test_work_stealing_on_inhomogeneous_workload(small_client): 26 | np.random.seed(42) 27 | delays = np.random.lognormal(1, 1.3, 500) 28 | 29 | @delayed 30 | def clog(n): 31 | time.sleep(min(n, 60)) 32 | return n 33 | 34 | results = [clog(i) for i in delays] 35 | futs = small_client.compute(results) 36 | small_client.gather(futs) 37 | 38 | 39 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 40 | @pytest.mark.xfail( 41 | Version(distributed.__version__) < Version("2022.6.1"), 42 | reason="https://github.com/dask/distributed/issues/6624", 43 | ) 44 | def test_work_stealing_on_scaling_up( 45 | 
test_name_uuid, 46 | benchmark_all, 47 | cluster_kwargs, 48 | dask_env_variables, 49 | github_cluster_tags, 50 | ): 51 | with Cluster( 52 | name=test_name_uuid, 53 | environ=dask_env_variables, 54 | tags=github_cluster_tags, 55 | **cluster_kwargs["test_work_stealing_on_scaling_up"], 56 | ) as cluster: 57 | with Client(cluster) as client: 58 | # FIXME https://github.com/coiled/platform/issues/103 59 | client.wait_for_workers(1, timeout=300) 60 | with benchmark_all(client): 61 | # Slow task. 62 | def func1(chunk): 63 | if sum(chunk.shape) != 0: # Make initialization fast 64 | time.sleep(5) 65 | return chunk 66 | 67 | def func2(chunk): 68 | return chunk 69 | 70 | data = da.zeros((30, 30, 30), chunks=5) 71 | result = data.map_overlap(func1, depth=1, dtype=data.dtype) 72 | result = result.map_overlap(func2, depth=1, dtype=data.dtype) 73 | future = client.compute(result) 74 | 75 | print("started computation") 76 | 77 | time.sleep(11) 78 | # print('scaling to 4 workers') 79 | # client.cluster.scale(4) 80 | 81 | time.sleep(5) 82 | print("scaling to 20 workers") 83 | cluster.scale(20) 84 | 85 | _ = future.result() 86 | 87 | 88 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 89 | def test_work_stealing_on_straggling_worker( 90 | test_name_uuid, 91 | benchmark_all, 92 | cluster_kwargs, 93 | dask_env_variables, 94 | github_cluster_tags, 95 | ): 96 | kwargs = cluster_kwargs["test_work_stealing_on_straggling_worker"] 97 | with Cluster( 98 | name=test_name_uuid, 99 | environ=dask_env_variables, 100 | tags=github_cluster_tags, 101 | **kwargs, 102 | ) as cluster: 103 | with Client(cluster) as client: 104 | # FIXME https://github.com/coiled/platform/issues/103 105 | client.wait_for_workers(kwargs["n_workers"], timeout=600) 106 | with benchmark_all(client): 107 | 108 | def clog(): 109 | time.sleep(1) 110 | 111 | @delayed 112 | def slowinc(i, delay): 113 | time.sleep(delay) 114 | return i + 1 115 | 116 | def install_clogging_callback(dask_worker): 117 | pc = PeriodicCallback(clog, 1500) 118 | dask_worker.periodic_callbacks["clog"] = pc 119 | pc.start() 120 | 121 | straggler = list(client.scheduler_info()["workers"].keys())[0] 122 | client.run(install_clogging_callback, workers=[straggler]) 123 | results = [slowinc(i, delay=1) for i in range(1000)] 124 | futs = client.compute(results) 125 | client.gather(futs) 126 | -------------------------------------------------------------------------------- /benchmark_schema.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Boolean, Column, DateTime, Float, Integer, String 2 | from sqlalchemy.orm import declarative_base 3 | 4 | Base = declarative_base() 5 | 6 | 7 | class TestRun(Base): 8 | __tablename__ = "test_run" 9 | 10 | # unique run ID 11 | id = Column(Integer, primary_key=True) 12 | 13 | # pytest data 14 | session_id = Column(String, nullable=False) 15 | name = Column(String, nullable=False) 16 | originalname = Column(String, nullable=False) 17 | path = Column(String, nullable=True) 18 | setup_outcome = Column(String, nullable=True) 19 | call_outcome = Column(String, nullable=True) 20 | teardown_outcome = Column(String, nullable=True) 21 | 22 | # Runtime data 23 | coiled_runtime_version = Column(String, nullable=True) 24 | coiled_software_name = Column(String, nullable=True) 25 | dask_version = Column(String, nullable=True) 26 | dask_expr_version = Column(String, nullable=True) 27 | distributed_version = Column(String, nullable=True) 28 | python_version = Column(String, nullable=True) 
29 | platform = Column(String, nullable=True) 30 | 31 | # CI runner data 32 | ci_run_url = Column(String, nullable=True) 33 | 34 | # Wall clock data 35 | start = Column(DateTime, nullable=True) 36 | end = Column(DateTime, nullable=True) 37 | duration = Column(Float, nullable=True) 38 | 39 | # Memory data 40 | average_memory = Column(Float, nullable=True) 41 | peak_memory = Column(Float, nullable=True) 42 | 43 | # Durations data 44 | compute_time = Column(Float, nullable=True) 45 | disk_spill_time = Column(Float, nullable=True) 46 | serializing_time = Column(Float, nullable=True) 47 | transfer_time = Column(Float, nullable=True) 48 | 49 | # Scheduler 50 | scheduler_cpu_avg = Column(Float, nullable=True) 51 | scheduler_memory_max = Column(Float, nullable=True) 52 | 53 | # Event Loop 54 | worker_max_tick = Column(Float, nullable=True) 55 | scheduler_max_tick = Column(Float, nullable=True) 56 | 57 | # Cluster name/id/details_url 58 | cluster_name = Column(String, nullable=True) 59 | cluster_id = Column(Integer, nullable=True) 60 | cluster_details_url = Column(String, nullable=True) 61 | 62 | # Artifacts 63 | performance_report_url = Column(String, nullable=True) # Not yet collected 64 | cluster_dump_url = Column(String, nullable=True) 65 | memray_profiles_url = Column(String, nullable=True) 66 | py_spy_profiles_url = Column(String, nullable=True) 67 | 68 | 69 | class TPCHRun(Base): 70 | __tablename__ = "tpch_run" 71 | 72 | # unique run ID 73 | id = Column(Integer, primary_key=True) 74 | 75 | # pytest data 76 | session_id = Column(String, nullable=False) 77 | name = Column(String, nullable=False) 78 | originalname = Column(String, nullable=False) 79 | path = Column(String, nullable=True) 80 | setup_outcome = Column(String, nullable=True) 81 | call_outcome = Column(String, nullable=True) 82 | teardown_outcome = Column(String, nullable=True) 83 | 84 | # Runtime data 85 | dask_version = Column(String, nullable=True) 86 | dask_expr_version = Column(String, nullable=True) 87 | distributed_version = Column(String, nullable=True) 88 | duckdb_version = Column(String, nullable=True) 89 | pyspark_version = Column(String, nullable=True) 90 | polars_version = Column(String, nullable=True) 91 | 92 | python_version = Column(String, nullable=True) 93 | platform = Column(String, nullable=True) 94 | 95 | # CI runner data 96 | ci_run_url = Column(String, nullable=True) 97 | 98 | # Wall clock data 99 | start = Column(DateTime, nullable=True) 100 | end = Column(DateTime, nullable=True) 101 | duration = Column(Float, nullable=True) 102 | 103 | # Memory data 104 | average_memory = Column(Float, nullable=True) 105 | peak_memory = Column(Float, nullable=True) 106 | 107 | # Cluster name/id/details_url 108 | cluster_name = Column(String, nullable=True) 109 | cluster_id = Column(Integer, nullable=True) 110 | cluster_details_url = Column(String, nullable=True) 111 | 112 | scale = Column(Integer, nullable=False) 113 | query = Column(Integer, nullable=False) 114 | local = Column(Boolean, nullable=False) 115 | 116 | compression = Column(String, nullable=True) 117 | partition_size = Column(String, nullable=True) 118 | partition_size = Column(String, nullable=True) 119 | 120 | n_workers = Column(Integer, nullable=True) 121 | worker_vm_type = Column(String, nullable=True) 122 | cluster_disk_size = Column(Integer, nullable=True) 123 | -------------------------------------------------------------------------------- /tests/tpch/test_correctness.py: -------------------------------------------------------------------------------- 1 
| import os 2 | 3 | import coiled 4 | import dask 5 | import pandas as pd 6 | import pytest 7 | from distributed import LocalCluster 8 | 9 | from .utils import get_answers_path, get_cluster_spec, get_dataset_path 10 | 11 | pytestmark = pytest.mark.tpch_correctness 12 | 13 | 14 | @pytest.fixture(params=[1, 10, 100], scope="session") 15 | def scale(request): 16 | scale = request.param 17 | if scale != 100: 18 | pytest.skip(reason="Don't test everything by default") 19 | return scale 20 | 21 | 22 | # Override identical fixture in conftest.py to use different scale 23 | @pytest.fixture(scope="session") 24 | def dataset_path(local, scale): 25 | return get_dataset_path(local, scale) 26 | 27 | 28 | @pytest.fixture(scope="session") 29 | def answers_path(local, scale): 30 | return get_answers_path(local, scale) 31 | 32 | 33 | # Override identical fixture in conftest.py to use different scale 34 | @pytest.fixture(scope="session") 35 | def cluster_spec(scale, shutdown_on_close): 36 | return get_cluster_spec(scale=scale, shutdown_on_close=shutdown_on_close) 37 | 38 | 39 | @pytest.fixture(scope="module") 40 | def cluster( 41 | local, 42 | scale, 43 | module, 44 | dask_env_variables, 45 | cluster_spec, 46 | github_cluster_tags, 47 | name, 48 | make_chart, 49 | ): 50 | if local: 51 | with LocalCluster() as cluster: 52 | yield cluster 53 | else: 54 | kwargs = dict( 55 | name=f"tpch-{module}-{scale}-{name}", 56 | environ=dask_env_variables, 57 | tags=github_cluster_tags, 58 | region="us-east-2", 59 | **cluster_spec, 60 | ) 61 | with dask.config.set({"distributed.scheduler.worker-saturation": "inf"}): 62 | with coiled.Cluster(**kwargs) as cluster: 63 | yield cluster 64 | 65 | 66 | @pytest.fixture 67 | def client( 68 | request, 69 | cluster, 70 | cluster_kwargs, 71 | get_cluster_info, 72 | performance_report, 73 | benchmark_time, 74 | restart, 75 | local, 76 | query, 77 | scale, 78 | ): 79 | with cluster.get_client() as client: 80 | if restart: 81 | client.restart() 82 | client.run(lambda: None) 83 | 84 | with get_cluster_info(cluster), performance_report, benchmark_time: 85 | yield client 86 | 87 | 88 | def get_expected_answer(query: int, answers_path: str, s3_storage_options): 89 | answer = pd.read_parquet( 90 | os.path.join(answers_path, f"answer_{query}.parquet"), 91 | storage_options=s3_storage_options, 92 | ) 93 | answer = answer.rename(columns=lambda x: x.strip()) 94 | if "o_orderdate" in answer.columns: 95 | answer["o_orderdate"] = pd.to_datetime(answer["o_orderdate"]) 96 | if "cntrycode" in answer.columns: 97 | answer["cntrycode"] = answer["cntrycode"].astype(str) 98 | 99 | return answer 100 | 101 | 102 | def verify_result( 103 | result: pd.DataFrame, query: int, answers_path: str, s3_storage_options 104 | ): 105 | expected = get_expected_answer(query, answers_path, s3_storage_options) 106 | 107 | for column, dtype in expected.dtypes.items(): 108 | if pd.api.types.is_object_dtype(dtype): 109 | result[column] = result[column].astype("str") 110 | expected[column] = expected[column].astype("str") 111 | # Some DuckDB results appear to be stripped, so strip them all for better comparison. 
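            # (Editor's note, illustrative: e.g. a fixed-width value such as
            # "FRANCE                   " on one side only would otherwise fail
            # the exact frame comparison below.)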
112 | result[column] = result[column].str.strip() 113 | expected[column] = expected[column].str.strip() 114 | 115 | # Query 11 is not deterministically sorted, there may be several 'ps_partkey' with the same 'value' 116 | if query == 11: 117 | assert result["value"].is_monotonic_decreasing 118 | assert expected["value"].is_monotonic_decreasing 119 | result = result.sort_values(["value", "ps_partkey"], ascending=[False, True]) 120 | expected = expected.sort_values( 121 | ["value", "ps_partkey"], ascending=[False, True] 122 | ) 123 | 124 | result = result.reset_index(drop=True) 125 | expected = expected.reset_index(drop=True) 126 | 127 | pd.testing.assert_frame_equal(result, expected, check_dtype=False, atol=1e-2) 128 | 129 | 130 | @pytest.mark.tpch_correctness 131 | @pytest.mark.parametrize( 132 | "query", 133 | [ 134 | 1, 135 | 2, 136 | 3, 137 | 4, 138 | 5, 139 | 6, 140 | 7, 141 | 8, 142 | 9, 143 | 10, 144 | 11, 145 | 12, 146 | 13, 147 | 14, 148 | 15, 149 | 16, 150 | 17, 151 | 18, 152 | 19, 153 | 20, 154 | 21, 155 | 22, 156 | ], 157 | ) 158 | def test_dask_results( 159 | query, scale, local, dataset_path, answers_path, s3_storage_options, client 160 | ): 161 | from . import dask_queries 162 | 163 | func = getattr(dask_queries, f"query_{query:02d}") 164 | result = func(dataset_path, None, scale).compute() 165 | verify_result(result, query, answers_path, s3_storage_options) 166 | -------------------------------------------------------------------------------- /tests/benchmarks/test_rechunk.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask 4 | import dask.array as da 5 | import pytest 6 | 7 | from ..conftest import requires_p2p_memory, requires_p2p_rechunk 8 | from ..utils_test import cluster_memory, scaled_array_shape, wait 9 | 10 | 11 | @pytest.fixture(params=["8.5 MiB", "auto"]) 12 | def input_chunk_size(request): 13 | return request.param 14 | 15 | 16 | @pytest.fixture( 17 | params=[ 18 | pytest.param("tasks", marks=pytest.mark.shuffle_tasks), 19 | pytest.param("p2p-disk", marks=[pytest.mark.shuffle_p2p, requires_p2p_rechunk]), 20 | pytest.param( 21 | "p2p-memory", marks=[pytest.mark.shuffle_p2p, requires_p2p_memory] 22 | ), 23 | ] 24 | ) 25 | def configure_rechunking_in_memory(request): 26 | if request.param == "tasks": 27 | with dask.config.set({"array.rechunk.method": "tasks"}): 28 | yield 29 | else: 30 | disk = "disk" in request.param 31 | with dask.config.set( 32 | { 33 | "array.rechunk.method": "p2p", 34 | "distributed.p2p.disk": disk, 35 | } 36 | ): 37 | yield 38 | 39 | 40 | @pytest.fixture( 41 | params=[ 42 | pytest.param("tasks", marks=pytest.mark.shuffle_tasks), 43 | pytest.param("p2p", marks=[pytest.mark.shuffle_p2p, requires_p2p_rechunk]), 44 | ] 45 | ) 46 | def configure_rechunking_out_of_core(request): 47 | if request.param == "tasks": 48 | with dask.config.set({"array.rechunk.method": "tasks"}): 49 | yield 50 | else: 51 | with dask.config.set( 52 | { 53 | "array.rechunk.method": "p2p", 54 | "distributed.p2p.disk": True, 55 | } 56 | ): 57 | yield 58 | 59 | 60 | def test_tiles_to_rows( 61 | # Order matters: don't initialize client when skipping test 62 | input_chunk_size, 63 | configure_rechunking_in_memory, 64 | small_client, 65 | ): 66 | """2D array sliced into square tiles becomes sliced by columns. 67 | This use case can be broken down into N independent problems. 68 | In task rechunk, this generates O(N) intermediate tasks and graph edges.
69 | """ 70 | memory = cluster_memory(small_client) 71 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 72 | 73 | a = da.random.random(shape, chunks=input_chunk_size) 74 | a = a.rechunk((-1, "auto")).sum() 75 | wait(a, small_client, timeout=600) 76 | 77 | 78 | def test_swap_axes_in_memory( 79 | # Order matters: don't initialize client when skipping test 80 | input_chunk_size, 81 | configure_rechunking_in_memory, 82 | small_client, 83 | ): 84 | """2D array sliced by columns becomes sliced by rows. 85 | This is an N-to-N problem, so grouping into sub-problems is impossible. 86 | In task rechunk, this generates O(N^2) intermediate tasks and graph edges. 87 | """ 88 | memory = cluster_memory(small_client) 89 | shape = scaled_array_shape(memory * 0.5, ("x", "x")) 90 | 91 | a = da.random.random(shape, chunks=(-1, input_chunk_size)) 92 | a = a.rechunk(("auto", -1)).sum() 93 | wait(a, small_client, timeout=600) 94 | 95 | 96 | def test_swap_axes_out_of_core( 97 | # Order matters: don't initialize client when skipping test 98 | configure_rechunking_out_of_core, 99 | small_client, 100 | ): 101 | """2D array sliced by columns becomes sliced by rows. 102 | This is an N-to-N problem, so grouping into sub-problems is impossible. 103 | In task rechunk, this generates O(N^2) intermediate tasks and graph edges. 104 | """ 105 | memory = cluster_memory(small_client) 106 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 107 | 108 | a = da.random.random(shape, chunks=(-1, "auto")) 109 | a = a.rechunk(("auto", -1)).sum() 110 | wait(a, small_client, timeout=600) 111 | 112 | 113 | def test_adjacent_groups( 114 | # Order matters: don't initialize client when skipping test 115 | input_chunk_size, 116 | configure_rechunking_in_memory, 117 | small_client, 118 | ): 119 | """M-to-N use case, where each input task feeds into a localized but substantial 120 | subset of the output tasks, with partial interaction between adjacent zones. 121 | """ 122 | memory = cluster_memory(small_client) 123 | shape = scaled_array_shape(memory * 1.5, ("x", 10, 10_000)) 124 | 125 | a = da.random.random(shape, chunks=(input_chunk_size, 2, 5_000)) 126 | a = a.rechunk(("auto", 5, 10_000)).sum() 127 | wait(a, small_client, timeout=600) 128 | 129 | 130 | def test_heal_oversplit( 131 | # Order matters: don't initialize client when skipping test 132 | configure_rechunking_in_memory, 133 | small_client, 134 | ): 135 | """rechunk() is used to heal a situation where chunks are too small. 136 | This is a trivial N-to-1 reduction step that gets no benefit from p2p rechunking. 137 | """ 138 | memory = cluster_memory(small_client) 139 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 140 | # Avoid exact n:1 rechunking, which would be a simpler special case. 141 | # Dask should be smart enough to avoid splitting input chunks out to multiple output 142 | # chunks. 
143 | a = da.random.random(shape, chunks="8.5 MiB") 144 | a = a.rechunk("128 MiB").sum() 145 | wait(a, small_client, timeout=600) 146 | -------------------------------------------------------------------------------- /detect_regressions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from distutils.util import strtobool 4 | 5 | import pandas as pd 6 | import sqlalchemy 7 | 8 | 9 | def detect_regressions(database_file, is_pr=False): 10 | engine = sqlalchemy.create_engine(f"sqlite:///{database_file}") 11 | 12 | # regression analysis only on tests that passed 13 | df = pd.read_sql( 14 | "select * from test_run where platform = 'linux' and call_outcome = 'passed'", 15 | engine, 16 | ) 17 | 18 | # join runtime + py version 19 | df = df.assign( 20 | runtime=( 21 | "coiled-" 22 | + df.coiled_runtime_version 23 | + "-py" 24 | + df.python_version.str.split(".", n=2).str[:2].str.join(".") 25 | ), 26 | category=df.path.str.split("/", n=1).str[0], 27 | ) 28 | 29 | reg_df = pd.DataFrame( 30 | columns=[ 31 | "category", 32 | "type", 33 | "mean", 34 | "last", 35 | "last-1", 36 | "last-2", 37 | "threshold", 38 | "str_report", 39 | ] 40 | ) 41 | if is_pr: 42 | # Only include last run in detection regression 43 | n_last = 1 44 | n_std = 3 # be a bit more aggressive on PRs 45 | else: 46 | n_last = 3 47 | n_std = 3 48 | 49 | runtimes = list(df.runtime.unique()) 50 | for runtime in runtimes: 51 | by_test = df[(df.runtime == runtime)].groupby("name") 52 | 53 | test_names = list(by_test.groups.keys()) 54 | for name in test_names: 55 | df_test = by_test.get_group(name) 56 | 57 | # check the test is not obsolete. 58 | if pd.Timestamp(df_test.start.iloc[-1]) < ( 59 | pd.Timestamp.now() - pd.Timedelta(days=7) 60 | ): 61 | # the latest run was 7+ days ago, test is obsolete 62 | pass 63 | else: 64 | for metric in ["duration", "average_memory", "peak_memory"]: 65 | # check that we have enough data to do some stats (last three plus previous ten) 66 | if len(df_test.loc[df_test[metric].notna()]) > (10 + n_last): 67 | category = df_test.category.unique()[0] 68 | 69 | if metric in ["average_memory", "peak_memory"]: 70 | units_norm = 1 / (1024**3) # to GiB to match dashboard 71 | u = "[GiB]" 72 | else: 73 | units_norm = 1 74 | u = "[s]" 75 | 76 | metric_threshold = df_test[metric][ 77 | -(10 + n_last) : -n_last 78 | ].mean() + max( 79 | n_std * df_test[metric][-(10 + n_last) : -n_last].std(), 80 | 1 / units_norm, 81 | ) 82 | 83 | if (df_test[metric].iloc[-n_last:] >= metric_threshold).all(): 84 | last_three = ( 85 | df_test[metric].iloc[-1] * units_norm, 86 | df_test[metric].iloc[-2] * units_norm, 87 | df_test[metric].iloc[-3] * units_norm, 88 | ) 89 | reg = ( 90 | f"{runtime=}, {name=}, {category=}, " 91 | f"last_three_{metric} {u} = " 92 | f"{last_three}, " 93 | f"{metric}_threshold {u} = {metric_threshold * units_norm} \n" 94 | ) 95 | 96 | # ["category", "type", "mean", "last", "last-1", "last-2", "threshold"]) 97 | reg_df.loc[f"{(runtime, name, metric)} {u}"] = [ 98 | category, 99 | metric, 100 | df_test[metric][-(10 + n_last) : -n_last].mean() 101 | * units_norm, 102 | df_test[metric].iloc[-1] * units_norm, 103 | df_test[metric].iloc[-2] * units_norm, 104 | df_test[metric].iloc[-3] * units_norm, 105 | metric_threshold * units_norm, 106 | reg, 107 | ] 108 | 109 | return reg_df 110 | 111 | 112 | def regressions_report(reg_df): 113 | # write reg_df to markdown for GHA summary 114 | cols_for_report = [ 115 | "category", 116 | "type", 117 | "mean", 
118 | "last", 119 | "last-1", 120 | "last-2", 121 | "threshold", 122 | ] 123 | reg_df[cols_for_report].to_markdown("regressions_summary.md") 124 | 125 | if not reg_df.empty: 126 | # Raise exception to cause CI job to fail if we detected regressions 127 | raise Exception( 128 | f"\x1b[31m {len(reg_df)} regressions detected: \n{''.join(reg_df.str_report.values)} \x1b[0m" 129 | ) 130 | else: 131 | return 132 | 133 | 134 | if __name__ == "__main__": 135 | DB_FILE = pathlib.Path("./benchmark.db") 136 | 137 | IS_PR = strtobool(os.environ.get("IS_PR", "false")) 138 | regressions_df = detect_regressions(DB_FILE, is_pr=IS_PR) 139 | 140 | regressions_report(regressions_df) 141 | -------------------------------------------------------------------------------- /tests/tpch/test_duckdb.py: -------------------------------------------------------------------------------- 1 | import botocore.session 2 | import pytest 3 | 4 | pytestmark = pytest.mark.tpch_nondask 5 | 6 | duckdb = pytest.importorskip("duckdb") 7 | 8 | from . import duckdb_queries # noqa: E402 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def add_duckdb_version(tpch_database_table_schema): 13 | tpch_database_table_schema.duckdb_version = duckdb.__version__ 14 | 15 | 16 | @pytest.fixture(autouse=True) 17 | def add_cluster_spec_to_db(tpch_database_table_schema, machine_spec, local): 18 | if not local: 19 | tpch_database_table_schema.n_workers = 1 20 | tpch_database_table_schema.worker_vm_type = machine_spec["vm_type"] 21 | tpch_database_table_schema.cluster_disk_size = machine_spec.get( 22 | "worker_disk_size" 23 | ) 24 | 25 | 26 | @pytest.fixture 27 | def connection(local, restart): 28 | def _(): 29 | con = duckdb.connect() 30 | 31 | if not local: # Setup s3 credentials 32 | session = botocore.session.Session() 33 | creds = session.get_credentials() 34 | con.install_extension("httpfs") 35 | con.load_extension("httpfs") 36 | con.sql( 37 | f""" 38 | SET s3_region='us-east-2'; 39 | SET s3_access_key_id='{creds.access_key}'; 40 | SET s3_secret_access_key='{creds.secret_key}'; 41 | SET s3_session_token='{creds.token}'; 42 | """ 43 | ) 44 | return con 45 | 46 | return _ 47 | 48 | 49 | def test_query_01(run, connection, dataset_path, scale): 50 | def _(): 51 | duckdb_queries.query_01(connection(), dataset_path, scale) 52 | 53 | run(_) 54 | 55 | 56 | def test_query_02(run, connection, dataset_path, scale): 57 | def _(): 58 | duckdb_queries.query_02(connection(), dataset_path, scale) 59 | 60 | run(_) 61 | 62 | 63 | def test_query_03(run, connection, dataset_path, scale): 64 | def _(): 65 | duckdb_queries.query_03(connection(), dataset_path, scale) 66 | 67 | run(_) 68 | 69 | 70 | def test_query_04(run, connection, dataset_path, scale): 71 | def _(): 72 | duckdb_queries.query_04(connection(), dataset_path, scale) 73 | 74 | run(_) 75 | 76 | 77 | def test_query_05(run, connection, dataset_path, scale): 78 | def _(): 79 | duckdb_queries.query_05(connection(), dataset_path, scale) 80 | 81 | run(_) 82 | 83 | 84 | def test_query_06(run, connection, dataset_path, scale): 85 | def _(): 86 | duckdb_queries.query_06(connection(), dataset_path, scale) 87 | 88 | run(_) 89 | 90 | 91 | def test_query_07(run, connection, dataset_path, scale): 92 | def _(): 93 | duckdb_queries.query_07(connection(), dataset_path, scale) 94 | 95 | run(_) 96 | 97 | 98 | def test_query_08(run, connection, dataset_path, scale): 99 | def _(): 100 | duckdb_queries.query_08(connection(), dataset_path, scale) 101 | 102 | run(_) 103 | 104 | 105 | def test_query_09(run, connection, 
dataset_path, scale): 106 | def _(): 107 | duckdb_queries.query_09(connection(), dataset_path, scale) 108 | 109 | run(_) 110 | 111 | 112 | def test_query_10(run, connection, dataset_path, scale): 113 | def _(): 114 | duckdb_queries.query_10(connection(), dataset_path, scale) 115 | 116 | run(_) 117 | 118 | 119 | def test_query_11(run, connection, dataset_path, scale): 120 | def _(): 121 | duckdb_queries.query_11(connection(), dataset_path, scale) 122 | 123 | run(_) 124 | 125 | 126 | def test_query_12(run, connection, dataset_path, scale): 127 | def _(): 128 | duckdb_queries.query_12(connection(), dataset_path, scale) 129 | 130 | run(_) 131 | 132 | 133 | def test_query_13(run, connection, dataset_path, scale): 134 | def _(): 135 | duckdb_queries.query_13(connection(), dataset_path, scale) 136 | 137 | run(_) 138 | 139 | 140 | def test_query_14(run, connection, dataset_path, scale): 141 | def _(): 142 | duckdb_queries.query_14(connection(), dataset_path, scale) 143 | 144 | run(_) 145 | 146 | 147 | def test_query_15(run, connection, dataset_path, scale): 148 | def _(): 149 | duckdb_queries.query_15(connection(), dataset_path, scale) 150 | 151 | run(_) 152 | 153 | 154 | def test_query_16(run, connection, dataset_path, scale): 155 | def _(): 156 | duckdb_queries.query_16(connection(), dataset_path, scale) 157 | 158 | run(_) 159 | 160 | 161 | def test_query_17(run, connection, dataset_path, scale): 162 | def _(): 163 | duckdb_queries.query_17(connection(), dataset_path, scale) 164 | 165 | run(_) 166 | 167 | 168 | def test_query_18(run, connection, dataset_path, scale): 169 | def _(): 170 | duckdb_queries.query_18(connection(), dataset_path, scale) 171 | 172 | run(_) 173 | 174 | 175 | def test_query_19(run, connection, dataset_path, scale): 176 | def _(): 177 | duckdb_queries.query_19(connection(), dataset_path, scale) 178 | 179 | run(_) 180 | 181 | 182 | def test_query_20(run, connection, dataset_path, scale): 183 | def _(): 184 | duckdb_queries.query_20(connection(), dataset_path, scale) 185 | 186 | run(_) 187 | 188 | 189 | def test_query_21(run, connection, dataset_path, scale): 190 | def _(): 191 | duckdb_queries.query_21(connection(), dataset_path, scale) 192 | 193 | run(_) 194 | 195 | 196 | def test_query_22(run, connection, dataset_path, scale): 197 | def _(): 198 | duckdb_queries.query_22(connection(), dataset_path, scale) 199 | 200 | run(_) 201 | -------------------------------------------------------------------------------- /tests/benchmarks/test_parquet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parquet-related benchmarks. 
3 | """ 4 | import io 5 | import uuid 6 | 7 | import boto3 8 | import dask.dataframe as dd 9 | import dask.datasets 10 | import distributed 11 | import fsspec 12 | import pandas 13 | import pytest 14 | from coiled import Cluster 15 | from packaging.version import Version 16 | 17 | from ..conftest import dump_cluster_kwargs 18 | from ..utils_test import run_up_to_nthreads, wait 19 | 20 | try: 21 | import pyarrow 22 | 23 | HAS_PYARROW12 = Version(pyarrow.__version__) >= Version("12.0.0") 24 | except ImportError: 25 | HAS_PYARROW12 = False 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def parquet_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 30 | kwargs = dict( 31 | name=f"parquet-{uuid.uuid4().hex[:8]}", 32 | environ=dask_env_variables, 33 | tags=github_cluster_tags, 34 | **cluster_kwargs["parquet_cluster"], 35 | ) 36 | dump_cluster_kwargs(kwargs, "parquet") 37 | 38 | with Cluster(**kwargs) as cluster: 39 | yield cluster 40 | 41 | 42 | @pytest.fixture 43 | def parquet_client(parquet_cluster, cluster_kwargs, benchmark_all, wait_for_workers): 44 | n_workers = cluster_kwargs["parquet_cluster"]["n_workers"] 45 | with distributed.Client(parquet_cluster) as client: 46 | parquet_cluster.scale(n_workers) 47 | wait_for_workers(client, n_workers, timeout=600) 48 | client.restart() 49 | with benchmark_all(client): 50 | yield client 51 | 52 | 53 | @pytest.mark.skipif( 54 | HAS_PYARROW12, 55 | reason="50x slower than PyArrow 11; https://github.com/coiled/benchmarks/issues/998", 56 | ) 57 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 58 | def test_read_spark_generated_data(parquet_client): 59 | """ 60 | Read a ~15 GB subset of a ~800 GB spark-generated 61 | open dataset on AWS. 62 | 63 | The dataset was copied from AWS open data on 2022-05-25 64 | https://registry.opendata.aws/1000-genomes-data-lakehouse-ready/ 65 | Citation: https://www.nature.com/articles/s41467-018-08148-z 66 | """ 67 | ddf = dd.read_parquet( 68 | "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet", 69 | engine="pyarrow", 70 | index="sample_id", 71 | ) 72 | coll = ddf.groupby(ddf.index).first() 73 | wait(coll, parquet_client, 500) 74 | 75 | 76 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 77 | def test_read_hive_partitioned_data(parquet_client): 78 | """ 79 | Read a dataset partitioned by year and quarter. 
80 | 81 | The dataset was copied from AWS open data on 2022-05-25 82 | https://registry.opendata.aws/speedtest-global-performance/ 83 | """ 84 | ddf = dd.read_parquet( 85 | "s3://coiled-runtime-ci/ookla-open-data/type=fixed/*/*/*.parquet", 86 | engine="pyarrow", 87 | ) 88 | coll = ddf.groupby(["year", "quarter"]).first() 89 | wait(coll, parquet_client, 100) 90 | 91 | 92 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 93 | def test_write_wide_data(parquet_client, s3_url): 94 | # Write a ~700 partition, ~200 GB dataset with a lot of columns 95 | ddf = dask.datasets.timeseries( 96 | dtypes={ 97 | **{f"name-{i}": str for i in range(25)}, 98 | **{f"price-{i}": float for i in range(25)}, 99 | **{f"id-{i}": int for i in range(25)}, 100 | **{f"cat-{i}": "category" for i in range(25)}, 101 | }, 102 | start="2021-01-01", 103 | end="2021-02-01", 104 | freq="10ms", 105 | partition_freq="1H", 106 | ) 107 | ddf.to_parquet(s3_url + "/wide-data/") 108 | 109 | 110 | @run_up_to_nthreads("parquet_cluster", 60, reason="fixed dataset") 111 | @pytest.mark.parametrize("kind", ["boto3", "s3fs", "pandas", "pandas+boto3", "dask"]) 112 | def test_download_throughput(parquet_client, kind): 113 | """Test throughput for downloading and parsing a single 563 MB parquet file. 114 | 115 | Note 116 | ---- 117 | I/O performance on S3 is heavily dependent on how many times the same file has been 118 | requested over the last few seconds. In A/B tests, this could lead to a false 119 | impression that test cases later in this list are faster than the earlier ones. 120 | Read more: https://github.com/coiled/benchmarks/issues/821 121 | """ 122 | path = ( 123 | "s3://coiled-runtime-ci/ookla-open-data/" 124 | "type=fixed/year=2022/quarter=1/2022-01-01_performance_fixed_tiles.parquet" 125 | ) 126 | 127 | def boto3_load(path): 128 | s3 = boto3.client("s3") 129 | _, _, bucket_name, key = path.split("/", maxsplit=3) 130 | response = s3.get_object(Bucket=bucket_name, Key=key) 131 | return response["Body"].read() 132 | 133 | if kind == "boto3": 134 | fut = parquet_client.submit(boto3_load, path) 135 | 136 | elif kind == "s3fs": 137 | 138 | def load(path): 139 | with fsspec.open(path) as f: 140 | return f.read() 141 | 142 | fut = parquet_client.submit(load, path) 143 | 144 | elif kind == "pandas": 145 | fut = parquet_client.submit(pandas.read_parquet, path, engine="pyarrow") 146 | 147 | elif kind == "pandas+boto3": 148 | 149 | def load(path): 150 | raw = boto3_load(path) 151 | buf = io.BytesIO(raw) 152 | return pandas.read_parquet(buf, engine="pyarrow") 153 | 154 | fut = parquet_client.submit(load, path) 155 | 156 | elif kind == "dask": 157 | fut = dd.read_parquet(path, engine="pyarrow") 158 | 159 | wait(fut, parquet_client, timeout=60) 160 | -------------------------------------------------------------------------------- /tests/benchmarks/test_h2o.py: -------------------------------------------------------------------------------- 1 | """ 2 | h2o-ai benchmark groupby part running on coiled. 3 | 4 | Note: Only holistic aggregations (median and groupby-apply) use a shuffle with the 5 | default split_out=1. 
6 | """ 7 | import os 8 | 9 | import dask.dataframe as dd 10 | import pandas as pd 11 | import pytest 12 | 13 | from ..utils_test import run_up_to_nthreads 14 | 15 | DATASETS = { 16 | "0.5 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2/*.csv", 17 | "5 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2/*.csv", 18 | "50 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2/*.csv", 19 | "0.5 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet", 20 | "5 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet", 21 | "50 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet", 22 | "5 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e8_K_1e2/*.parquet", 23 | "50 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet", 24 | "500 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e10_K_1e2/*.parquet", 25 | } 26 | 27 | enabled_datasets = os.getenv("H2O_DATASETS") 28 | if enabled_datasets is not None: 29 | enabled_datasets = {k.strip() for k in enabled_datasets.split(",")} 30 | if unknown_datasets := enabled_datasets - DATASETS.keys(): 31 | raise ValueError("Unknown h2o dataset(s): ", unknown_datasets) 32 | else: 33 | enabled_datasets = { 34 | "5 GB (parquet)", 35 | } 36 | 37 | 38 | @pytest.fixture(autouse=True) 39 | def client(small_client): 40 | yield small_client 41 | 42 | 43 | @pytest.fixture(params=sorted(enabled_datasets), scope="module") 44 | def ddf(request): 45 | n_gib = float(request.param.split(" GB ")[0]) 46 | # 0.5 GB datasets are broken in 5~10 files 47 | # 5 GB -> 100 files 48 | # 50 GB -> 1000 files 49 | # 500 GB -> 10,000 files 50 | max_threads = max(20, int(n_gib * 20)) 51 | run_up_to_nthreads( 52 | "small_cluster", max_threads, reason="fixed data size", as_decorator=False 53 | ) 54 | 55 | uri = DATASETS[request.param] 56 | 57 | if uri.endswith("csv"): 58 | yield dd.read_csv( 59 | uri, 60 | dtype={ 61 | "id1": "category", 62 | "id2": "category", 63 | "id3": "category", 64 | "id4": "Int32", 65 | "id5": "Int32", 66 | "id6": "Int32", 67 | "v1": "Int32", 68 | "v2": "Int32", 69 | "v3": "float64", 70 | }, 71 | storage_options={"anon": True}, 72 | ) 73 | else: 74 | yield dd.read_parquet(uri, engine="pyarrow", storage_options={"anon": True}) 75 | 76 | 77 | def test_q1(ddf): 78 | ddf = ddf[["id1", "v1"]] 79 | ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute() 80 | 81 | 82 | def test_q2(ddf): 83 | ddf = ddf[["id1", "id2", "v1"]] 84 | ( 85 | ddf.groupby(["id1", "id2"], dropna=False, observed=True) 86 | .agg({"v1": "sum"}) 87 | .compute() 88 | ) 89 | 90 | 91 | def test_q3(ddf): 92 | ddf = ddf[["id3", "v1", "v3"]] 93 | ( 94 | ddf.groupby("id3", dropna=False, observed=True) 95 | .agg({"v1": "sum", "v3": "mean"}) 96 | .compute() 97 | ) 98 | 99 | 100 | def test_q4(ddf): 101 | ddf = ddf[["id4", "v1", "v2", "v3"]] 102 | ( 103 | ddf.groupby("id4", dropna=False, observed=True) 104 | .agg({"v1": "mean", "v2": "mean", "v3": "mean"}) 105 | .compute() 106 | ) 107 | 108 | 109 | def test_q5(ddf): 110 | ddf = ddf[["id6", "v1", "v2", "v3"]] 111 | ( 112 | ddf.groupby("id6", dropna=False, observed=True) 113 | .agg( 114 | {"v1": "sum", "v2": "sum", "v3": "sum"}, 115 | ) 116 | .compute() 117 | ) 118 | 119 | 120 | def test_q6(ddf, shuffle_method): 121 | # Median aggregation uses an explicitly-set shuffle 122 | ddf = ddf[["id4", "id5", "v3"]] 123 | ( 124 | ddf.groupby(["id4", "id5"], 
dropna=False, observed=True) 125 | .agg({"v3": ["median", "std"]}, shuffle=shuffle_method) 126 | .compute() # requires shuffle arg to be set explicitly 127 | ) 128 | 129 | 130 | def test_q7(ddf): 131 | ddf = ddf[["id3", "v1", "v2"]] 132 | ( 133 | ddf.groupby("id3", dropna=False, observed=True) 134 | .agg({"v1": "max", "v2": "min"}) 135 | .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]] 136 | .compute() 137 | ) 138 | 139 | 140 | def test_q8(ddf, configure_shuffling): 141 | # .groupby(...).apply(...) uses a shuffle to transfer data before applying the function 142 | ddf = ddf[["id6", "v1", "v2", "v3"]] 143 | ( 144 | ddf[~ddf["v3"].isna()][["id6", "v3"]] 145 | .groupby("id6", dropna=False, observed=True) 146 | .apply( 147 | lambda x: x.nlargest(2, columns="v3"), 148 | meta={"id6": "Int64", "v3": "float64"}, 149 | )[["v3"]] 150 | .compute() 151 | ) 152 | 153 | 154 | def test_q9(ddf, configure_shuffling): 155 | # .groupby(...).apply(...) uses a shuffle to transfer data before applying the function 156 | ddf = ddf[["id2", "id4", "v1", "v2"]] 157 | ( 158 | ddf[["id2", "id4", "v1", "v2"]] 159 | .groupby(["id2", "id4"], dropna=False, observed=True) 160 | .apply( 161 | lambda x: pd.Series({"r2": x.corr(numeric_only=True)["v1"]["v2"] ** 2}), 162 | meta={"r2": "float64"}, 163 | ) 164 | .compute() 165 | ) 166 | -------------------------------------------------------------------------------- /tests/benchmarks/test_xarray.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import dask.array as da 4 | import fsspec 5 | import numpy as np 6 | import pytest 7 | from coiled import Cluster 8 | from dask.utils import parse_bytes 9 | from distributed import Client 10 | 11 | from tests.conftest import dump_cluster_kwargs 12 | 13 | from ..utils_test import ( 14 | cluster_memory, 15 | print_size_info, 16 | run_up_to_nthreads, 17 | scaled_array_shape, 18 | wait, 19 | ) 20 | 21 | xr = pytest.importorskip("xarray") 22 | pytest.importorskip("flox") 23 | 24 | 25 | @pytest.fixture(scope="module") 26 | def group_reduction_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 27 | kwargs = dict( 28 | name=f"xarray-group-reduction-{uuid.uuid4().hex[:8]}", 29 | environ=dask_env_variables, 30 | tags=github_cluster_tags, 31 | **cluster_kwargs["group_reduction_cluster"], 32 | ) 33 | dump_cluster_kwargs(kwargs, "group_reduction_cluster") 34 | with Cluster(**kwargs) as cluster: 35 | yield cluster 36 | 37 | 38 | @pytest.fixture 39 | def group_reduction_client( 40 | group_reduction_cluster, cluster_kwargs, benchmark_all, wait_for_workers 41 | ): 42 | n_workers = cluster_kwargs["group_reduction_cluster"]["n_workers"] 43 | with Client(group_reduction_cluster) as client: 44 | group_reduction_cluster.scale(n_workers) 45 | wait_for_workers(client, n_workers, timeout=600) 46 | client.restart() 47 | with benchmark_all(client): 48 | yield client 49 | 50 | 51 | @pytest.mark.parametrize( 52 | "func", 53 | [ 54 | pytest.param( 55 | lambda x: x.groupby("time.month").mean(method="cohorts"), id="cohorts" 56 | ), 57 | pytest.param( 58 | lambda x: x.groupby("time.month").mean(method="map-reduce"), id="map-reduce" 59 | ), 60 | pytest.param( 61 | lambda x: x.chunk(time=xr.groupers.TimeResampler("ME")) 62 | .groupby("time.month") 63 | .mean(method="cohorts"), 64 | id="chunked-cohorts", 65 | ), 66 | ], 67 | ) 68 | def test_xarray_groupby_reduction(group_reduction_client, func): 69 | ds = xr.open_zarr( 70 | fsspec.get_mapper( 71 | 
"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr", anon=True 72 | ), 73 | consolidated=True, 74 | ) 75 | # slice dataset properly to keep runtime in check 76 | subset = ds.zwattablrt.sel(time=slice("2001", "2002")) 77 | subset = subset.isel(x=slice(0, 350 * 8), y=slice(0, 350 * 8)) 78 | result = func(subset) 79 | wait(result, group_reduction_client, 10 * 60) 80 | 81 | 82 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 83 | @pytest.mark.parametrize("backend", ["dataframe", "array"]) 84 | def test_quadratic_mean(small_client, backend): 85 | # https://github.com/pangeo-data/distributed-array-examples/issues/2 86 | # See https://github.com/dask/dask/issues/10384 87 | size = 5000 88 | ds = xr.Dataset( 89 | dict( 90 | anom_u=( 91 | ["time", "face", "j", "i"], 92 | da.random.random((size, 1, 987, 1920), chunks=(10, 1, -1, -1)), 93 | ), 94 | anom_v=( 95 | ["time", "face", "j", "i"], 96 | da.random.random((size, 1, 987, 1920), chunks=(10, 1, -1, -1)), 97 | ), 98 | ) 99 | ) 100 | 101 | quad = ds**2 102 | quad["uv"] = ds.anom_u * ds.anom_v 103 | mean = quad.mean("time") 104 | if backend == "dataframe": 105 | mean = mean.to_dask_dataframe() 106 | 107 | wait(mean, small_client, 10 * 60) 108 | 109 | 110 | def test_anom_mean(small_client, new_array): 111 | """From https://github.com/dask/distributed/issues/2602#issuecomment-498718651""" 112 | 113 | memory = cluster_memory(small_client) # 76.66 GiB 114 | target_nbytes = memory // 2 115 | data = new_array( 116 | scaled_array_shape(target_nbytes, ("x", "10MiB")), 117 | chunks=(1, parse_bytes("10MiB") // 8), 118 | ) 119 | print_size_info(memory, target_nbytes, data) 120 | # 38.32 GiB - 3925 10.00 MiB chunks 121 | 122 | ngroups = data.shape[0] // 100 123 | arr = xr.DataArray( 124 | data, 125 | dims=["time", "x"], 126 | coords={"day": ("time", np.arange(data.shape[0]) % ngroups)}, 127 | ) 128 | with xr.set_options(use_flox=False): 129 | clim = arr.groupby("day").mean(dim="time") 130 | anom = arr.groupby("day") - clim 131 | anom_mean = anom.mean(dim="time") 132 | 133 | wait(anom_mean, small_client, 10 * 60) 134 | 135 | 136 | @pytest.mark.skip( 137 | "fails in actual CI; see https://github.com/coiled/benchmarks/issues/253" 138 | ) 139 | def test_climatic_mean(small_client, new_array): 140 | """From https://github.com/dask/distributed/issues/2602#issuecomment-535009454""" 141 | 142 | memory = cluster_memory(small_client) # 76.66 GiB 143 | target_nbytes = memory * 2 144 | chunks = (1, 1, 96, 21, 90, 144) 145 | shape = (28, "x", 96, 21, 90, 144) 146 | data = new_array(scaled_array_shape(target_nbytes, shape), chunks=chunks) 147 | print_size_info(memory, target_nbytes, data) 148 | # 152.62 GiB - 784 199.34 MiB chunks 149 | 150 | array = xr.DataArray( 151 | data, 152 | dims=["ensemble", "init_date", "lat", "lead_time", "level", "lon"], 153 | # coords={"init_date": pd.date_range(start="1960", periods=arr.shape[1])}, 154 | coords={"init_date": np.arange(data.shape[1]) % 10}, 155 | ) 156 | # arr_clim = array.groupby("init_date.month").mean(dim="init_date") 157 | with xr.set_options(use_flox=False): 158 | arr_clim = array.groupby("init_date").mean(dim="init_date") 159 | 160 | wait(arr_clim, small_client, 15 * 60) 161 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/climatology.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import numpy as np 4 | import xarray as xr 5 | from dask.delayed import Delayed 6 | 7 
| 8 | def compute_hourly_climatology( 9 | ds: xr.Dataset, 10 | ) -> xr.Dataset: 11 | hours = xr.DataArray(range(0, 24, 6), dims=["hour"]) 12 | window_weights = create_window_weights(61) 13 | return xr.concat( 14 | [compute_rolling_mean(select_hour(ds, hour), window_weights) for hour in hours], 15 | dim=hours, 16 | ) 17 | 18 | 19 | def compute_rolling_mean(ds: xr.Dataset, window_weights: xr.DataArray) -> xr.Dataset: 20 | window_size = len(window_weights) 21 | half_window_size = window_size // 2 # For padding 22 | ds = xr.concat( 23 | [ 24 | replace_time_with_doy(ds.sel(time=str(y))) 25 | for y in np.unique(ds.time.dt.year) 26 | ], 27 | dim="year", 28 | ) 29 | ds = ds.fillna(ds.sel(dayofyear=365)) 30 | ds = ds.pad(pad_width={"dayofyear": half_window_size}, mode="wrap") 31 | ds = ds.rolling(dayofyear=window_size, center=True).construct("window") 32 | ds = ds.weighted(window_weights).mean(dim=("window", "year")) 33 | return ds.isel(dayofyear=slice(half_window_size, -half_window_size)) 34 | 35 | 36 | def create_window_weights(window_size: int) -> xr.DataArray: 37 | """Create linearly decaying window weights.""" 38 | assert window_size % 2 == 1, "Window size must be odd." 39 | half_window_size = window_size // 2 40 | window_weights = np.concatenate( 41 | [ 42 | np.linspace(0, 1, half_window_size + 1), 43 | np.linspace(1, 0, half_window_size + 1)[1:], 44 | ] 45 | ) 46 | window_weights = window_weights / window_weights.mean() 47 | window_weights = xr.DataArray(window_weights, dims=["window"]) 48 | return window_weights 49 | 50 | 51 | def replace_time_with_doy(ds: xr.Dataset) -> xr.Dataset: 52 | """Replace time coordinate with days of year.""" 53 | return ds.assign_coords({"time": ds.time.dt.dayofyear}).rename( 54 | {"time": "dayofyear"} 55 | ) 56 | 57 | 58 | def select_hour(ds: xr.Dataset, hour: int) -> xr.Dataset: 59 | """Select given hour of day from dataset.""" 60 | # Select hour 61 | ds = ds.isel(time=ds.time.dt.hour == hour) 62 | # Adjust time dimension 63 | ds = ds.assign_coords({"time": ds.time.astype("datetime64[D]")}) 64 | return ds 65 | 66 | 67 | def rechunk_map_blocks( 68 | scale: Literal["small", "medium", "large"], 69 | storage_url: str, 70 | storage_options: dict[str, Any], 71 | ) -> Delayed: 72 | # Load dataset 73 | ds = xr.open_zarr( 74 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721.zarr", 75 | ) 76 | 77 | if scale == "small": 78 | # 101.83 GiB (small) 79 | time_range = slice("2020-01-01", "2022-12-31") 80 | variables = ["sea_surface_temperature"] 81 | elif scale == "medium": 82 | # 2.12 TiB (medium) 83 | time_range = slice("1959-01-01", "2022-12-31") 84 | variables = ["sea_surface_temperature"] 85 | else: 86 | # 4.24 TiB (large) 87 | # This currently doesn't complete successfully. 
88 | time_range = slice("1959-01-01", "2022-12-31") 89 | variables = ["sea_surface_temperature", "snow_depth"] 90 | ds = ds[variables].sel(time=time_range) 91 | original_chunks = ds.chunks 92 | 93 | ds = ds.drop_vars([k for k, v in ds.items() if "time" not in v.dims]) 94 | pencil_chunks = {"time": -1, "longitude": "auto", "latitude": "auto"} 95 | 96 | working = ds.chunk(pencil_chunks) 97 | hours = xr.DataArray(range(0, 24, 6), dims=["hour"]) 98 | daysofyear = xr.DataArray(range(1, 367), dims=["dayofyear"]) 99 | template = ( 100 | working.isel(time=0) 101 | .drop_vars("time") 102 | .expand_dims(hour=hours, dayofyear=daysofyear) 103 | .assign_coords(hour=hours, dayofyear=daysofyear) 104 | ) 105 | working = working.map_blocks(compute_hourly_climatology, template=template) 106 | 107 | pancake_chunks = { 108 | "hour": 1, 109 | "dayofyear": 1, 110 | "latitude": original_chunks["latitude"], 111 | "longitude": original_chunks["longitude"], 112 | } 113 | result = working.chunk(pancake_chunks) 114 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 115 | 116 | 117 | def highlevel_api( 118 | scale: Literal["small", "medium", "large"], 119 | storage_url: str, 120 | storage_options: dict[str, Any], 121 | ) -> Delayed: 122 | # Load dataset 123 | ds = xr.open_zarr( 124 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721.zarr", 125 | ) 126 | 127 | if scale == "small": 128 | # 101.83 GiB (small) 129 | time_range = slice("2020-01-01", "2022-12-31") 130 | variables = ["sea_surface_temperature"] 131 | elif scale == "medium": 132 | # 2.12 TiB (medium) 133 | time_range = slice("1959-01-01", "2022-12-31") 134 | variables = ["sea_surface_temperature"] 135 | else: 136 | # 4.24 TiB (large) 137 | # This currently doesn't complete successfully. 138 | time_range = slice("1959-01-01", "2022-12-31") 139 | variables = ["sea_surface_temperature", "snow_depth"] 140 | ds = ds[variables].sel(time=time_range) 141 | original_chunks = ds.chunks 142 | 143 | # Drop all static variables 144 | ds = ds.drop_vars([k for k, v in ds.items() if "time" not in v.dims]) 145 | 146 | # Split time dimension into three dimensions 147 | ds["dayofyear"] = ds.time.dt.dayofyear 148 | ds["hour"] = ds.time.dt.hour 149 | ds["year"] = ds.time.dt.year 150 | ds = ds.set_index(time=["year", "dayofyear", "hour"]).unstack() 151 | 152 | # Fill empty values for non-leap years 153 | ds = ds.ffill(dim="dayofyear", limit=1) 154 | 155 | # Calculate climatology 156 | window_size = 61 157 | window_weights = create_window_weights(window_size) 158 | half_window_size = window_size // 2 159 | ds = ds.pad(pad_width={"dayofyear": half_window_size}, mode="wrap") 160 | ds = ds.rolling(dayofyear=window_size, center=True).construct("window") 161 | ds = ds.weighted(window_weights).mean(dim=("window", "year")) 162 | ds = ds.isel(dayofyear=slice(half_window_size, -half_window_size)) 163 | 164 | pancake_chunks = { 165 | "hour": 1, 166 | "dayofyear": 1, 167 | "latitude": original_chunks["latitude"], 168 | "longitude": original_chunks["longitude"], 169 | } 170 | result = ds.chunk(pancake_chunks) 171 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 172 | --------------------------------------------------------------------------------
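Both climatology workloads above lean on create_window_weights plus the rolling construct/weighted-mean pattern in compute_rolling_mean. A minimal standalone sketch of that pattern follows; the toy data and the window size of 5 are illustrative assumptions, not values used by the workloads, which use a 61-day window over real ERA5 data.

import numpy as np
import xarray as xr

# Same construction as create_window_weights(5) above: a linear ramp up to the
# center and back down, normalized so the weights' mean is 1.
half = 5 // 2
w = np.concatenate([np.linspace(0, 1, half + 1), np.linspace(1, 0, half + 1)[1:]])
weights = xr.DataArray(w / w.mean(), dims=["window"])  # [0.0, 1.25, 2.5, 1.25, 0.0]

# Apply the same rolling-window + weighted-mean pattern used in
# compute_rolling_mean, here along a tiny toy "dayofyear" axis.
data = xr.DataArray(np.arange(10.0), dims=["dayofyear"])
rolled = data.rolling(dayofyear=5, center=True).construct("window")
smoothed = rolled.weighted(weights).mean(dim="window")
print(smoothed.values)

Because the weights are symmetric around the center, interior points of a linear series are unchanged by the smoothing; the 61-day window in the workloads simply widens the same ramp.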