├── tests ├── __init__.py ├── tpch │ ├── __init__.py │ ├── test_optimization.py │ ├── README.md │ ├── plotting.py │ ├── generate_answers.py │ ├── utils.py │ ├── visualize.ipynb │ ├── test_dask.py │ ├── test_correctness.py │ └── test_duckdb.py ├── benchmarks │ ├── __init__.py │ ├── test_csv.py │ ├── test_custom.py │ ├── test_zarr.py │ ├── test_futures.py │ ├── test_spill.py │ ├── test_dataframe.py │ ├── test_join.py │ ├── test_work_stealing.py │ ├── test_rechunk.py │ ├── test_parquet.py │ ├── test_h2o.py │ └── test_xarray.py ├── runtime │ ├── __init__.py │ ├── test_cluster_creation.py │ ├── test_build.py │ └── test_xgboost.py ├── stability │ ├── __init__.py │ ├── test_install_plugins.py │ ├── test_array.py │ └── test_deadlock.py ├── workflows │ ├── __init__.py │ ├── test_uber_lyft.py │ ├── test_xgboost_optuna.py │ ├── test_embarrassingly_parallel.py │ ├── test_snowflake.py │ └── test_from_csv_to_parquet.py ├── geospatial │ ├── workloads │ │ ├── __init__.py │ │ ├── rechunking.py │ │ ├── zonal_average.py │ │ ├── regridding.py │ │ ├── atmospheric_circulation.py │ │ ├── cloud_optimize.py │ │ ├── satellite_filtering.py │ │ └── climatology.py │ ├── utils.py │ ├── test_cloud_optimize.py │ ├── test_rechunking.py │ ├── test_regridding.py │ ├── test_atmospheric_circulation.py │ ├── test_satellite_filtering.py │ ├── test_zonal_average.py │ └── test_climatology.py └── test_utils_test.py ├── alembic ├── README ├── versions │ ├── 2d2405ad763b_drop_tpc_h_data.py │ ├── 1095dfdfc4ae_add_column_for_memray_profiles_url.py │ ├── aa1fc9fdc665_add_column_for_py_spy_profiles_url.py │ ├── 25053f75e09f_add_dask_expr_version_tracking_migration.py │ ├── 912c8e30690a_remove_tests_in_test_shuffle_py.py │ ├── 924e9b1430e1_spark_test_bankruptcy.py │ ├── 149d2048065b_add_default_parameter_to_historical_.py │ ├── a97d9375430f_default_parameter_for_test_dataframe_py_.py │ ├── fa79471ffa8c_declare_bankruptcy_for_test_futures_py.py │ ├── 2764a4f5582b_declare_bankruptcy_for_cluster_startup_.py │ ├── 9813b7160e69_parametrize_test_large_map.py │ ├── 9d6f8ea24ee1_move_h2o_tests.py │ ├── f459b2c61eaf_remove_non_upstream_historical_data.py │ ├── 59c5cc87c066_drop_outdated_rechunking_data.py │ ├── 87cbf883c2be_update_tpch_refactor_from_1094.py │ ├── b0e8d5f3295d_update_test_tpch_tpch_test_dask_from_.py │ ├── 2381a77e8487_zarr.py │ ├── d58983739401_default_parameter_for_test_rechunk_in_.py │ ├── a9363331e323_clean_h2o_tests_with_removed_shuffle_.py │ ├── e11cd1aaed38_add_cluster_spec_to_db.py │ ├── a8785a7b3cae_add_entry_to_database_for_cluster_name_.py │ ├── 778e617a2886_merge_xarray_reduction_with_quadratic_.py │ ├── 24749594f367_add_prometheus_metrics.py │ ├── 1c2fe9d527e4_expand_rechunk_parameters.py │ ├── 4ee0e23d96da_compressible_variant_of_tests.py │ ├── 967e298408ed_test_spill.py │ ├── 78c6e00fee88_remove_task_based_shuffle.py │ ├── c38b9d85915e_default_parameter_for_shuffling_tests.py │ ├── 7d7844fca7cf_initial_table.py │ └── 00d5844fd364_add_tpch_run_table.py ├── script.py.mako └── env.py ├── ci ├── condarc ├── environment-geospatial.yml ├── environment-git-tip.yml ├── environment-snowflake.yml ├── environment-dashboard.yml ├── environment-tpch-nondask.yml ├── environment-test.yml ├── scripts │ ├── dask_config_to_env.py │ ├── combine-dbs.sh │ └── discover_ab_environments.py └── environment.yml ├── AB_environments ├── AB_baseline.cluster.yaml ├── AB_baseline.dask.yaml ├── AB_sample.dask.yaml ├── AB_sample.cluster.yaml ├── make_envs.py ├── AB_baseline.conda.yaml ├── config.yaml └── AB_sample.conda.yaml ├── .github └── 
workflows │ ├── lint.yml │ ├── geospatial.yml │ └── tpch.yml ├── .pre-commit-config.yaml ├── setup.cfg ├── LICENSE ├── plugins.py ├── .gitignore ├── cluster_kwargs.yaml ├── alembic.ini ├── benchmark_schema.py └── detect_regressions.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/tpch/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/runtime/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/stability/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/workflows/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /alembic/README: -------------------------------------------------------------------------------- 1 | Generic single-database configuration. -------------------------------------------------------------------------------- /ci/condarc: -------------------------------------------------------------------------------- 1 | auto_activate_base: false 2 | remote_backoff_factor: 20 3 | remote_connect_timeout_secs: 20.0 4 | remote_max_retries: 10 5 | remote_read_timeout_secs: 60.0 6 | -------------------------------------------------------------------------------- /ci/environment-geospatial.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml. 2 | # Add dependencies exclusively needed to run geospatial tests. 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | - memray ==1.13.4 7 | - pip: 8 | - git+https://github.com/pydata/xarray 9 | -------------------------------------------------------------------------------- /AB_environments/AB_baseline.cluster.yaml: -------------------------------------------------------------------------------- 1 | # Special environment file for A/B testing, used to define cluster creation options for 2 | # the baseline environment. 3 | # Change contents, but do not rename. 4 | 5 | # Overrides ../cluster_kwargs.yaml. 6 | # Leave empty if you don't want to override anything. 7 | -------------------------------------------------------------------------------- /ci/environment-git-tip.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml, which upgrades dask to the git tip. 
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - pip
6 |   - pip:
7 |     - git+https://github.com/dask/dask
8 |     - git+https://github.com/dask/distributed
9 |     - git+https://github.com/dask/zict
10 | 
--------------------------------------------------------------------------------
/AB_environments/AB_baseline.dask.yaml:
--------------------------------------------------------------------------------
1 | # Special environment file for A/B testing, used to define dask config options
2 | # (overriding the built-in config) for the baseline environment.
3 | # Change contents, but do not rename.
4 | # Leave empty if you don't want to override anything.
5 | dask:
6 |   dataframe:
7 |     query-planning: True
8 | 
--------------------------------------------------------------------------------
/ci/environment-snowflake.yml:
--------------------------------------------------------------------------------
1 | # This is an addition to ci/environment.yml.
2 | # Add dask-snowflake and downgrade some pinned dependencies.
3 | channels:
4 |   - conda-forge
5 | dependencies:
6 |   - pip
7 |   - snowflake-connector-python ==3.12.2
8 |   - snowflake-sqlalchemy ==1.6.1
9 |   - pip:
10 |     - git+https://github.com/coiled/dask-snowflake
11 | 
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | name: Linting
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |   pull_request:
8 | 
9 | jobs:
10 |   pre-commit:
11 |     name: pre-commit hooks
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v4
15 |       - uses: actions/setup-python@v5
16 |       - uses: pre-commit/action@v3.0.1
17 | 
--------------------------------------------------------------------------------
/tests/geospatial/utils.py:
--------------------------------------------------------------------------------
1 | import xarray as xr
2 | 
3 | 
4 | def load_era5() -> xr.Dataset:
5 |     return xr.open_zarr(
6 |         "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
7 |         chunks={
8 |             "longitude": "auto",
9 |             "latitude": "auto",
10 |             "levels": "auto",
11 |             "time": "auto",
12 |         },
13 |     )
14 | 
--------------------------------------------------------------------------------
/AB_environments/AB_sample.dask.yaml:
--------------------------------------------------------------------------------
1 | # Sample dask config file for A/B testing.
2 | # Change contents/delete/rename as needed.
3 | 
4 | # Every A/B environment *must* present these three files:
5 | # - AB_<name>.conda.yaml
6 | # - AB_<name>.dask.yaml
7 | # - AB_<name>.cluster.yaml
8 | 
9 | # Leave empty if you don't want to override anything.
10 | 
11 | # distributed:
12 | #   scheduler:
13 | #     worker-saturation: 1.2
14 | dask:
15 |   dataframe:
16 |     query-planning: True
17 | 
--------------------------------------------------------------------------------
/AB_environments/AB_sample.cluster.yaml:
--------------------------------------------------------------------------------
1 | # Sample cluster creation options file for A/B testing.
2 | # Change contents/delete/rename as needed.
3 | 
4 | # Every A/B environment *must* present these three files:
5 | # - AB_<name>.conda.yaml
6 | # - AB_<name>.dask.yaml
7 | # - AB_<name>.cluster.yaml
8 | 
9 | # Overrides ../cluster_kwargs.yaml.
10 | # Leave empty if you don't want to override anything.
11 | 12 | # small_cluster: 13 | # n_workers: 5 14 | # worker_vm_types: [m6i.xlarge] # 4CPU, 16GiB 15 | -------------------------------------------------------------------------------- /ci/environment-dashboard.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | - python =3.10 5 | - sqlalchemy =2.0 6 | - altair =5.0 7 | - bokeh =3.2 8 | - panel =1.2 9 | - pandas =2.0 10 | - tabulate =0.9 11 | 12 | # These imports are only needed to parse the source code of the tests and embed it in 13 | # the dashboard 14 | - coiled 15 | - conda 16 | - dask 17 | - dask-ml 18 | - distributed 19 | - filelock 20 | - optuna 21 | - pytest 22 | - s3fs 23 | - xarray 24 | - xgboost 25 | - zarr 26 | -------------------------------------------------------------------------------- /alembic/versions/2d2405ad763b_drop_tpc_h_data.py: -------------------------------------------------------------------------------- 1 | """Drop TPC-H data 2 | 3 | Revision ID: 2d2405ad763b 4 | Revises: 59c5cc87c066 5 | Create Date: 2024-08-15 13:54:45.251458 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '2d2405ad763b' 14 | down_revision = '59c5cc87c066' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute("delete from tpch_run") 21 | 22 | 23 | def downgrade() -> None: 24 | pass 25 | -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade() -> None: 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade() -> None: 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /ci/environment-tpch-nondask.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to ci/environment.yml. 2 | # Add dependencies exclusively needed to run TPCH tests on dask competitors. 
3 | channels: 4 | - conda-forge 5 | dependencies: 6 | # PySpark 7 | # See https://spark.apache.org/docs/latest/api/python/getting_started/install.html#dependencies 8 | - pyspark ==3.4.1 # FIXME https://github.com/coiled/benchmarks/issues/1221 9 | - openjdk ~=11.0 # Do not upgrade 10 | - grpcio ==1.61.1 11 | - grpcio-status ==1.60.1 # FIXME https://github.com/coiled/benchmarks/issues/1221 12 | - protobuf ==4.25.2 13 | 14 | # Other TPCH tests 15 | - polars ==0.20.13 16 | -------------------------------------------------------------------------------- /alembic/versions/1095dfdfc4ae_add_column_for_memray_profiles_url.py: -------------------------------------------------------------------------------- 1 | """Add column for Memray profiles url 2 | 3 | Revision ID: 1095dfdfc4ae 4 | Revises: 2d2405ad763b 5 | Create Date: 2024-10-23 11:11:15.238042 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1095dfdfc4ae' 14 | down_revision = '2d2405ad763b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('memray_profiles_url', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column("test_run", "memray_profiles_url") 25 | -------------------------------------------------------------------------------- /alembic/versions/aa1fc9fdc665_add_column_for_py_spy_profiles_url.py: -------------------------------------------------------------------------------- 1 | """Add column for py-spy profiles url 2 | 3 | Revision ID: aa1fc9fdc665 4 | Revises: 1095dfdfc4ae 5 | Create Date: 2024-10-23 16:11:24.794416 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'aa1fc9fdc665' 14 | down_revision = '1095dfdfc4ae' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('py_spy_profiles_url', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column("test_run", "py_spy_profiles_url") 25 | -------------------------------------------------------------------------------- /ci/environment-test.yml: -------------------------------------------------------------------------------- 1 | # This is an addition to either ci/environment.yml or AB_environments/AB_*.conda.yaml, 2 | # adding dependencies specific to this test suite. 3 | channels: 4 | - conda-forge 5 | dependencies: 6 | # Testing dependencies 7 | - alembic 8 | - altair 9 | - conda 10 | - filelock 11 | - jinja2 12 | - packaging 13 | - pytest 14 | - pytest-timeout 15 | - pytest-xdist 16 | - python-dotenv 17 | - pyyaml 18 | # TPC-H correctness test and DuckDB implementation 19 | # Can add duckdb back to conda install after: 20 | # https://github.com/coiled/benchmarks/issues/1418 21 | # python-duckdb ==0.10.0 22 | - pip 23 | - pip: 24 | - duckdb==0.10.0 -------------------------------------------------------------------------------- /alembic/versions/25053f75e09f_add_dask_expr_version_tracking_migration.py: -------------------------------------------------------------------------------- 1 | """Add dask-expr version tracking migration 2 | 3 | Revision ID: 25053f75e09f 4 | Revises: 24749594f367 5 | Create Date: 2024-02-26 08:04:47.704600 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '25053f75e09f' 14 | down_revision = '24749594f367' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.add_column('test_run', sa.Column('dask_expr_version', sa.String(), nullable=True)) 21 | 22 | 23 | def downgrade() -> None: 24 | op.drop_column('test_run', 'dask_expr_version') 25 | -------------------------------------------------------------------------------- /tests/geospatial/test_cloud_optimize.py: -------------------------------------------------------------------------------- 1 | from tests.geospatial.workloads.cloud_optimize import cloud_optimize 2 | 3 | 4 | def test_cloud_optimize( 5 | scale, 6 | s3, 7 | s3_url, 8 | setup_benchmark, 9 | cluster_kwargs={ 10 | "workspace": "dask-benchmarks", 11 | "region": "us-west-2", 12 | }, 13 | scale_kwargs={ 14 | "small": {"n_workers": 10}, 15 | "medium": {"n_workers": 100}, 16 | "large": {"n_workers": 200}, 17 | }, 18 | ): 19 | with setup_benchmark( 20 | **scale_kwargs[scale], **cluster_kwargs 21 | ) as benchmark: # noqa: F841 22 | benchmark(cloud_optimize, scale, fs=s3, storage_url=s3_url) 23 | -------------------------------------------------------------------------------- /alembic/versions/912c8e30690a_remove_tests_in_test_shuffle_py.py: -------------------------------------------------------------------------------- 1 | """Remove tests in test_shuffle.py 2 | 3 | Revision ID: 912c8e30690a 4 | Revises: c38b9d85915e 5 | Create Date: 2023-01-03 20:21:06.704816 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '912c8e30690a' 14 | down_revision = 'c38b9d85915e' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(""" 21 | delete from test_run 22 | where originalname in ('test_shuffle_simple', 'test_shuffle_parquet'); 23 | """) 24 | 25 | 26 | def downgrade() -> None: 27 | pass 28 | -------------------------------------------------------------------------------- /alembic/versions/924e9b1430e1_spark_test_bankruptcy.py: -------------------------------------------------------------------------------- 1 | """spark test bankruptcy 2 | 3 | Revision ID: 924e9b1430e1 4 | Revises: 7d7844fca7cf 5 | Create Date: 2022-09-12 09:49:32.494687 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = "924e9b1430e1" 13 | down_revision = "7d7844fca7cf" 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | delete from test_run 22 | where originalname = 'test_read_spark_generated_data' 23 | and path = 'benchmarks/test_parquet.py'; 24 | """ 25 | ) 26 | 27 | 28 | def downgrade() -> None: 29 | pass 30 | -------------------------------------------------------------------------------- /alembic/versions/149d2048065b_add_default_parameter_to_historical_.py: -------------------------------------------------------------------------------- 1 | """Add default parameter to historical test_basic_sum 2 | 3 | Revision ID: 149d2048065b 4 | Revises: a8785a7b3cae 5 | Create Date: 2022-10-18 15:18:00.603726 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '149d2048065b' 12 | down_revision = 'a8785a7b3cae' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade() -> None: 18 | op.execute( 19 | """ 20 | update test_run 21 | set name = 'test_basic_sum[fast-thin]' 22 | where name == 'test_basic_sum'; 23 | """ 24 | ) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/a97d9375430f_default_parameter_for_test_dataframe_py_.py: -------------------------------------------------------------------------------- 1 | """Default parameter for test_dataframe.py::test_shuffle 2 | 3 | Revision ID: a97d9375430f 4 | Revises: 967e298408ed 5 | Create Date: 2023-01-03 19:36:30.469391 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a97d9375430f' 14 | down_revision = '967e298408ed' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(f""" 21 | update test_run 22 | set name = 'test_shuffle[tasks]' 23 | where name == 'test_shuffle'; 24 | """) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/fa79471ffa8c_declare_bankruptcy_for_test_futures_py.py: -------------------------------------------------------------------------------- 1 | """Declare bankruptcy for test_futures.py 2 | 3 | Revision ID: fa79471ffa8c 4 | Revises: 149d2048065b 5 | Create Date: 2022-10-19 16:21:21.871309 6 | 7 | """ 8 | from alembic import op 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'fa79471ffa8c' 12 | down_revision = '149d2048065b' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade() -> None: 18 | op.execute( 19 | """ 20 | delete from test_run 21 | where originalname in ('test_single_future', 'test_memory_efficient') 22 | and path = 'benchmarks/test_futures.py'; 23 | """ 24 | ) 25 | 26 | 27 | def downgrade() -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /alembic/versions/2764a4f5582b_declare_bankruptcy_for_cluster_startup_.py: -------------------------------------------------------------------------------- 1 | """Declare bankruptcy for cluster startup time 2 | 3 | Revision ID: 2764a4f5582b 4 | Revises: 924e9b1430e1 5 | Create Date: 2022-09-14 11:45:46.024184 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "2764a4f5582b" 14 | down_revision = "924e9b1430e1" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where originalname = 'test_default_cluster_spinup_time' 24 | and path = 'benchmarks/test_coiled.py'; 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | pass 31 | -------------------------------------------------------------------------------- /alembic/versions/9813b7160e69_parametrize_test_large_map.py: -------------------------------------------------------------------------------- 1 | """Parametrize test_large_map 2 | 3 | Revision ID: 9813b7160e69 4 | Revises: f459b2c61eaf 5 | Create Date: 2023-07-05 11:04:08.510205 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = '9813b7160e69' 13 | down_revision = 'f459b2c61eaf' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | update test_run 22 | set name = 'test_large_map[rootish]' 23 | where name == 'test_large_map'; 24 | """ 25 | ) 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | pass 30 | # ### end Alembic commands ### 31 | -------------------------------------------------------------------------------- /alembic/versions/9d6f8ea24ee1_move_h2o_tests.py: -------------------------------------------------------------------------------- 1 | """Move h2o tests 2 | 3 | Revision ID: 9d6f8ea24ee1 4 | Revises: a97d9375430f 5 | Create Date: 2023-01-13 14:29:22.118276 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '9d6f8ea24ee1' 13 | down_revision = 'a97d9375430f' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | update test_run 22 | set path = 'benchmarks/test_h2o.py' 23 | where path = 'benchmarks/h2o/test_h2o_benchmarks.py'; 24 | """ 25 | ) 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | pass 31 | # ### end Alembic commands ### 32 | -------------------------------------------------------------------------------- /tests/benchmarks/test_csv.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pandas as pd 3 | 4 | from ..utils_test import run_up_to_nthreads 5 | 6 | 7 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 8 | def test_csv_basic(small_client): 9 | ddf = dd.read_csv( 10 | "s3://coiled-runtime-ci/nyc-tlc/yellow_tripdata_2019_csv/yellow_tripdata_2019-*.csv", 11 | dtype={ 12 | "payment_type": "UInt8", 13 | "VendorID": "UInt8", 14 | "passenger_count": "UInt8", 15 | "RatecodeID": "UInt8", 16 | }, 17 | blocksize="16 MiB", 18 | ).persist() 19 | 20 | result = ddf.groupby("passenger_count").tip_amount.mean().compute() 21 | 22 | assert isinstance(result, pd.Series) 23 | assert not result.empty 24 | -------------------------------------------------------------------------------- /alembic/versions/f459b2c61eaf_remove_non_upstream_historical_data.py: -------------------------------------------------------------------------------- 1 | """Remove non-upstream historical data 2 | 3 | Revision ID: f459b2c61eaf 4 | Revises: 4ee0e23d96da 5 | Create Date: 2023-05-23 10:39:13.056358 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = 'f459b2c61eaf' 13 | down_revision = '4ee0e23d96da' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | op.execute( 20 | """ 21 | delete from test_run 22 | where ( 23 | coiled_runtime_version <> 'upstream' 24 | and coiled_runtime_version not like 'AB_%' 25 | ) 26 | or python_version like '3.8%'; 27 | """ 28 | ) 29 | 30 | 31 | def downgrade() -> None: 32 | pass 33 | -------------------------------------------------------------------------------- /tests/runtime/test_cluster_creation.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | from coiled import Cluster 4 | 5 | 6 | def test_default_cluster_spinup_time( 7 | benchmark_time, github_cluster_tags, get_cluster_info 8 | ): 9 | """Note: this test must be kept in a separate module from the tests that use the 10 | small_cluster fixture (which has the scope=module) or its child small_client. 11 | This prevents having the small_cluster sitting idle for 5+ minutes while this test 12 | is running. 13 | """ 14 | with benchmark_time: 15 | with Cluster( 16 | name=f"test_default_cluster_spinup_time-{uuid.uuid4().hex[:8]}", 17 | n_workers=1, 18 | tags=github_cluster_tags, 19 | ) as cluster: 20 | with get_cluster_info(cluster): 21 | pass 22 | -------------------------------------------------------------------------------- /alembic/versions/59c5cc87c066_drop_outdated_rechunking_data.py: -------------------------------------------------------------------------------- 1 | """Drop outdated rechunking data 2 | 3 | Revision ID: 59c5cc87c066 4 | Revises: e11cd1aaed38 5 | Create Date: 2024-08-16 15:16:27.114045 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '59c5cc87c066' 14 | down_revision = 'e11cd1aaed38' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where originalname in ( 24 | 'test_adjacent_groups', 25 | 'test_heal_oversplit', 26 | 'test_swap_axes', 27 | 'test_tiles_to_rows' 28 | ) 29 | """ 30 | ) 31 | 32 | 33 | def downgrade() -> None: 34 | pass 35 | -------------------------------------------------------------------------------- /tests/geospatial/test_rechunking.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.rechunking import era5_rechunking 4 | 5 | 6 | def test_era5_rechunking( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | }, 14 | scale_kwargs={ 15 | "small": {"n_workers": 10}, 16 | "medium": {"n_workers": 100}, 17 | "large": {"n_workers": 100}, 18 | }, 19 | ): 20 | with setup_benchmark( 21 | **scale_kwargs[scale], **cluster_kwargs 22 | ) as benchmark: # noqa: F841 23 | benchmark( 24 | era5_rechunking, 25 | scale=scale, 26 | storage_url=gcs_url, 27 | storage_options={"token": CoiledShippedCredentials()}, 28 | ) 29 | -------------------------------------------------------------------------------- /tests/geospatial/test_regridding.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.regridding import xesmf 4 | 5 | 6 | def test_xesmf( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | "wait_for_workers": True, 14 | }, 15 | scale_kwargs={ 16 | "small": {"n_workers": 10}, 17 | "medium": {"n_workers": 10}, 18 | "large": {"n_workers": 10}, 19 | }, 20 | ): 21 | with setup_benchmark( 22 | **scale_kwargs[scale], **cluster_kwargs 23 | ) as benchmark: # noqa: F841 24 | benchmark( 25 | xesmf, 26 | scale=scale, 27 | storage_url=gcs_url, 28 | storage_options={"token": CoiledShippedCredentials()}, 29 | ) 30 | -------------------------------------------------------------------------------- /tests/stability/test_install_plugins.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | from distributed import PipInstall 5 | 6 | from ..utils_test import wait 7 | 8 | 9 | @pytest.mark.parametrize("restart_workers", [True, False]) 10 | def test_private_pip_install(small_client, restart_workers): 11 | small_client.cluster.send_private_envs( 12 | {"PYTHON_STUB_TOKEN": os.environ["PYTHON_STUB_PAT"]} 13 | ) 14 | 15 | plugin = PipInstall( 16 | packages=[ 17 | "python_stub@git+https://${PYTHON_STUB_TOKEN}@github.com/coiled/python-stub.git" 18 | ], 19 | restart_workers=restart_workers, 20 | ) 21 | small_client.register_plugin(plugin) 22 | 23 | def test(x): 24 | from python_stub import stub 25 | 26 | return stub.echo(x) 27 | 28 | fut = small_client.submit(test, "Hello, world!") 29 | wait(fut, small_client, 5 * 60) 30 | -------------------------------------------------------------------------------- /alembic/versions/87cbf883c2be_update_tpch_refactor_from_1094.py: -------------------------------------------------------------------------------- 1 | """Update tpch refactor from #1094 2 | 3 | Revision ID: 87cbf883c2be 4 | 
Revises: b0e8d5f3295d 5 | Create Date: 2023-10-18 20:31:17.848799 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "87cbf883c2be" 14 | down_revision = "b0e8d5f3295d" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = substr(path, length('benchmarks/') + 1) 24 | where path like 'benchmarks/tpch/%'; 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | op.execute( 31 | """ 32 | update test_run 33 | set path = 'benchmarks/' || path 34 | where path like 'tpch/%'; 35 | """ 36 | ) 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: check-yaml 6 | exclude: recipe/meta.yaml 7 | - repo: https://github.com/pycqa/isort 8 | rev: 5.12.0 9 | hooks: 10 | - id: isort 11 | language_version: python3 12 | - repo: https://github.com/asottile/pyupgrade 13 | rev: v3.11.1 14 | hooks: 15 | - id: pyupgrade 16 | args: 17 | - --py39-plus 18 | - repo: https://github.com/psf/black 19 | rev: 23.9.1 20 | hooks: 21 | - id: black 22 | language_version: python3 23 | args: 24 | - --target-version=py39 25 | exclude: ^alembic/versions/ 26 | - repo: https://github.com/pycqa/flake8 27 | rev: 6.1.0 28 | hooks: 29 | - id: flake8 30 | language_version: python3 31 | -------------------------------------------------------------------------------- /alembic/versions/b0e8d5f3295d_update_test_tpch_tpch_test_dask_from_.py: -------------------------------------------------------------------------------- 1 | """Update test_tpch -> tpch/test_dask from #1044 2 | 3 | Revision ID: b0e8d5f3295d 4 | Revises: 78c6e00fee88 5 | Create Date: 2023-10-18 20:14:47.476804 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'b0e8d5f3295d' 14 | down_revision = '78c6e00fee88' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = 'benchmarks/tpch/test_dask.py' 24 | where path = 'benchmarks/test_tpch.py' 25 | """ 26 | ) 27 | 28 | 29 | def downgrade() -> None: 30 | op.execute( 31 | """ 32 | update test_run 33 | set path = 'benchmarks/test_tpch.py' 34 | where path = 'benchmarks/tpch/test_dask.py' 35 | """ 36 | ) 37 | -------------------------------------------------------------------------------- /tests/geospatial/test_atmospheric_circulation.py: -------------------------------------------------------------------------------- 1 | from coiled.credentials.google import CoiledShippedCredentials 2 | 3 | from tests.geospatial.workloads.atmospheric_circulation import atmospheric_circulation 4 | 5 | 6 | def test_atmospheric_circulation( 7 | gcs_url, 8 | scale, 9 | setup_benchmark, 10 | cluster_kwargs={ 11 | "workspace": "dask-benchmarks-gcp", 12 | "region": "us-central1", 13 | }, 14 | scale_kwargs={ 15 | "small": {"n_workers": 10}, 16 | "medium": {"n_workers": 100}, 17 | "large": {"n_workers": 100}, 18 | }, 19 | ): 20 | with setup_benchmark( 21 | **scale_kwargs[scale], **cluster_kwargs 22 | ) as benchmark: # noqa: F841 23 | benchmark( 24 | atmospheric_circulation, 25 | scale=scale, 26 | storage_url=gcs_url, 27 | storage_options={"token": CoiledShippedCredentials()}, 28 | ) 29 | -------------------------------------------------------------------------------- /alembic/versions/2381a77e8487_zarr.py: -------------------------------------------------------------------------------- 1 | """zarr 2 | 3 | Revision ID: 2381a77e8487 4 | Revises: d58983739401 5 | Create Date: 2023-03-13 14:57:02.474967 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '2381a77e8487' 14 | down_revision = 'd58983739401' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set path = 'benchmarks/test_zarr.py' 24 | where path = 'benchmarks/test_array.py' 25 | and originalname in ( 26 | 'test_filter_then_average', 27 | 'test_access_slices', 28 | 'test_sum_residuals' 29 | ) 30 | """ 31 | ) 32 | 33 | 34 | def downgrade() -> None: 35 | # ### commands auto generated by Alembic - please adjust! 
### 36 | pass 37 | # ### end Alembic commands ### 38 | -------------------------------------------------------------------------------- /tests/benchmarks/test_custom.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | from dask import delayed 5 | from dask.utils import parse_bytes 6 | 7 | from ..utils_test import wait 8 | 9 | 10 | def test_jobqueue(small_client): 11 | # Just using dask to run lots of embarrassingly-parallel CPU-bound tasks as fast as possible 12 | nthreads = sum( 13 | w["nthreads"] for w in small_client.scheduler_info()["workers"].values() 14 | ) 15 | max_runtime = 120 16 | max_sleep = 3 17 | n_tasks = round(max_runtime / max_sleep * nthreads) 18 | 19 | @delayed(pure=True) 20 | def task(i: int) -> int: 21 | stuff = "x" * parse_bytes("400MiB") 22 | time.sleep(random.uniform(0, max_sleep)) 23 | del stuff 24 | return i 25 | 26 | tasks = [task(i) for i in range(n_tasks)] 27 | result = delayed(sum)(tasks) # just so we have a single object 28 | 29 | wait( 30 | result, 31 | small_client, 32 | max_runtime * 1.15, 33 | ) 34 | -------------------------------------------------------------------------------- /AB_environments/make_envs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple utility to automate creation of A/B environment files""" 3 | import argparse 4 | import os 5 | import shutil 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser( 10 | description="Create A/B environment files as copies of AB_baseline" 11 | ) 12 | parser.add_argument("name", nargs="+") 13 | names = parser.parse_args().name 14 | 15 | os.chdir(os.path.dirname(__file__)) 16 | for name in names: 17 | if not name.startswith("AB_"): 18 | name = "AB_" + name 19 | for suffix in ("cluster.yaml", "conda.yaml", "dask.yaml", "requirements.in"): 20 | fname = f"{name}.{suffix}" 21 | if os.path.exists(fname): 22 | print(f"{fname} already exists") 23 | else: 24 | print(f"Creating {fname} as a copy of baseline") 25 | shutil.copy(f"AB_baseline.{suffix}", fname) 26 | 27 | 28 | if __name__ == "__main__": 29 | main() 30 | -------------------------------------------------------------------------------- /tests/geospatial/test_satellite_filtering.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from tests.geospatial.workloads.satellite_filtering import satellite_filtering 6 | 7 | 8 | def test_satellite_filtering( 9 | az_url, 10 | scale, 11 | setup_benchmark, 12 | cluster_kwargs={ 13 | "workspace": "dask-benchmarks-azure", 14 | "region": "westeurope", 15 | }, 16 | scale_kwargs={ 17 | "small": {"n_workers": 10}, 18 | "large": {"n_workers": 100}, 19 | }, 20 | ): 21 | if scale not in scale_kwargs.keys(): 22 | pytest.skip(reason=f"{scale=} not implemented") 23 | with setup_benchmark( 24 | **scale_kwargs[scale], 25 | env={ 26 | "AZURE_STORAGE_ACCOUNT_NAME": os.environ["AZURE_STORAGE_ACCOUNT_NAME"], 27 | "AZURE_STORAGE_SAS_TOKEN": os.environ["AZURE_STORAGE_SAS_TOKEN"], 28 | }, 29 | **cluster_kwargs, 30 | ) as benchmark: # noqa: F841 31 | benchmark(satellite_filtering, scale=scale, storage_url=az_url) 32 | -------------------------------------------------------------------------------- /alembic/versions/d58983739401_default_parameter_for_test_rechunk_in_.py: -------------------------------------------------------------------------------- 1 | """Default parameter for test_rechunk_in_memory 2 | 3 | 
Revision ID: d58983739401 4 | Revises: 9d6f8ea24ee1 5 | Create Date: 2023-03-07 11:20:28.558141 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'd58983739401' 14 | down_revision = '9d6f8ea24ee1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute(f""" 21 | update test_run 22 | set name = 'test_rechunk_in_memory[tasks]', 23 | path = 'benchmarks/test_array.py' 24 | where name == 'test_rechunk_in_memory' 25 | and python_version like '3.9%'; 26 | """) 27 | op.execute( 28 | """ 29 | delete from test_run 30 | where name == 'test_rechunk_in_memory' 31 | and python_version not like '3.9%'; 32 | """ 33 | ) 34 | 35 | 36 | def downgrade() -> None: 37 | pass 38 | -------------------------------------------------------------------------------- /tests/stability/test_array.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import dask.array as da 4 | import pytest 5 | 6 | from ..utils_test import cluster_memory, scaled_array_shape, wait 7 | 8 | pytestmark = pytest.mark.stability 9 | 10 | pytest.importorskip("scipy") 11 | 12 | 13 | @pytest.mark.skipif( 14 | sys.platform.startswith("win"), reason="scaled_array_shape fails on windows" 15 | ) 16 | def test_ols(small_client): 17 | chunksize = int(1e6) 18 | memory = cluster_memory(small_client) 19 | target_nbytes = memory * 0.50 20 | target_shape = scaled_array_shape(target_nbytes, ("x", 100)) 21 | num_samples, num_coeffs = target_shape[0], target_shape[-1] 22 | rng = da.random.default_rng() 23 | beta = rng.normal(size=(num_coeffs,)) 24 | X = rng.normal(size=(num_samples, num_coeffs), chunks=(chunksize, -1)) 25 | y = X @ beta + rng.normal(size=(num_samples,), chunks=(chunksize,)) 26 | beta_hat = da.linalg.solve(X.T @ X, X.T @ y) # normal eq'n 27 | y_hat = X @ beta_hat 28 | wait(y_hat, small_client, 20 * 60) 29 | -------------------------------------------------------------------------------- /tests/geospatial/test_zonal_average.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example was adapted from https://github.com/dcherian/dask-demo/blob/main/nwm-aws.ipynb 3 | """ 4 | 5 | import pytest 6 | 7 | from tests.geospatial.workloads.zonal_average import nwm 8 | 9 | 10 | def test_nwm( 11 | scale, 12 | benchmark_type, 13 | setup_benchmark, 14 | cluster_kwargs={ 15 | "workspace": "dask-benchmarks", 16 | "region": "us-east-1", 17 | }, 18 | scale_kwargs={ 19 | "small": {"n_workers": 10}, 20 | "large": {"n_workers": 200, "scheduler_memory": "32 GiB"}, 21 | }, 22 | ): 23 | if benchmark_type == "submission": 24 | pytest.skip( 25 | reason="FIXME: Submission requires pre-computations, but no workers were requested." 
26 | ) 27 | if scale not in scale_kwargs.keys(): 28 | pytest.skip(reason=f"{scale=} not implemented") 29 | with setup_benchmark( 30 | **scale_kwargs[scale], **cluster_kwargs 31 | ) as benchmark: # noqa: F841 32 | benchmark(nwm, scale=scale) 33 | -------------------------------------------------------------------------------- /alembic/versions/a9363331e323_clean_h2o_tests_with_removed_shuffle_.py: -------------------------------------------------------------------------------- 1 | """Clean h2o tests with removed shuffle param 2 | 3 | Revision ID: a9363331e323 4 | Revises: 912c8e30690a 5 | Create Date: 2023-01-03 19:56:22.838577 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a9363331e323' 14 | down_revision = '912c8e30690a' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | for i in [1, 2, 3, 4, 5, 7]: 21 | test = f"test_q{i}" 22 | for ddf_param in ("0.5 GB (csv)", "0.5 GB (parquet)", "5 GB (parquet)"): 23 | op.execute(f""" 24 | update test_run 25 | set name = '{test}[{ddf_param}]' 26 | where name == '{test}[{ddf_param}-tasks]'; 27 | """) 28 | op.execute(f""" 29 | delete from test_run 30 | where name == '{test}[{ddf_param}-p2p]'; 31 | """) 32 | 33 | 34 | def downgrade() -> None: 35 | pass 36 | -------------------------------------------------------------------------------- /tests/runtime/test_build.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import shlex 5 | import subprocess 6 | 7 | import coiled 8 | from packaging.version import Version 9 | 10 | 11 | def test_latest_coiled(): 12 | # Ensure that the conda environment installs the latest version of `coiled` 13 | # FIXME this test can glitch if you install coiled from pip 14 | v_installed = Version(coiled.__version__) 15 | 16 | # Get latest `coiled` release version from conda-forge 17 | output = subprocess.check_output( 18 | shlex.split("conda search --override-channels --json -c conda-forge coiled") 19 | ) 20 | result = json.loads(output) 21 | v_latest = Version(result["coiled"][-1]["version"]) 22 | # conda can lag behind a few days from pip; allow for the next version too 23 | v_allowed = { 24 | v_latest, 25 | Version(f"{v_latest.major}.{v_latest.minor}.{v_latest.micro + 1}"), 26 | Version(f"{v_latest.major}.{v_latest.minor + 1}.0"), 27 | Version(f"{v_latest.major + 1}.0.0"), 28 | } 29 | assert v_installed in v_allowed 30 | -------------------------------------------------------------------------------- /alembic/versions/e11cd1aaed38_add_cluster_spec_to_db.py: -------------------------------------------------------------------------------- 1 | """Add cluster spec to db 2 | 3 | Revision ID: e11cd1aaed38 4 | Revises: 00d5844fd364 5 | Create Date: 2024-04-15 10:32:18.323088 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'e11cd1aaed38' 14 | down_revision = '00d5844fd364' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.add_column('tpch_run', sa.Column('n_workers', sa.Integer(), nullable=True)) 22 | op.add_column('tpch_run', sa.Column('worker_vm_type', sa.String(), nullable=True)) 23 | op.add_column('tpch_run', sa.Column('cluster_disk_size', sa.Integer(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.drop_column('tpch_run', 'cluster_disk_size') 30 | op.drop_column('tpch_run', 'worker_vm_type') 31 | op.drop_column('tpch_run', 'n_workers') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /ci/scripts/dask_config_to_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Read a dask config file and print it out in the format `ENV=VALUE\nENV=VALUE ...` 3 | This script is a work-around to not being able to upload dask config files to 4 | `conda env create`. 5 | """ 6 | from __future__ import annotations 7 | 8 | import sys 9 | from collections.abc import Iterator 10 | 11 | import yaml 12 | 13 | 14 | def main(fname: str) -> None: 15 | with open(fname) as fh: 16 | cfg = yaml.safe_load(fh) 17 | if cfg: 18 | print("\n".join(traverse(cfg, []))) 19 | 20 | 21 | def traverse(node: dict | list | str | float | None, path: list[str]) -> Iterator[str]: 22 | if isinstance(node, dict): 23 | for k, v in node.items(): 24 | k = k.upper().replace("-", "_") 25 | yield from traverse(v, path + [k]) 26 | return 27 | 28 | if not path: 29 | raise ValueError("The top-level element must be a dict") 30 | if isinstance(node, str) and " " in node: 31 | raise ValueError("Unsupported character: whitespace") 32 | 33 | yield "DASK_" + "__".join(path) + f"={node}" 34 | 35 | 36 | if __name__ == "__main__": 37 | main(sys.argv[1]) 38 | -------------------------------------------------------------------------------- /alembic/versions/a8785a7b3cae_add_entry_to_database_for_cluster_name_.py: -------------------------------------------------------------------------------- 1 | """Add entry to database for cluster name/id/details_url 2 | 3 | Revision ID: a8785a7b3cae 4 | Revises: 2764a4f5582b 5 | Create Date: 2022-10-06 14:15:33.618367 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a8785a7b3cae' 14 | down_revision = '2764a4f5582b' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('test_run', sa.Column('cluster_name', sa.String(), nullable=True)) 22 | op.add_column('test_run', sa.Column('cluster_id', sa.Integer(), nullable=True)) 23 | op.add_column('test_run', sa.Column('cluster_details_url', sa.String(), nullable=True)) 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade() -> None: 28 | # ### commands auto generated by Alembic - please adjust! 
###
29 |     op.drop_column('test_run', 'cluster_details_url')
30 |     op.drop_column('test_run', 'cluster_id')
31 |     op.drop_column('test_run', 'cluster_name')
32 |     # ### end Alembic commands ###
--------------------------------------------------------------------------------
/tests/workflows/test_uber_lyft.py:
--------------------------------------------------------------------------------
1 | import dask.dataframe as dd
2 | import pytest
3 | 
4 | pytestmark = pytest.mark.workflows
5 | 
6 | 
7 | @pytest.mark.client("uber_lyft")
8 | def test_exploratory_analysis(client):
9 |     """Run some exploratory aggs on the dataset"""
10 | 
11 |     # NYC taxi Uber/Lyft dataset
12 |     df = dd.read_parquet(
13 |         "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True}
14 |     )
15 | 
16 |     # Preprocessing:
17 |     # - Add a column to indicate company, instead of license number
18 |     # - Add a column to indicate if a tip was given
19 |     taxi_companies = {
20 |         "HV0002": "Juno",
21 |         "HV0003": "Uber",
22 |         "HV0004": "Via",
23 |         "HV0005": "Lyft",
24 |     }
25 |     df["company"] = df.hvfhs_license_num.replace(taxi_companies)
26 |     df["tipped"] = df.tips > 0
27 | 
28 |     # Persist so we only read once
29 |     df = df.persist()
30 | 
31 |     # How many riders tip?
32 |     df.tipped.mean().compute()
33 |     # How many riders tip for each company?
34 |     df.groupby("company").tipped.value_counts().compute()
35 |     # What are those as percentages?
36 |     df.groupby("company").tipped.mean().compute()
--------------------------------------------------------------------------------
/ci/scripts/combine-dbs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -euxo pipefail
3 | 
4 | DB_NAME=${DB_NAME:-'benchmark.db'}
5 | 
6 | alembic upgrade head
7 | 
8 | # Delete old records and vacuum to reduce on-disk size
9 | sqlite3 "$DB_NAME" <<EOF
10 | delete from test_run where not (start > date('now', '-90 days'));
11 | VACUUM;
12 | EOF
13 | # Merge in the individual job dbs into our working copy
14 | for FILE in $(find . -name "*.db")
15 | do
16 |     # Skip the output DB if we see it
17 |     if [ ${FILE##*/} == $DB_NAME ]; then
18 |         echo "Skipping $FILE"
19 |         continue
20 |     fi
21 |     echo "Processing $FILE"
22 |     DB_NAME=$FILE alembic upgrade head
23 |     # Copy the individual table into the primary one. We make an intermediate
24 |     # temp table so that we can null out the primary keys and reset the
25 |     # autoincrementing
26 |     for tab in "tpch_run" "test_run"
27 |     do
28 |         sqlite3 "$FILE" <<EOF
[...]
--------------------------------------------------------------------------------
/alembic/versions/778e617a2886_merge_xarray_reduction_with_quadratic_.py:
--------------------------------------------------------------------------------
[...]
19 | def upgrade() -> None:
20 |     # ### commands auto generated by Alembic - please adjust! ###
21 |     op.execute(
22 |         f"""
23 |         update test_run
24 |         set name = 'test_quadratic_mean[array]',
25 |             path = 'benchmarks/test_array.py'
26 |         where name == 'test_quadratic_mean';
27 |         """
28 |     )
29 |     for backend in ["array", "dataframe"]:
30 |         op.execute(
31 |             f"""
32 |             delete from test_run
33 |             where name == 'test_xarray_reduction[{backend}]';
34 |             """
35 |         )
36 |     # ### end Alembic commands ###
37 | 
38 | 
39 | def downgrade() -> None:
40 |     # ### commands auto generated by Alembic - please adjust! ###
41 |     pass
42 |     # ### end Alembic commands ###
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude = alembic/versions/*
4 | ignore =
5 |     # Extra space in brackets
6 |     E20
7 |     # Line break before binary operator
8 |     W503
9 |     # Line break after binary operator
10 |     W504
11 | 
12 | [isort]
13 | skip = alembic
14 | profile = black
15 | 
16 | [tool:pytest]
17 | addopts = -v -rsxfE --durations=0 --color=yes --strict-markers --strict-config --dist loadscope
18 | markers =
19 |     stability: stability tests; not meant to measure performance
20 |     workflows: workflow tests; expensive to run. Disabled in PRs.
21 |     shuffle_p2p: p2p shuffle engine
22 |     shuffle_tasks: legacy tasks-based shuffle engine
23 |     tpch_dask: dask implementation of the TPCH test suite
24 |     tpch_nondask: competitors' (not dask) implementation of the TPCH test suite
25 |     tpch_correctness: verify correctness of the dask implementation of the TPCH test suite
26 | 
27 | # pytest-timeout settings
28 | # 'thread' kills off the whole test suite. 'signal' only kills the offending test.
29 | # However, 'signal' doesn't work on Windows (due to lack of SIGALRM).
30 | # The 'tests' CI script modifies this config file on the fly for Windows clients.
31 | timeout_method = signal
32 | timeout = 3600
33 | 
--------------------------------------------------------------------------------
/tests/geospatial/workloads/rechunking.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Literal
2 | 
3 | import xarray as xr
4 | from dask.delayed import Delayed
5 | 
6 | 
7 | def era5_rechunking(
8 |     scale: Literal["small", "medium", "large"],
9 |     storage_url: str,
10 |     storage_options: dict[str, Any],
11 | ) -> Delayed:
12 |     ds = xr.open_zarr(
13 |         "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
14 |     ).drop_encoding()
15 | 
16 |     if scale == "small":
17 |         # 101.83 GiB (small)
18 |         time_range = slice("2020-01-01", "2023-01-01")
19 |         variables = ["sea_surface_temperature"]
20 |     elif scale == "medium":
21 |         # 2.12 TiB (medium)
22 |         time_range = slice(None)
23 |         variables = ["sea_surface_temperature"]
24 |     else:
25 |         # 4.24 TiB (large)
26 |         # This currently doesn't complete successfully.
27 |         time_range = slice(None)
28 |         variables = ["sea_surface_temperature", "snow_depth"]
29 |     subset = ds[variables].sel(time=time_range)
30 | 
31 |     # Rechunk
32 |     result = subset.chunk({"time": -1, "longitude": "auto", "latitude": "auto"})
33 | 
34 |     # Write result to cloud storage
35 |     return result.to_zarr(storage_url, storage_options=storage_options, compute=False)
36 | 
--------------------------------------------------------------------------------
/alembic/versions/24749594f367_add_prometheus_metrics.py:
--------------------------------------------------------------------------------
1 | """Add prometheus metrics
2 | 
3 | Revision ID: 24749594f367
4 | Revises: 1c2fe9d527e4
5 | Create Date: 2023-10-16 18:43:35.402355
6 | 
7 | """
8 | from alembic import op
9 | import sqlalchemy as sa
10 | 
11 | 
12 | # revision identifiers, used by Alembic.
13 | revision = '24749594f367'
14 | down_revision = '1c2fe9d527e4'
15 | branch_labels = None
16 | depends_on = None
17 | 
18 | 
19 | def upgrade() -> None:
20 |     # ### commands auto generated by Alembic - please adjust!
### 21 | op.add_column('test_run', sa.Column('scheduler_cpu_avg', sa.Float(), nullable=True)) 22 | op.add_column('test_run', sa.Column('scheduler_memory_max', sa.Float(), nullable=True)) 23 | op.add_column('test_run', sa.Column('worker_max_tick', sa.Float(), nullable=True)) 24 | op.add_column('test_run', sa.Column('scheduler_max_tick', sa.Float(), nullable=True)) 25 | # ### end Alembic commands ### 26 | 27 | 28 | def downgrade() -> None: 29 | # ### commands auto generated by Alembic - please adjust! ### 30 | op.drop_column('test_run', 'scheduler_max_tick') 31 | op.drop_column('test_run', 'worker_max_tick') 32 | op.drop_column('test_run', 'scheduler_memory_max') 33 | op.drop_column('test_run', 'scheduler_cpu_avg') 34 | # ### end Alembic commands ### 35 | -------------------------------------------------------------------------------- /alembic/versions/1c2fe9d527e4_expand_rechunk_parameters.py: -------------------------------------------------------------------------------- 1 | """Expand rechunk parameters 2 | 3 | Revision ID: 1c2fe9d527e4 4 | Revises: 87cbf883c2be 5 | Create Date: 2023-10-25 16:26:23.813378 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '1c2fe9d527e4' 14 | down_revision = '87cbf883c2be' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | for test in ("test_swap_axes", "test_tiles_to_rows"): 21 | for multiplier in (0.1, 1): 22 | op.execute( 23 | f""" 24 | update test_run 25 | set name = '{test}[{multiplier}-128 MiB-p2p-disk]' 26 | where name == '{test}[{multiplier}-128 MiB-p2p]' 27 | and path == 'benchmarks/test_rechunk.py'; 28 | """ 29 | ) 30 | 31 | for test in ("test_rechunk_in_memory", "test_rechunk_striping", "test_rechunk_swap_axes"): 32 | op.execute( 33 | f""" 34 | delete from test_run 35 | where originalname == '{test}' 36 | and path == 'benchmarks/test_array.py'; 37 | """ 38 | ) 39 | 40 | 41 | def downgrade() -> None: 42 | # ### commands auto generated by Alembic - please adjust! 
### 43 | pass 44 | # ### end Alembic commands ### 45 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/zonal_average.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import flox.xarray 4 | import fsspec 5 | import numpy as np 6 | import rioxarray 7 | import xarray as xr 8 | 9 | 10 | def nwm( 11 | scale: Literal["small", "medium", "large"], 12 | ) -> xr.DataArray: 13 | ds = xr.open_zarr( 14 | "s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr", consolidated=True 15 | ) 16 | 17 | if scale == "small": 18 | # 6.03 TiB 19 | time_range = slice("2020-01-01", "2020-12-31") 20 | else: 21 | # 252.30 TiB 22 | time_range = slice("1979-02-01", "2020-12-31") 23 | subset = ds.zwattablrt.sel(time=time_range) 24 | 25 | counties = rioxarray.open_rasterio( 26 | "s3://nwm-250m-us-counties/Counties_on_250m_grid.tif", 27 | chunks="auto", 28 | opener=fsspec.open, 29 | ).squeeze() 30 | 31 | # Remove any small floating point error in coordinate locations 32 | _, counties_aligned = xr.align(subset, counties, join="override") 33 | counties_aligned = counties_aligned.persist() 34 | 35 | county_id = np.unique(counties_aligned.data).compute() 36 | county_id = county_id[county_id != 0] 37 | county_mean = flox.xarray.xarray_reduce( 38 | subset, 39 | counties_aligned.rename("county"), 40 | func="mean", 41 | expected_groups=(county_id,), 42 | ) 43 | return county_mean 44 | -------------------------------------------------------------------------------- /alembic/versions/4ee0e23d96da_compressible_variant_of_tests.py: -------------------------------------------------------------------------------- 1 | """compressible variant of tests 2 | 3 | Revision ID: 4ee0e23d96da 4 | Revises: 2381a77e8487 5 | Create Date: 2023-03-14 16:13:23.809226 6 | 7 | """ 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '4ee0e23d96da' 13 | down_revision = '2381a77e8487' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade() -> None: 19 | for name in ( 20 | "test_anom_mean", 21 | "test_vorticity", 22 | "test_double_diff", 23 | "test_dot_product", 24 | "test_map_overlap_sample", 25 | ): 26 | op.execute( 27 | f""" 28 | update test_run 29 | set name = '{name}[uncompressible]', 30 | path = 'benchmarks/test_array.py' 31 | where name == '{name}'; 32 | """ 33 | ) 34 | op.execute( 35 | """ 36 | delete from test_run 37 | where path = 'benchmarks/test_spill.py' 38 | and name in ( 39 | 'test_dot_product_spill[compressible]', 40 | 'test_spilling[compressible-keep]', 41 | 'test_spilling[compressible-release]' 42 | ) 43 | """ 44 | ) 45 | 46 | 47 | def downgrade() -> None: 48 | # ### commands auto generated by Alembic - please adjust! 
### 49 | pass 50 | # ### end Alembic commands ### 51 | -------------------------------------------------------------------------------- /tests/runtime/test_xgboost.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pytest 3 | 4 | dask_ml = pytest.importorskip("dask_ml") 5 | dxgb = pytest.importorskip("xgboost.dask") 6 | 7 | 8 | def test_xgboost_distributed_training(small_client): 9 | ddf = dd.read_parquet( 10 | "s3://coiled-datasets/synthetic-data/synth-reg-104GB.parquet", 11 | storage_options={"anon": True}, 12 | ) 13 | ddf = ddf.partitions[0:30] 14 | ddf = ddf.persist() 15 | 16 | # Create the train-test split 17 | X, y = ddf.iloc[:, :-1], ddf["target"] 18 | X_train, X_test, y_train, y_test = dask_ml.model_selection.train_test_split( 19 | X, y, test_size=0.3, shuffle=True, random_state=21 20 | ) 21 | 22 | # Create the XGBoost DMatrix for our training and testing splits 23 | dtrain = dxgb.DaskDMatrix(small_client, X_train, y_train) 24 | dtest = dxgb.DaskDMatrix(small_client, X_test, y_test) 25 | 26 | # Set model parameters (XGBoost defaults) 27 | params = { 28 | "max_depth": 6, 29 | "gamma": 0, 30 | "eta": 0.3, 31 | "min_child_weight": 30, 32 | "objective": "reg:squarederror", 33 | "grow_policy": "depthwise", 34 | } 35 | output = dxgb.train( 36 | small_client, params, dtrain, num_boost_round=5, evals=[(dtrain, "train")] 37 | ) 38 | 39 | # make predictions 40 | y_pred = dxgb.predict(small_client, output, dtest) 41 | assert y_pred.shape[0] == y_test.shape[0].compute() 42 | -------------------------------------------------------------------------------- /alembic/versions/967e298408ed_test_spill.py: -------------------------------------------------------------------------------- 1 | """test_spill 2 | 3 | Revision ID: 967e298408ed 4 | Revises: a9363331e323 5 | Create Date: 2023-01-09 17:05:13.568510 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '967e298408ed' 14 | down_revision = 'a9363331e323' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | update test_run 23 | set name = 'test_spilling[uncompressible-release]', 24 | path = 'benchmarks/test_spill.py' 25 | where name == 'test_spilling[False]' 26 | and python_version like '3.9%'; 27 | """ 28 | ) 29 | op.execute( 30 | """ 31 | update test_run 32 | set name = 'test_spilling[uncompressible-keep]', 33 | path = 'benchmarks/test_spill.py' 34 | where name == 'test_spilling[True]' 35 | and python_version like '3.9%'; 36 | """ 37 | ) 38 | op.execute( 39 | """ 40 | delete from test_run 41 | where originalname = 'test_spilling' 42 | and python_version not like '3.9%'; 43 | """ 44 | ) 45 | op.execute( 46 | """ 47 | delete from test_run 48 | where originalname = 'test_tensordot_stress'; 49 | """ 50 | ) 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! 
### 55 | pass 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /alembic/versions/78c6e00fee88_remove_task_based_shuffle.py: -------------------------------------------------------------------------------- 1 | """Remove task based shuffle 2 | 3 | Revision ID: 78c6e00fee88 4 | Revises: 778e617a2886 5 | Create Date: 2023-10-19 15:26:04.281985 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = "78c6e00fee88" 14 | down_revision = "778e617a2886" 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | op.execute( 21 | """ 22 | delete from test_run 23 | where ( 24 | originalname in ('test_shuffle', 'test_cluster_reconnect') 25 | or name like 'test_join_big[%tasks%]' 26 | or name like 'test_set_index[%tasks%]' 27 | ) 28 | """ 29 | ) 30 | op.execute( 31 | """ 32 | update test_run 33 | set name = 'test_join_big[1]' 34 | where name == 'test_join_big[1-p2p]'; 35 | """ 36 | ) 37 | op.execute( 38 | """ 39 | update test_run 40 | set name = 'test_join_big[0.1]' 41 | where name == 'test_join_big[0.1-p2p]'; 42 | """ 43 | ) 44 | for b in [True, False]: 45 | for factor in [0.1, 1]: 46 | op.execute( 47 | f""" 48 | update test_run 49 | set name = 'test_set_index[{factor}-{b}]' 50 | where name == 'test_set_index[{factor}-p2p-{b}]'; 51 | """ 52 | ) 53 | 54 | 55 | def downgrade() -> None: 56 | pass 57 | -------------------------------------------------------------------------------- /tests/tpch/test_optimization.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from . import dask_queries 4 | 5 | pytestmark = pytest.mark.tpch_dask 6 | 7 | 8 | @pytest.fixture( 9 | params=[ 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6, 16 | 7, 17 | 8, 18 | 9, 19 | 10, 20 | 11, 21 | 12, 22 | 13, 23 | 14, 24 | 15, 25 | 16, 26 | 17, 27 | 18, 28 | 19, 29 | 20, 30 | 21, 31 | 22, 32 | ], 33 | ) 34 | def query(request): 35 | return request.param 36 | 37 | 38 | def test_optimization(query, dataset_path, fs, client, scale): 39 | func = getattr(dask_queries, f"query_{query:02d}") 40 | result = func(dataset_path, fs, scale) 41 | # We need to inject .repartition(npartitions=1) which .compute() does under the hood 42 | result.repartition(npartitions=1).optimize() 43 | 44 | 45 | @pytest.mark.skip( 46 | reason="This test does not work. See FIXME and https://github.com/dask/distributed/issues/8833." 47 | ) 48 | def test_delay_computation_start(query, dataset_path, fs, client, scale): 49 | func = getattr(dask_queries, f"query_{query:02d}") 50 | result = func(dataset_path, fs, scale).optimize() 51 | # FIXME: Client.compute unblocks only until the graph is serialized and put onto 52 | # the comm buffer. It should wait until update_graph finishes, i.e. graph is 53 | # submitted, parsed, and the tasks have been added onto the scheduler. 54 | client.compute(result) 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2022, Coiled 4 | All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /tests/stability/test_deadlock.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import dask 4 | import distributed 5 | import pytest 6 | from coiled import Cluster 7 | from distributed import Client, wait 8 | from packaging.version import Version 9 | 10 | pytestmark = pytest.mark.stability 11 | 12 | 13 | @pytest.mark.skipif( 14 | Version(distributed.__version__) < Version("2022.4.2"), 15 | reason="https://github.com/dask/distributed/issues/6110", 16 | ) 17 | def test_repeated_merge_spill( 18 | benchmark_all, 19 | cluster_kwargs, 20 | dask_env_variables, 21 | github_cluster_tags, 22 | ): 23 | with Cluster( 24 | name=f"test_repeated_merge_spill-{uuid.uuid4().hex[:8]}", 25 | environ=dask_env_variables, 26 | tags=github_cluster_tags, 27 | **cluster_kwargs["test_repeated_merge_spill"], 28 | ) as cluster: 29 | with Client(cluster) as client: 30 | with benchmark_all(client): 31 | ddf = dask.datasets.timeseries( 32 | "2020", 33 | "2025", 34 | partition_freq="2w", 35 | ) 36 | ddf2 = dask.datasets.timeseries( 37 | "2020", 38 | "2023", 39 | partition_freq="2w", 40 | ) 41 | 42 | for _ in range(10): 43 | client.restart() 44 | fs = client.compute((ddf.x + ddf.y).mean()) 45 | 46 | wait(fs, timeout=2 * 60) 47 | del fs 48 | 49 | ddf3 = ddf.merge(ddf2) 50 | fs = client.compute((ddf3.x + ddf3.y).mean()) 51 | 52 | wait(fs, timeout=2 * 60) 53 | del fs 54 | -------------------------------------------------------------------------------- /alembic/versions/c38b9d85915e_default_parameter_for_shuffling_tests.py: -------------------------------------------------------------------------------- 1 | """Default parameter for shuffling tests 2 | 3 | Revision ID: c38b9d85915e 4 | Revises: fa79471ffa8c 5 | Create Date: 2022-12-23 09:05:57.440944 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'c38b9d85915e' 14 | down_revision = 'fa79471ffa8c' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def h2o_update_query(test: str, ddf: str) -> str: 20 | return f""" 21 | update test_run 22 | set name = '{test}[{ddf}-tasks]' 23 | where name == '{test}[{ddf}]'; 24 | """ 25 | 26 | def rename_h2o_tests() -> None: 27 | for i in range(1, 10): 28 | test = f"test_q{i}" 29 | for ddf_param in ("0.5 GB (csv)", "0.5 GB (parquet)", "5 GB (parquet)"): 30 | op.execute(f""" 31 | update test_run 32 | set name = '{test}[{ddf_param}-tasks]' 33 | where name == '{test}[{ddf_param}]'; 34 | """) 35 | 36 | def rename_join_tests() -> None: 37 | for test in ("test_join_big", "test_join_big_small"): 38 | op.execute(f""" 39 | update test_run 40 | set name = '{test}[0.1-tasks]' 41 | where name == '{test}[0.1]'; 42 | """) 43 | 44 | def rename_shuffle_tests() -> None: 45 | for test in ("test_shuffle_parquet", "test_shuffle_simple"): 46 | op.execute(f""" 47 | update test_run 48 | set name = '{test}[tasks]' 49 | where name == '{test}'; 50 | """) 51 | 52 | def upgrade() -> None: 53 | rename_h2o_tests() 54 | rename_join_tests() 55 | rename_shuffle_tests() 56 | 57 | 58 | def downgrade() -> None: 59 | pass 60 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/regridding.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import numpy as np 4 | import xarray as xr 5 | import xesmf as xe 6 | from dask.delayed import Delayed 7 | 8 | 9 | def xesmf( 10 | scale: Literal["small", "medium", "large"], 11 | storage_url: str, 12 | storage_options: dict[str, Any], 13 | ) -> Delayed: 14 | ds = xr.open_zarr( 15 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr", 16 | ) 17 | # Fixed time range and variable as the interesting part of this benchmark scales with the 18 | # regridding matrix 19 | ds = ds[["sea_surface_temperature"]].sel(time=slice("2020-01-01", "2021-12-31")) 20 | if scale == "small": 21 | # Regridding from a resolution of 0.25 degress to 1 degrees 22 | # results in 4 MiB weight matrix 23 | output_resolution = 1 24 | elif scale == "medium": 25 | # Regridding from a resolution of 0.25 degrees to 0.2 degrees 26 | # results in 100 MiB weight matrix 27 | output_resolution = 0.2 28 | else: 29 | # Regridding from a resolution of 0.25 degrees to 0.05 degrees 30 | # results in 1.55 GiB weight matrix 31 | output_resolution = 0.05 32 | 33 | out_grid = xr.Dataset( 34 | { 35 | "latitude": ( 36 | ["latitude"], 37 | np.arange(90, -90 - output_resolution, -output_resolution), 38 | {"units": "degrees_north"}, 39 | ), 40 | "longitude": ( 41 | ["longitude"], 42 | np.arange(0, 360, output_resolution), 43 | {"units": "degrees_east"}, 44 | ), 45 | } 46 | ) 47 | regridder = xe.Regridder(ds, out_grid, "bilinear", periodic=True) 48 | regridded = regridder(ds, keep_attrs=True) 49 | 50 | result = regridded.chunk(time="auto") 51 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 52 | -------------------------------------------------------------------------------- /ci/environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | dependencies: 4 | ######################################################## 5 | # PLEASE READ: 6 | # When modifying this file, you must also align to match 7 | # - AB_environments/AB_baseline.conda.yaml 8 | # - 
AB_environments/AB_sample.conda.yaml 9 | ######################################################## 10 | 11 | - python >=3.10 12 | - pip 13 | - coiled >=0.2.54 14 | - numpy ==2.0.2 15 | - pandas ==2.2.3 16 | - dask ==2024.11.2 17 | - distributed ==2024.11.2 18 | - dask-labextension ==7.0.0 19 | - dask-ml ==2024.4.4 20 | - fsspec ==2024.10.0 21 | - s3fs ==2024.10.0 22 | - gcsfs ==2024.10.0 23 | - pyarrow ==18.1.0 24 | - jupyterlab ==4.3.1 25 | - lz4 ==4.3.3 26 | - ipywidgets ==8.1.5 27 | - numba ==0.60.0 28 | - scikit-learn ==1.5.2 29 | - ipycytoscape ==1.3.3 30 | - click ==8.1.7 31 | - xarray ==2024.11.0 32 | - flox ==0.9.15 33 | - zarr ==2.18.3 34 | - cftime ==1.6.4 35 | - msgpack-python 36 | - cloudpickle ==3.1.0 37 | - tornado ==6.4.2 38 | - toolz ==1.0.0 39 | - zict ==3.0.0 40 | - xgboost ==3.0.2 41 | - optuna ==4.1.0 42 | - optuna-integration ==4.1.0 43 | - scipy ==1.14.1 44 | - sqlalchemy ==2.0.36 45 | - pynvml ==11.5.3 46 | - bokeh ==3.6.1 47 | - gilknocker ==0.4.1 48 | - openssl >1.1.0g 49 | - rasterio >=1.4.0 50 | - rioxarray ==0.17.0 51 | - h5netcdf ==1.4.1 52 | - xesmf ==0.8.7 53 | - bottleneck ==1.4.2 54 | - geojson ==3.1.0 55 | - planetary-computer ==1.0.0 56 | - pystac-client ==0.8.5 57 | - odc-stac ==0.3.10 58 | - adlfs ==2024.7.0 59 | # https://github.com/coiled/benchmarks/issues/1616 60 | - cryptography ==43.0.3 61 | - pyopenssl ==24.2.1 62 | 63 | ######################################################## 64 | # PLEASE READ: 65 | # When modifying this file, you must also align to match 66 | # - AB_environments/AB_baseline.conda.yaml 67 | # - AB_environments/AB_sample.conda.yaml 68 | ######################################################## 69 | -------------------------------------------------------------------------------- /tests/geospatial/test_climatology.py: -------------------------------------------------------------------------------- 1 | """This benchmark is a port of the climatology computation implemented in 2 | https://github.com/google-research/weatherbench2/blob/47d72575cf5e99383a09bed19ba989b718d5fe30/scripts/compute_climatology.py 3 | with the parameters 4 | 5 | FREQUENCY = "hourly" 6 | HOUR_INTERVAL = 6 7 | WINDOW_SIZE = 61 8 | STATISTICS = ["mean"] 9 | METHOD = "explicit" 10 | """ 11 | 12 | from coiled.credentials.google import CoiledShippedCredentials 13 | 14 | from tests.geospatial.workloads.climatology import highlevel_api, rechunk_map_blocks 15 | 16 | 17 | def test_rechunk_map_blocks( 18 | gcs_url, 19 | scale, 20 | setup_benchmark, 21 | cluster_kwargs={ 22 | "workspace": "dask-benchmarks-gcp", 23 | "region": "us-central1", 24 | }, 25 | scale_kwargs={ 26 | "small": {"n_workers": 10}, 27 | "medium": {"n_workers": 100}, 28 | "large": {"n_workers": 100}, 29 | }, 30 | ): 31 | with setup_benchmark( 32 | **scale_kwargs[scale], **cluster_kwargs 33 | ) as benchmark: # noqa: F841 34 | benchmark( 35 | rechunk_map_blocks, 36 | scale=scale, 37 | storage_url=gcs_url, 38 | storage_options={"token": CoiledShippedCredentials()}, 39 | ) 40 | 41 | 42 | def test_highlevel_api( 43 | gcs_url, 44 | scale, 45 | setup_benchmark, 46 | cluster_kwargs={ 47 | "workspace": "dask-benchmarks-gcp", 48 | "region": "us-central1", 49 | "idle_timeout": "1h", 50 | }, 51 | scale_kwargs={ 52 | "small": {"n_workers": 10}, 53 | "medium": {"n_workers": 100}, 54 | "large": {"n_workers": 100}, 55 | }, 56 | ): 57 | with setup_benchmark( 58 | **scale_kwargs[scale], **cluster_kwargs 59 | ) as benchmark: # noqa: F841 60 | benchmark( 61 | highlevel_api, 62 | scale=scale, 63 | storage_url=gcs_url, 64 | 
storage_options={"token": CoiledShippedCredentials()}, 65 | ) 66 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/atmospheric_circulation.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import xarray as xr 4 | from dask.delayed import Delayed 5 | 6 | 7 | def atmospheric_circulation( 8 | scale: Literal["small", "medium", "large"], 9 | storage_url: str, 10 | storage_options: dict[str, Any], 11 | ) -> Delayed: 12 | ds = xr.open_zarr( 13 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr", 14 | chunks={}, 15 | ) 16 | if scale == "small": 17 | # 852.56 GiB (small) 18 | time_range = slice("2020-01-01", "2020-02-01") 19 | elif scale == "medium": 20 | # 28.54 TiB (medium) 21 | time_range = slice("2020-01-01", "2023-01-01") 22 | else: 23 | # 608.42 TiB (large) 24 | time_range = slice(None) 25 | ds = ds.sel(time=time_range) 26 | 27 | ds = ds[ 28 | [ 29 | "u_component_of_wind", 30 | "v_component_of_wind", 31 | "temperature", 32 | "vertical_velocity", 33 | ] 34 | ].rename( 35 | { 36 | "u_component_of_wind": "U", 37 | "v_component_of_wind": "V", 38 | "temperature": "T", 39 | "vertical_velocity": "W", 40 | } 41 | ) 42 | 43 | zonal_means = ds.mean("longitude") 44 | anomaly = ds - zonal_means 45 | 46 | anomaly["uv"] = anomaly.U * anomaly.V 47 | anomaly["vt"] = anomaly.V * anomaly.T 48 | anomaly["uw"] = anomaly.U * anomaly.W 49 | 50 | temdiags = zonal_means.merge(anomaly[["uv", "vt", "uw"]].mean("longitude")) 51 | 52 | # This is incredibly slow, takes a while for flox to construct the graph 53 | daily = temdiags.resample(time="D").mean() 54 | 55 | # # Users often rework things via a rechunk to make this a blockwise problem 56 | # daily = ( 57 | # temdiags.chunk(time=24) 58 | # .resample(time="D") 59 | # .mean() 60 | # ) 61 | 62 | return daily.to_zarr(storage_url, storage_options=storage_options, compute=False) 63 | -------------------------------------------------------------------------------- /plugins.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of useful plugins for monitoring clusters. 3 | """ 4 | import sys 5 | from collections import defaultdict 6 | 7 | import cloudpickle 8 | from distributed.diagnostics import SchedulerPlugin 9 | 10 | # Tell cloudpickle we want to register objects in this module by value, 11 | # so we can send them to the scheduler without the files existing there. 
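# As a rough usage sketch only (the real wiring lives in this repository's test
# fixtures; the client calls below are standard distributed APIs, shown purely
# as an illustration of how the Durations plugin defined below is driven):
#
#     client.register_scheduler_plugin(Durations())
#     client.sync(client.scheduler.start_tracking_durations)
#     ...  # run the workload being measured
#     durations = client.sync(client.scheduler.get_durations)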
12 | cloudpickle.register_pickle_by_value(sys.modules[__name__]) 13 | 14 | 15 | class Durations(SchedulerPlugin): 16 | def __init__(self): 17 | """Initialize the plugin""" 18 | self.durations = defaultdict(float) 19 | self.scheduler = None 20 | self._tracking = False 21 | # Big hack to trigger cloudpickle serialization for distributed < 2022.7.0 22 | # https://github.com/dask/distributed/pull/6466 23 | self.__main__ = "__main__" 24 | 25 | def start(self, scheduler): 26 | """Called on scheduler start as well as on registration time""" 27 | self.scheduler = scheduler 28 | scheduler.handlers["get_durations"] = self.get_durations 29 | scheduler.handlers["start_tracking_durations"] = self.start_tracking 30 | scheduler.handlers["stop_tracking_durations"] = self.stop_tracking 31 | 32 | def start_tracking(self, comm): 33 | self._tracking = True 34 | self.durations.clear() 35 | 36 | def stop_tracking(self, comm): 37 | self._tracking = False 38 | 39 | def transition(self, key, start, finish, *args, **kwargs): 40 | """On key transition to memory, update the duration data""" 41 | if not self._tracking: 42 | return 43 | 44 | if start == "processing" and finish == "memory": 45 | startstops = kwargs.get("startstops") 46 | if not startstops: 47 | return 48 | 49 | for ss in startstops: 50 | self.durations[ss["action"]] += max(ss["stop"] - ss["start"], 0) 51 | 52 | async def get_durations(self, comm): 53 | return dict(self.durations) 54 | 55 | def restart(self, scheduler): 56 | self.durations.clear() 57 | -------------------------------------------------------------------------------- /tests/workflows/test_xgboost_optuna.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dask.distributed import wait 3 | 4 | pytestmark = pytest.mark.workflows 5 | 6 | optuna = pytest.importorskip("optuna") 7 | xgb = pytest.importorskip("xgboost") 8 | pytest.importorskip("sklearn") 9 | 10 | from optuna.samplers import RandomSampler # noqa: E402 11 | from optuna_integration import DaskStorage # noqa: E402 12 | from sklearn.datasets import fetch_covtype # noqa: E402 13 | from sklearn.model_selection import KFold, cross_val_score # noqa: E402 14 | from sklearn.preprocessing import LabelEncoder # noqa: E402 15 | 16 | 17 | @pytest.mark.client("xgboost_optuna") 18 | def test_hpo(client): 19 | # We use a random sampler with a seed to get deterministic results. 20 | # This is just for benchmarking purposes. 
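    # DaskStorage proxies the Optuna storage through the Dask scheduler (per the
    # optuna-integration documentation), so the `study` object can be shipped to
    # workers via client.submit further below while all trials are still recorded
    # in a single shared study.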
21 | study = optuna.create_study( 22 | direction="maximize", storage=DaskStorage(), sampler=RandomSampler(seed=2) 23 | ) 24 | 25 | def objective(trial): 26 | # Dataset (241.59 MiB) from http://archive.ics.uci.edu/ml/datasets/covertype 27 | X, y = fetch_covtype(return_X_y=True) 28 | 29 | # Format training labels 30 | le = LabelEncoder() 31 | y = le.fit_transform(y) 32 | 33 | # Get hyperparameter values for this trial and model score 34 | params = { 35 | "n_estimators": trial.suggest_int("n_estimators", 2, 10), 36 | "max_depth": trial.suggest_int("max_depth", 2, 10), 37 | "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0, log=True), 38 | "subsample": trial.suggest_float("subsample", 0.2, 1.0), 39 | "n_jobs": 1, # Avoid thread oversubscription 40 | } 41 | model = xgb.XGBClassifier(**params) 42 | cv = KFold(n_splits=3, shuffle=True, random_state=2) 43 | score = cross_val_score(model, X, y, cv=cv) 44 | return score.mean() 45 | 46 | # Run HPO trials on a cluster 47 | n_trials = 200 48 | futures = [ 49 | client.submit(study.optimize, objective, n_trials=1, pure=False) 50 | for _ in range(n_trials) 51 | ] 52 | wait(futures) 53 | assert len(study.trials) >= n_trials 54 | -------------------------------------------------------------------------------- /AB_environments/AB_baseline.conda.yaml: -------------------------------------------------------------------------------- 1 | # Special environment file for A/B testing, used to define the conda environment for the 2 | # baseline environment. 3 | # Change contents, but do not rename. 4 | 5 | channels: 6 | - conda-forge 7 | dependencies: 8 | - python =3.10 # Single '=' means latest patch version available 9 | - memray ==1.13.4 10 | # Copy-paste from ci/environment.yml 11 | - pip 12 | - coiled >=0.2.54 13 | - numpy ==2.0.2 14 | - pandas ==2.2.3 15 | - dask ==2024.11.2 16 | - distributed ==2024.11.2 17 | - dask-labextension ==7.0.0 18 | - dask-ml ==2024.4.4 19 | - fsspec ==2024.10.0 20 | - s3fs ==2024.10.0 21 | - gcsfs ==2024.10.0 22 | - pyarrow ==18.1.0 23 | - jupyterlab ==4.3.1 24 | - lz4 ==4.3.3 25 | - ipywidgets ==8.1.5 26 | - numba ==0.60.0 27 | - scikit-learn ==1.5.2 28 | - ipycytoscape ==1.3.3 29 | - click ==8.1.7 30 | - xarray ==2024.11.0 31 | - flox ==0.9.15 32 | - zarr ==2.18.3 33 | - cftime ==1.6.4 34 | - msgpack-python 35 | - cloudpickle ==3.1.0 36 | - tornado ==6.4.2 37 | - toolz ==1.0.0 38 | - zict ==3.0.0 39 | - xgboost ==3.0.2 40 | - optuna ==4.1.0 41 | - optuna-integration ==4.1.0 42 | - scipy ==1.14.1 43 | - sqlalchemy ==2.0.36 44 | - pynvml ==11.5.3 45 | - bokeh ==3.6.1 46 | - gilknocker ==0.4.1 47 | - openssl >1.1.0g 48 | - rasterio >=1.4.0 49 | - rioxarray ==0.17.0 50 | - h5netcdf ==1.4.1 51 | - xesmf ==0.8.7 52 | - bottleneck ==1.4.2 53 | - geojson ==3.1.0 54 | - planetary-computer ==1.0.0 55 | - pystac-client ==0.8.5 56 | - odc-stac ==0.3.10 57 | - adlfs ==2024.7.0 58 | # https://github.com/coiled/benchmarks/issues/1616 59 | - cryptography ==43.0.3 60 | - pyopenssl ==24.2.1 61 | # End copy-paste 62 | 63 | - pip: 64 | # Make sure you install dask and distributed either both from pip or both from 65 | # conda. You may alternatively point to your own git fork (but make sure you 66 | # sync'ed tags!) 67 | # Read README.md for troubleshooting. 
68 | # - git+https://github.com/dask/dask@191d39177009d2cce25b818878118e35329b6db3 69 | # - git+https://github.com/dask/distributed@0304fb6e665e36abf9e3086173cccd36e29ae84d 70 | -------------------------------------------------------------------------------- /tests/benchmarks/test_zarr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask.array as da 4 | import pytest 5 | 6 | from ..utils_test import run_up_to_nthreads, wait 7 | 8 | pytest.importorskip("zarr") 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def zarr_dataset(): 13 | # shape = (2000, 2000, 2000) 14 | # chunks = (200, 200, 200) 15 | # Compresses to ~42% of its original size (tested on lz4 4.0) 16 | store = ( 17 | "s3://coiled-runtime-ci/synthetic-zarr/synth_random_int_array_2000_cubed.zarr" 18 | ) 19 | return da.from_zarr(store) 20 | 21 | 22 | @pytest.fixture(scope="module") 23 | def cmip6(): 24 | xarray = pytest.importorskip("xarray") 25 | pytest.importorskip("cftime") 26 | 27 | store = "s3://coiled-runtime-ci/CMIP6/CMIP/AS-RCEC/TaiESM1/1pctCO2/r1i1p1f1/Amon/zg/gn/v20200225/" 28 | return xarray.open_dataset(store, engine="zarr", chunks={}) 29 | 30 | 31 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 32 | @pytest.mark.parametrize("threshold", [50, 100, 200, 255]) 33 | def test_filter_then_average(small_client, zarr_dataset, threshold): 34 | """Compute the mean for increasingly sparse boolean filters of an array""" 35 | a = zarr_dataset[zarr_dataset > threshold].mean() 36 | wait(a, small_client, 300) 37 | 38 | 39 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 40 | @pytest.mark.parametrize("N", [700, 75, 1]) 41 | def test_access_slices(small_client, zarr_dataset, N): 42 | """Accessing just a few chunks of a zarr array should be quick""" 43 | a = zarr_dataset[:N, :N, :N] 44 | wait(a, small_client, 300) 45 | 46 | 47 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 48 | def test_sum_residuals(small_client, zarr_dataset): 49 | """Compute reduce, then map, then reduce again""" 50 | a = (zarr_dataset - zarr_dataset.mean(axis=0)).sum() 51 | wait(a, small_client, 300) 52 | 53 | 54 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 55 | def test_select_scalar(small_client, cmip6): 56 | ds = cmip6.isel({"lat": 20, "lon": 40, "plev": 5, "time": 1234}).compute() 57 | assert ds.zg.shape == () 58 | assert ds.zg.size == 1 59 | -------------------------------------------------------------------------------- /AB_environments/config.yaml: -------------------------------------------------------------------------------- 1 | # Number of times to run each test suite. 2 | # Lower values are faster and cheaper but will result in higher variance. 3 | # Setting it to 5 is a good value to get statistically significant results. 4 | # This must remain set to 0 in the main branch, thus completely disabling 5 | # A/B tests, in order to avoid unnecessary runs. 6 | repeat: 0 7 | 8 | # Set to true to automatically create a verbatim copy of AB_baseline and then compare 9 | # the two in the A/B tests. Set to false to save some money if you are already confident 10 | # that the 'repeat' setting is high enough. 11 | test_null_hypothesis: true 12 | 13 | # Test directories, test files, or individual tests to run. 
14 | targets: 15 | # - tests 16 | - tests/benchmarks 17 | # - tests/runtime 18 | # - tests/stability 19 | # - tests/tpch/test_dask.py 20 | # - tests/benchmarks/test_futures.py 21 | # - tests/benchmarks/test_array.py::test_basic_sum 22 | 23 | # pytest markers or marker expressions. See setup.cfg for available ones. 24 | # Leave blank to run all marked and unmarked tests. 25 | markers: not tpch_nondask 26 | # markers: shuffle_p2p 27 | # markers: shuffle_p2p or shuffle_tasks 28 | # markers: not shuffle_tasks 29 | 30 | # Enable specific H2O datasets 31 | h2o_datasets: 32 | # - 0.5 GB (csv) 33 | # - 5 GB (csv) 34 | # - 50 GB (csv) 35 | # - 0.5 GB (parquet) 36 | # - 5 GB (parquet) 37 | # - 50 GB (parquet) 38 | - 5 GB (parquet+pyarrow) 39 | # - 50 GB (parquet+pyarrow) 40 | # - 500 GB (parquet+pyarrow) 41 | 42 | # AWS implements limiters to how many EC2 instances you can spawn in parallel on the 43 | # same AWS account. If such limit is reached, tests will randomly fail when trying to 44 | # create the Coiled clusters, and restarting failed jobs won't fix the problem. 45 | # Additionally, there are problems with Coiled itself triggered by limitations that are 46 | # never actually reached with real paying users. 47 | max_parallel: 48 | # Number of parallel A/B test jobs per branch. 49 | ci_jobs: 5 50 | # Number of parallel test_*.py modules per A/B test job. 51 | # Each module typically spawns one Coiled cluster at a time. 52 | # Set to 1 to disable pytest-xdist. 53 | pytest_workers_per_job: 4 54 | -------------------------------------------------------------------------------- /tests/test_utils_test.py: -------------------------------------------------------------------------------- 1 | import dask 2 | import numpy as np 3 | import pytest 4 | from dask.sizeof import sizeof 5 | from dask.utils import parse_bytes 6 | 7 | from .utils_test import ( 8 | scaled_array_shape, 9 | scaled_array_shape_quadratic, 10 | timeseries_of_size, 11 | ) 12 | 13 | 14 | def test_scaled_array_shape(): 15 | assert scaled_array_shape(1024, (2, "x"), dtype=bool) == (2, 512) 16 | assert scaled_array_shape(1024, (2, "x"), dtype=float) == (2, 64) 17 | assert scaled_array_shape(1024, (2, "x"), dtype=np.float64) == (2, 64) 18 | assert scaled_array_shape(1024, (2, "x")) == (2, 64) 19 | 20 | assert scaled_array_shape(16, ("x", "x"), dtype=bool) == (4, 4) 21 | assert scaled_array_shape(256, ("4x", "x"), dtype=bool) == (32, 8) 22 | assert scaled_array_shape(64, ("x", "x", "x"), dtype=float) == (2, 2, 2) 23 | 24 | assert scaled_array_shape("10kb", ("x", "1kb"), dtype=bool) == (10, 1000) 25 | 26 | 27 | def test_scaled_array_shape_quadratic(): 28 | assert scaled_array_shape("1GB", ("x",)) == (125000000,) 29 | assert scaled_array_shape_quadratic("1GB", "1GB", ("x",)) == (125000000,) 30 | assert scaled_array_shape_quadratic("16GB", "1GB", ("x",)) == (500000000,) 31 | assert scaled_array_shape_quadratic("64MB", "1GB", ("x",)) == (31622776,) 32 | 33 | 34 | def sizeof_df(df): 35 | # Measure the size of each partition separately (each one has overhead of being a separate DataFrame) 36 | # TODO more efficient method than `df.partitions`? Use `dask.get` directly? 
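    # A possible answer to the TODO above (untested sketch, not used here): pull
    # the partitions straight out of the graph with the synchronous scheduler,
    #     parts = dask.get(df.__dask_graph__(), df.__dask_keys__())
    # instead of building one lazy collection per partition via df.partitions.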
37 | parts = dask.compute( 38 | [df.partitions[i] for i in range(df.npartitions)], scheduler="threads" 39 | ) 40 | return sum(map(sizeof, parts)) 41 | 42 | 43 | def test_timeseries_of_size(): 44 | small_parts = timeseries_of_size( 45 | "1mb", freq="1s", partition_freq="100s", dtypes={"x": float} 46 | ) 47 | big_parts = timeseries_of_size( 48 | "1mb", freq="1s", partition_freq="100s", dtypes={i: float for i in range(10)} 49 | ) 50 | assert sizeof_df(small_parts) == pytest.approx(parse_bytes("1mb"), rel=0.1) 51 | assert sizeof_df(big_parts) == pytest.approx(parse_bytes("1mb"), rel=0.1) 52 | assert big_parts.npartitions < small_parts.npartitions 53 | -------------------------------------------------------------------------------- /tests/benchmarks/test_futures.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from dask.distributed import as_completed, wait 4 | from distributed.utils_test import inc, slowdec, slowinc 5 | 6 | from ..utils_test import run_up_to_nthreads 7 | 8 | 9 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 10 | def test_single_future(small_client): 11 | """How quickly can we run a simple computation? 12 | Repeat the test a few times to get a more sensible 13 | cumulative measure. 14 | """ 15 | for i in range(100): 16 | small_client.submit(inc, i).result() 17 | 18 | 19 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 20 | @pytest.mark.parametrize("rootish", ["rootish", "non-rootish"]) 21 | def test_large_map(small_client, rootish): 22 | """What's the overhead of map these days?""" 23 | if rootish == "rootish": 24 | futures = small_client.map(inc, range(100_000)) 25 | else: 26 | 27 | def inc_with_deps(i, deps): 28 | return i + 1 29 | 30 | deps = small_client.map(inc, range(5)) 31 | futures = small_client.map(inc_with_deps, range(100_000), deps=deps) 32 | 33 | wait(futures) 34 | 35 | 36 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 37 | def test_large_map_first_work(small_client): 38 | """ 39 | Large maps are fine, but it's pleasant to see work start immediately. 40 | We have a batch_size keyword that should work here but it's not on by default. 41 | Maybe it should be. 42 | """ 43 | futures = small_client.map(inc, range(100_000)) 44 | for _ in as_completed(futures): 45 | return 46 | 47 | 48 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 49 | def test_memory_efficient(small_client): 50 | """ 51 | We hope that we pipeline xs->ys->zs without keeping all of the xs in memory 52 | to start. This may not actually happen today. 53 | """ 54 | xs = small_client.map(np.random.random, [20_000_000] * 100, pure=False) 55 | ys = small_client.map(slowinc, xs, delay=1) 56 | zs = small_client.map(slowdec, ys, delay=1) 57 | 58 | futures = as_completed(zs) 59 | del xs, ys, zs # Don't keep references to intermediate results 60 | 61 | for _ in futures: # pass through all futures, forget them immediately 62 | continue 63 | -------------------------------------------------------------------------------- /AB_environments/AB_sample.conda.yaml: -------------------------------------------------------------------------------- 1 | # Sample conda environment file for A/B testing. 2 | # Change contents/delete/rename as needed. 
3 | 4 | # Every A/B environment *must* present these three files: 5 | # - AB_.conda.yaml 6 | # - AB_.dask.yaml 7 | # - AB_.cluster.yaml 8 | 9 | # You should always start from a copy-paste from AB_baseline.conda.yaml 10 | 11 | channels: 12 | - conda-forge 13 | dependencies: 14 | - python =3.10 # Single '=' means latest patch version available 15 | - memray ==1.13.4 16 | # Copy-paste from ci/environment.yml 17 | - pip 18 | - coiled >=0.2.54 19 | - numpy ==2.0.2 20 | - pandas ==2.2.3 21 | - dask ==2024.11.2 22 | - distributed ==2024.11.2 23 | - dask-labextension ==7.0.0 24 | - dask-ml ==2024.4.4 25 | - fsspec ==2024.10.0 26 | - s3fs ==2024.10.0 27 | - gcsfs ==2024.10.0 28 | - pyarrow ==18.1.0 29 | - jupyterlab ==4.3.1 30 | - lz4 ==4.3.3 31 | - ipywidgets ==8.1.5 32 | - numba ==0.60.0 33 | - scikit-learn ==1.5.2 34 | - ipycytoscape ==1.3.3 35 | - click ==8.1.7 36 | - xarray ==2024.11.0 37 | - flox ==0.9.15 38 | - zarr ==2.18.3 39 | - cftime ==1.6.4 40 | - msgpack-python 41 | - cloudpickle ==3.1.0 42 | - tornado ==6.4.2 43 | - toolz ==1.0.0 44 | - zict ==3.0.0 45 | - xgboost ==3.0.2 46 | - optuna ==4.1.0 47 | - optuna-integration ==4.1.0 48 | - scipy ==1.14.1 49 | - sqlalchemy ==2.0.36 50 | - pynvml ==11.5.3 51 | - bokeh ==3.6.1 52 | - gilknocker ==0.4.1 53 | - openssl >1.1.0g 54 | - rasterio >=1.4.0 55 | - rioxarray ==0.17.0 56 | - h5netcdf ==1.4.1 57 | - xesmf ==0.8.7 58 | - bottleneck ==1.4.2 59 | - geojson ==3.1.0 60 | - planetary-computer ==1.0.0 61 | - pystac-client ==0.8.5 62 | - odc-stac ==0.3.10 63 | - adlfs ==2024.7.0 64 | # https://github.com/coiled/benchmarks/issues/1616 65 | - cryptography ==43.0.3 66 | - pyopenssl ==24.2.1 67 | # End copy-paste 68 | 69 | - pip: 70 | # Make sure you install dask and distributed either both from pip or both from 71 | # conda. You may alternatively point to your own git fork (but make sure you 72 | # sync'ed tags!) 73 | # Read README.md for troubleshooting. 74 | - git+https://github.com/dask/dask@191d39177009d2cce25b818878118e35329b6db3 75 | - git+https://github.com/dask/distributed@0304fb6e665e36abf9e3086173cccd36e29ae84d 76 | -------------------------------------------------------------------------------- /tests/tpch/README.md: -------------------------------------------------------------------------------- 1 | TPC-H Benchmarks 2 | ================ 3 | 4 | This document will help you run the TPC-H benchmarks in this directory. 5 | 6 | Setup 7 | ----- 8 | 9 | Clone this repository 10 | 11 | ``` 12 | git clone git@github.com:coiled/benchmarks 13 | cd benchmarks 14 | ``` 15 | 16 | Follow the environment creation steps in the root directory. Namely the 17 | following: 18 | 19 | ``` 20 | mamba env create -n tpch -f ci/environment.yml 21 | conda activate tpch 22 | mamba env update -f ci/environment-git-tip.yml 23 | mamba env update -f ci/environment-test.yml 24 | mamba install grpcio grpcio-status protobuf -y # if you want Spark 25 | ``` 26 | 27 | Run Dask Benchmarks 28 | ------------------- 29 | 30 | ``` 31 | pytest --benchmark tests/tpch/test_dask.py 32 | ``` 33 | 34 | Configure 35 | --------- 36 | 37 | By default we run Scale 100 (about 100 GB) on the cloud with Coiled. You can 38 | configure this by changing the values for `_local` and `_scale` in the 39 | `conftest.py` file in this directory (they're at the top). 40 | 41 | Local Data Generation 42 | --------------------- 43 | 44 | If you want to run locally, you'll need to generate data. Run the following 45 | from the **root directory** of this repository. 
46 | 47 | ``` 48 | python tests/tpch/generate_data.py --scale 10 49 | ``` 50 | 51 | Run Many Tests 52 | -------------- 53 | 54 | When running on the cloud you can run many tests simultaneously. We recommend 55 | using pytest-xdist for this with the keywords: 56 | 57 | - `-n 4` run four parallel jobs 58 | - `--dist loadscope` split apart by module 59 | 60 | ``` 61 | py.test --benchmark -n 4 --dist loadscope tests/tpch 62 | ``` 63 | 64 | Generate Plots 65 | -------------- 66 | 67 | Timing outputs are dropped into `benchmark.db` in the root of this repository. 68 | You can generate charts analyzing results using either the notebook 69 | `visualize.ipynb` in this directory (recommended) or the `generate-plot.py` 70 | script in this directory. These require `ibis` and `altair` (not installed 71 | above). 72 | 73 | These are both meant to be run from the root directory of this repository. 74 | 75 | These pull out the most recent records for each query/library pairing. If 76 | you're changing scales and want to ensure clean results, you may want to nuke 77 | your `benchmark.db` file between experiments (it's ok, it'll regenerate 78 | automatically). 79 | -------------------------------------------------------------------------------- /alembic/versions/7d7844fca7cf_initial_table.py: -------------------------------------------------------------------------------- 1 | """Initial table 2 | 3 | Revision ID: 7d7844fca7cf 4 | Revises: 5 | Create Date: 2022-07-21 12:32:36.579599 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '7d7844fca7cf' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('test_run', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('session_id', sa.String(), nullable=False), 24 | sa.Column('name', sa.String(), nullable=False), 25 | sa.Column('originalname', sa.String(), nullable=False), 26 | sa.Column('path', sa.String(), nullable=True), 27 | sa.Column('setup_outcome', sa.String(), nullable=True), 28 | sa.Column('call_outcome', sa.String(), nullable=True), 29 | sa.Column('teardown_outcome', sa.String(), nullable=True), 30 | sa.Column('coiled_runtime_version', sa.String(), nullable=True), 31 | sa.Column('coiled_software_name', sa.String(), nullable=True), 32 | sa.Column('dask_version', sa.String(), nullable=True), 33 | sa.Column('distributed_version', sa.String(), nullable=True), 34 | sa.Column('python_version', sa.String(), nullable=True), 35 | sa.Column('platform', sa.String(), nullable=True), 36 | sa.Column('ci_run_url', sa.String(), nullable=True), 37 | sa.Column('start', sa.DateTime(), nullable=True), 38 | sa.Column('end', sa.DateTime(), nullable=True), 39 | sa.Column('duration', sa.Float(), nullable=True), 40 | sa.Column('average_memory', sa.Float(), nullable=True), 41 | sa.Column('peak_memory', sa.Float(), nullable=True), 42 | sa.Column('compute_time', sa.Float(), nullable=True), 43 | sa.Column('disk_spill_time', sa.Float(), nullable=True), 44 | sa.Column('serializing_time', sa.Float(), nullable=True), 45 | sa.Column('transfer_time', sa.Float(), nullable=True), 46 | sa.Column('performance_report_url', sa.String(), nullable=True), 47 | sa.Column('cluster_dump_url', sa.String(), nullable=True), 48 | sa.PrimaryKeyConstraint('id') 49 | ) 50 | # ### end Alembic commands ### 51 | 52 | 53 | def downgrade() -> None: 54 | # ### commands auto generated by Alembic - please adjust! ### 55 | op.drop_table('test_run') 56 | # ### end Alembic commands ### 57 | -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | import os 2 | from logging.config import fileConfig 3 | 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from alembic import context 7 | from benchmark_schema import Base 8 | 9 | # this is the Alembic Config object, which provides 10 | # access to the values within the .ini file in use. 11 | config = context.config 12 | 13 | # Interpret the config file for Python logging. 14 | # This line sets up loggers basically. 15 | if config.config_file_name is not None: 16 | fileConfig(config.config_file_name) 17 | 18 | # add your model's MetaData object here 19 | # for 'autogenerate' support 20 | target_metadata = Base.metadata 21 | 22 | # other values from the config, defined by the needs of env.py, 23 | # can be acquired: 24 | # my_important_option = config.get_main_option("my_important_option") 25 | # ... etc. 26 | 27 | # Set the database name from the DB_NAME environment variable 28 | ENGINE_URL = f"sqlite:///{os.environ.get('DB_NAME', 'benchmark.db')}" 29 | config.set_main_option("sqlalchemy.url", ENGINE_URL) 30 | 31 | 32 | def run_migrations_offline() -> None: 33 | """Run migrations in 'offline' mode. 34 | 35 | This configures the context with just a URL 36 | and not an Engine, though an Engine is acceptable 37 | here as well. By skipping the Engine creation 38 | we don't even need a DBAPI to be available. 39 | 40 | Calls to context.execute() here emit the given string to the 41 | script output. 
42 | 43 | """ 44 | url = config.get_main_option("sqlalchemy.url") 45 | context.configure( 46 | url=url, 47 | target_metadata=target_metadata, 48 | literal_binds=True, 49 | dialect_opts={"paramstyle": "named"}, 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online() -> None: 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 61 | 62 | """ 63 | connectable = engine_from_config( 64 | config.get_section(config.config_ini_section), 65 | prefix="sqlalchemy.", 66 | poolclass=pool.NullPool, 67 | ) 68 | 69 | with connectable.connect() as connection: 70 | context.configure(connection=connection, target_metadata=target_metadata) 71 | 72 | with context.begin_transaction(): 73 | context.run_migrations() 74 | 75 | 76 | if context.is_offline_mode(): 77 | run_migrations_offline() 78 | else: 79 | run_migrations_online() 80 | -------------------------------------------------------------------------------- /tests/benchmarks/test_spill.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import pytest 4 | from coiled import Cluster 5 | from dask.distributed import Client, wait 6 | from toolz import merge 7 | 8 | from ..conftest import dump_cluster_kwargs 9 | from ..utils_test import ( 10 | cluster_memory, 11 | print_size_info, 12 | scaled_array_shape, 13 | scaled_array_shape_quadratic, 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def spill_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 19 | kwargs = dict( 20 | name=f"spill-{uuid.uuid4().hex[:8]}", 21 | environ=merge( 22 | dask_env_variables, 23 | { 24 | # Ensure that tasks are not retried on ungraceful worker termination 25 | # caused by out-of-memory issues 26 | "DASK_DISTRIBUTED__SCHEDULER__ALLOWED_FAILURES": "0", 27 | }, 28 | ), 29 | tags=github_cluster_tags, 30 | **cluster_kwargs["spill_cluster"], 31 | ) 32 | dump_cluster_kwargs(kwargs, "spill") 33 | with Cluster(**kwargs) as cluster: 34 | yield cluster 35 | 36 | 37 | @pytest.fixture 38 | def spill_client(spill_cluster, cluster_kwargs, benchmark_all, wait_for_workers): 39 | n_workers = cluster_kwargs["spill_cluster"]["n_workers"] 40 | with Client(spill_cluster) as client: 41 | spill_cluster.scale(n_workers) 42 | wait_for_workers(client, n_workers, timeout=600) 43 | client.restart() 44 | with benchmark_all(client): 45 | yield client 46 | 47 | 48 | @pytest.mark.parametrize( 49 | "keep_around", [pytest.param(False, id="release"), pytest.param(True, id="keep")] 50 | ) 51 | def test_spilling(spill_client, new_array, keep_around): 52 | memory = cluster_memory(spill_client) # 36 GiB 53 | shape = scaled_array_shape(memory * 1.79, ("x", "x")) # 64 GiB 54 | a = new_array(shape) 55 | print_size_info(memory, memory * 1.79, a) 56 | 57 | a = a.persist() 58 | wait(a) 59 | b = a.sum().persist() 60 | if not keep_around: 61 | del a 62 | assert b.compute() 63 | 64 | 65 | def test_dot_product_spill(spill_client, new_array): 66 | """See also test_array.py::test_dot_product 67 | for a variant that doesn't hit the spill threshold 68 | """ 69 | memory = cluster_memory(spill_client) # 38.33 GiB 70 | shape = scaled_array_shape_quadratic(memory * 0.3, "11.5 GiB", ("x", "x")) 71 | a = new_array(shape) 72 | print_size_info(memory, memory * 0.3, a) 73 | b = (a @ a.T).sum() 74 | assert b.compute() 75 | --------------------------------------------------------------------------------
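# Illustrative sketch (assumptions: helper semantics as exercised in
# tests/test_utils_test.py, and a made-up 36 GiB cluster). It spells out the
# sizing arithmetic the spill tests above rely on: the target array is ~1.79x
# total cluster memory, so the computation is guaranteed to spill to disk.

from dask.utils import format_bytes, parse_bytes

memory = parse_bytes("36 GiB")  # assumed total cluster memory
target = int(memory * 1.79)     # deliberately larger than RAM

# scaled_array_shape solves the free dimension "x" so that a float64 array of
# shape ("x", "x") occupies roughly `target` bytes.
side = int((target / 8) ** 0.5)
print(f"~{side:,} x {side:,} float64 array, {format_bytes(side * side * 8)} total")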
/tests/tpch/plotting.py: -------------------------------------------------------------------------------- 1 | import altair as alt 2 | import pandas as pd 3 | 4 | LIBRARY_COLORS = { 5 | "dask": "#5677a4", 6 | "duckdb": "#e68b39", 7 | "polars": "#d4605b", 8 | "pyspark": "green", 9 | } 10 | 11 | 12 | def from_db(path): 13 | df = pd.read_sql_table(table_name="test_run", con=f"sqlite:///{path}") 14 | 15 | df = df[ 16 | (df.call_outcome == "passed") 17 | & (df.path.str.contains("^tpch/test_(?:dask|duckdb|polars|pyspark)")) 18 | & df.cluster_name 19 | ] 20 | df = df[["path", "name", "duration", "start", "cluster_name"]] 21 | df["library"] = df.path.map(lambda path: path.split("_")[-1].split(".")[0]) 22 | df["query"] = df.name.map(lambda name: int(name.split("_")[-1])) 23 | df["name"] = df.cluster_name.map(lambda name: name.split("-", 3)[-1]) 24 | df["scale"] = df.cluster_name.map(lambda name: int(name.split("-")[2])) 25 | del df["path"] 26 | del df["cluster_name"] 27 | return df 28 | 29 | 30 | def latest(df, n=1): 31 | df = df.sort_values(["query", "library"]) 32 | 33 | def recent(df): 34 | return df.sort_values("start").tail(n) 35 | 36 | df = df.groupby(["library", "query"]).apply(recent).reset_index(drop=True) 37 | del df["start"] 38 | return df 39 | 40 | 41 | def normalize(df): 42 | dask_durations = df[df["library"] == "dask"].set_index("query")["duration"] 43 | data = df.groupby("query").apply( 44 | lambda group: group.assign( 45 | relative_duration=group["duration"] / dask_durations[group.name] 46 | ) 47 | ) 48 | return data.reset_index(drop=True) 49 | 50 | 51 | def subplot(df, column, libraries): 52 | return ( 53 | alt.Chart(df) 54 | .mark_bar() 55 | .encode( 56 | x="query:N", 57 | y=f"{column}:Q", 58 | xOffset="library:N", 59 | color=alt.Color("library").scale( 60 | domain=libraries, 61 | range=[LIBRARY_COLORS[lib] for lib in libraries], 62 | ), 63 | tooltip=["library", column], 64 | ) 65 | ) 66 | 67 | 68 | def plot(df, libraries=None, column="duration"): 69 | if libraries is None: 70 | libraries = ["dask", "duckdb", "polars", "pyspark"] 71 | plot = subplot(df[df["query"] < 12], column=column, libraries=libraries) & subplot( 72 | df[df["query"] >= 12], column=column, libraries=libraries 73 | ) 74 | return plot.properties( 75 | title=f"TPC-H -- scale:{df.scale.iloc[0]} name:{df.name.iloc[0]}" 76 | ).configure_title( 77 | fontSize=20, 78 | ) 79 | -------------------------------------------------------------------------------- /ci/scripts/discover_ab_environments.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import glob 4 | import json 5 | import os.path 6 | from typing import TypedDict 7 | 8 | import yaml 9 | 10 | 11 | class JSONOutput(TypedDict): 12 | run_AB: bool 13 | repeat: list[int] 14 | runtime: list[str] 15 | max_parallel: int 16 | pytest_args: list[str] 17 | h2o_datasets: list[str] 18 | 19 | 20 | DO_NOT_RUN: JSONOutput = { 21 | "run_AB": False, 22 | "repeat": [], 23 | "runtime": [], 24 | "max_parallel": 1, 25 | "pytest_args": [], 26 | "h2o_datasets": [], 27 | } 28 | 29 | 30 | def build_json() -> JSONOutput: 31 | with open("AB_environments/config.yaml") as fh: 32 | cfg = yaml.safe_load(fh) 33 | 34 | if not isinstance(cfg.get("repeat"), int) or cfg["repeat"] < 0: 35 | raise ValueError("AB_environments/config.yaml: missing key {repeat: N}") 36 | for target in cfg["targets"]: 37 | target = target.split("::")[0] 38 | if not os.path.exists(target): 39 | raise FileNotFoundError(target) 40 | 41 | if not 
cfg["repeat"] or not cfg["targets"]: 42 | return DO_NOT_RUN 43 | 44 | runtimes = [] 45 | for conda_fname in sorted(glob.glob("AB_environments/AB_*.conda.yaml")): 46 | env_name = os.path.basename(conda_fname)[: -len(".conda.yaml")] 47 | dask_fname = f"AB_environments/{env_name}.dask.yaml" 48 | # Raise FileNotFoundError if missing 49 | open(dask_fname).close() 50 | runtimes.append(env_name) 51 | 52 | if not runtimes: 53 | return DO_NOT_RUN 54 | 55 | if "AB_baseline" not in runtimes: 56 | # If any A/B environments are defined, AB_baseline is required 57 | raise FileNotFoundError("AB_environments/AB_baseline.conda.yaml") 58 | 59 | if cfg["test_null_hypothesis"]: 60 | runtimes += ["AB_null_hypothesis"] 61 | 62 | pytest_args = [] 63 | if (n := cfg["max_parallel"]["pytest_workers_per_job"]) > 1: 64 | pytest_args.append(f"-n {n} --dist loadscope") 65 | if cfg["markers"]: 66 | pytest_args.append(f"-m '{cfg['markers']}'") 67 | for target in cfg["targets"]: 68 | pytest_args.append(f"'{target}'") 69 | 70 | return { 71 | "run_AB": True, 72 | "repeat": list(range(1, cfg["repeat"] + 1)), 73 | "runtime": runtimes, 74 | "max_parallel": cfg["max_parallel"]["ci_jobs"], 75 | "pytest_args": [" ".join(pytest_args)], 76 | "h2o_datasets": [",".join(cfg["h2o_datasets"])], 77 | } 78 | 79 | 80 | def main() -> None: 81 | print(json.dumps(build_json())) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /alembic/versions/00d5844fd364_add_tpch_run_table.py: -------------------------------------------------------------------------------- 1 | """Add tpch run table 2 | 3 | Revision ID: 00d5844fd364 4 | Revises: 25053f75e09f 5 | Create Date: 2024-04-09 13:41:39.795757 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '00d5844fd364' 14 | down_revision = '25053f75e09f' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade() -> None: 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('tpch_run', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('session_id', sa.String(), nullable=False), 24 | sa.Column('name', sa.String(), nullable=False), 25 | sa.Column('originalname', sa.String(), nullable=False), 26 | sa.Column('path', sa.String(), nullable=True), 27 | sa.Column('setup_outcome', sa.String(), nullable=True), 28 | sa.Column('call_outcome', sa.String(), nullable=True), 29 | sa.Column('teardown_outcome', sa.String(), nullable=True), 30 | sa.Column('dask_version', sa.String(), nullable=True), 31 | sa.Column('dask_expr_version', sa.String(), nullable=True), 32 | sa.Column('distributed_version', sa.String(), nullable=True), 33 | sa.Column('duckdb_version', sa.String(), nullable=True), 34 | sa.Column('pyspark_version', sa.String(), nullable=True), 35 | sa.Column('polars_version', sa.String(), nullable=True), 36 | sa.Column('python_version', sa.String(), nullable=True), 37 | sa.Column('platform', sa.String(), nullable=True), 38 | sa.Column('ci_run_url', sa.String(), nullable=True), 39 | sa.Column('start', sa.DateTime(), nullable=True), 40 | sa.Column('end', sa.DateTime(), nullable=True), 41 | sa.Column('duration', sa.Float(), nullable=True), 42 | sa.Column('average_memory', sa.Float(), nullable=True), 43 | sa.Column('peak_memory', sa.Float(), nullable=True), 44 | sa.Column('cluster_name', sa.String(), nullable=True), 45 | sa.Column('cluster_id', sa.Integer(), nullable=True), 46 | sa.Column('cluster_details_url', sa.String(), nullable=True), 47 | sa.Column('scale', sa.Integer(), nullable=False), 48 | sa.Column('query', sa.Integer(), nullable=False), 49 | sa.Column('local', sa.Boolean(), nullable=False), 50 | sa.Column('compression', sa.String(), nullable=True), 51 | sa.Column('partition_size', sa.String(), nullable=True), 52 | sa.PrimaryKeyConstraint('id') 53 | ) 54 | # ### end Alembic commands ### 55 | 56 | 57 | def downgrade() -> None: 58 | # ### commands auto generated by Alembic - please adjust! ### 59 | op.drop_table('tpch_run') 60 | # ### end Alembic commands ### 61 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/cloud_optimize.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import xarray as xr 4 | from s3fs import S3FileSystem 5 | 6 | 7 | def cloud_optimize( 8 | scale: Literal["small", "medium", "large"], fs: S3FileSystem, storage_url: str 9 | ): 10 | models = [ 11 | "ACCESS-CM2", 12 | "ACCESS-ESM1-5", 13 | "CMCC-ESM2", 14 | "CNRM-CM6-1", 15 | "CNRM-ESM2-1", 16 | "CanESM5", 17 | "EC-Earth3", 18 | "EC-Earth3-Veg-LR", 19 | "FGOALS-g3", 20 | "GFDL-ESM4", 21 | "GISS-E2-1-G", 22 | "INM-CM4-8", 23 | "INM-CM5-0", 24 | "KACE-1-0-G", 25 | "MIROC-ES2L", 26 | "MPI-ESM1-2-HR", 27 | "MPI-ESM1-2-LR", 28 | "MRI-ESM2-0", 29 | "NorESM2-LM", 30 | "NorESM2-MM", 31 | "TaiESM1", 32 | "UKESM1-0-LL", 33 | ] 34 | variables = [ 35 | "hurs", 36 | "huss", 37 | "pr", 38 | "rlds", 39 | "rsds", 40 | "sfcWind", 41 | "tas", 42 | "tasmax", 43 | "tasmin", 44 | ] 45 | 46 | if scale == "small": 47 | # 130 files (152.83 GiB). One model and one variable. 48 | models = models[:1] 49 | variables = variables[:1] 50 | elif scale == "medium": 51 | # 390 files. Two models and two variables. 52 | # Currently fails after hitting 20 minute idle timeout 53 | # sending large graph to the scheduler. 54 | models = models[:2] 55 | variables = variables[:2] 56 | else: 57 | # 11635 files. All models and variables. 
58 | pass 59 | 60 | # Get netCDF data files -- see https://registry.opendata.aws/nex-gddp-cmip6 61 | # for dataset details. 62 | files = [] 63 | for model in models: 64 | for variable in variables: 65 | data_dir = f"s3://nex-gddp-cmip6/NEX-GDDP-CMIP6/{model}/historical/r1i1p1f1/{variable}/*.nc" 66 | files += [f"s3://{path}" for path in fs.glob(data_dir)] 67 | print(f"Processing {len(files)} NetCDF files") 68 | 69 | # Load input NetCDF data files 70 | # TODO: Reduce explicit settings once https://github.com/pydata/xarray/issues/8778 is completed. 71 | ds = xr.open_mfdataset( 72 | files, 73 | engine="h5netcdf", 74 | combine="nested", 75 | concat_dim="time", 76 | data_vars="minimal", 77 | coords="minimal", 78 | compat="override", 79 | parallel=True, 80 | ) 81 | 82 | # Rechunk from "pancake" to "pencil" format 83 | ds = ds.chunk({"time": -1, "lon": "auto", "lat": "auto"}) 84 | 85 | # Write out to a Zar dataset 86 | return ds.to_zarr(storage_url, compute=False) 87 | -------------------------------------------------------------------------------- /tests/benchmarks/test_dataframe.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dask.sizeof import sizeof 3 | from dask.utils import format_bytes 4 | 5 | from ..utils_test import cluster_memory, timeseries_of_size, wait 6 | 7 | 8 | def print_dataframe_info(df): 9 | p = df.partitions[0].compute(scheduler="threads") 10 | partition_size = sizeof(p) 11 | total_size = partition_size * df.npartitions 12 | print( 13 | f"~{len(p) * df.npartitions:,} rows x {len(df.columns)} columns, " 14 | f"{format_bytes(total_size)} total, " 15 | f"{df.npartitions:,} {format_bytes(partition_size)} partitions" 16 | ) 17 | 18 | 19 | def test_dataframe_align(small_client): 20 | memory = cluster_memory(small_client) # 76.66 GiB 21 | 22 | df = timeseries_of_size( 23 | memory // 2, 24 | start="2020-01-01", 25 | freq="600ms", 26 | partition_freq="12h", 27 | dtypes={i: float for i in range(100)}, 28 | ) 29 | print_dataframe_info(df) 30 | # ~50,904,000 rows x 100 columns, 38.31 GiB total, 707 55.48 MiB partitions 31 | 32 | df2 = timeseries_of_size( 33 | memory // 4, 34 | start="2010-01-01", 35 | freq="600ms", 36 | partition_freq="12h", 37 | dtypes={i: float for i in range(100)}, 38 | ) 39 | print_dataframe_info(df2) 40 | # ~25,488,000 rows x 100 columns, 19.18 GiB total, 354 55.48 MiB partitions 41 | 42 | final = (df2 - df).mean() # will be all NaN, just forcing alignment 43 | wait(final, small_client, 10 * 60) 44 | 45 | 46 | @pytest.mark.xfail(reason="https://github.com/coiled/benchmarks/pull/1116") 47 | def test_filter(small_client): 48 | """How fast can we filter a DataFrame?""" 49 | memory = cluster_memory(small_client) 50 | df = timeseries_of_size(memory) 51 | name = df.head(1).name.iloc[0] # Get first name that appears 52 | result = df[df.name == name] 53 | wait(result, small_client, 10 * 60) 54 | 55 | 56 | def test_dataframe_cow_chain(small_client): 57 | memory = cluster_memory(small_client) # 76.66 GiB 58 | 59 | df = timeseries_of_size( 60 | memory // 2, 61 | start="2020-01-01", 62 | freq="600ms", 63 | partition_freq="12h", 64 | dtypes={ 65 | **{i: float for i in range(40)}, 66 | **{i: int for i in range(41, 80)}, 67 | **{i: object for i in range(81, 120)}, 68 | }, 69 | ) 70 | print_dataframe_info(df) 71 | 72 | result = ( 73 | df.rename(columns={1: 1000}) 74 | .replace("x", "xxx") 75 | .fillna({i: 100 for i in range(10, 70)}) 76 | .astype({50: "float"}) 77 | .loc[:, slice(2, 100)] 78 | ) 79 | wait(result, 
small_client, 10 * 60) 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # macOS 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # PyCharm project settings 124 | .idea 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | # Project-specific files 138 | cluster_kwargs.*.pickle 139 | cluster_kwargs.*.yaml 140 | benchmark.db 141 | static/ 142 | mamba_env_export.yml 143 | tpch-data/ 144 | 145 | # .visualize() output of dask collections 146 | mydask.html 147 | -------------------------------------------------------------------------------- /tests/workflows/test_embarrassingly_parallel.py: -------------------------------------------------------------------------------- 1 | import io 2 | import tarfile 3 | 4 | import pandas as pd 5 | import pytest 6 | from dask.distributed import wait 7 | 8 | pytestmark = pytest.mark.workflows 9 | 10 | 11 | @pytest.mark.client("embarrassingly_parallel") 12 | def test_embarassingly_parallel(client, s3_factory): 13 | # How popular is matplotlib? 
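    # --- Editor's aside (illustrative, not part of the original test) ---
    # The idea: each arXiv PDF tarball is scanned for the literal bytes
    # b"matplotlib", and the per-month fraction of matching PDFs is used as a
    # rough popularity signal over time.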
14 | s3 = s3_factory(requester_pays=True) 15 | directories = s3.ls("s3://arxiv/pdf") 16 | 17 | # We only analyze files from 1991-2022 here in order to have a consistent data volume. 18 | # This is benchmarking purposes only, as this dataset is updated monthly. 19 | years = list(range(91, 100)) + list(range(23)) 20 | directories = [ 21 | d 22 | for d in directories 23 | if d.endswith(".tar") and int(d.split("_")[2][:2]) in years 24 | ] 25 | 26 | def extract(filename: str, fs): 27 | """Extract and process one directory of arXiv data 28 | 29 | Returns 30 | ------- 31 | filename: str 32 | contains_matplotlib: boolean 33 | """ 34 | out = [] 35 | with fs.open(filename) as f: 36 | bytes_ = f.read() 37 | with io.BytesIO() as bio: 38 | bio.write(bytes_) 39 | bio.seek(0) 40 | with tarfile.TarFile(fileobj=bio) as tf: 41 | for member in tf.getmembers(): 42 | if member.isfile() and member.name.endswith(".pdf"): 43 | data = tf.extractfile(member).read() 44 | out.append((member.name, b"matplotlib" in data.lower())) 45 | return out 46 | 47 | futures = client.map(extract, directories, fs=s3) 48 | wait(futures) 49 | # We had one error in one file. Let's just ignore and move on. 50 | good = [future for future in futures if future.status == "finished"] 51 | data = client.gather(good) 52 | 53 | # Convert to Pandas 54 | dfs = [pd.DataFrame(d, columns=["filename", "has_matplotlib"]) for d in data] 55 | df = pd.concat(dfs) 56 | 57 | def filename_to_date(filename): 58 | year = int(filename.split("/")[0][:2]) 59 | month = int(filename.split("/")[0][2:4]) 60 | if year > 80: 61 | year = 1900 + year 62 | else: 63 | year = 2000 + year 64 | 65 | return pd.Timestamp(year=year, month=month, day=1) 66 | 67 | df["date"] = df.filename.map(filename_to_date) 68 | result = df.groupby("date").has_matplotlib.mean() 69 | # Some light validation to ensure results are consistent. 70 | # This is only for benchmarking. 
71 | assert result.idxmin() == pd.Timestamp("1991-07-01") # Earliest timestamp 72 | assert result.idxmax() == pd.Timestamp("2022-10-01") # Row with maximum value 73 | assert result.ne(0).idxmax() == pd.Timestamp("2005-06-01") # First non-zero row 74 | -------------------------------------------------------------------------------- /.github/workflows/geospatial.yml: -------------------------------------------------------------------------------- 1 | name: Geospatial Benchmarks 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | scale: 6 | description: 'Scale' 7 | required: true 8 | default: 'small' 9 | type: choice 10 | options: 11 | - 'small' 12 | - 'medium' 13 | - 'large' 14 | 15 | defaults: 16 | # Required shell entrypoint to have properly activated conda environments 17 | run: 18 | shell: bash -l {0} 19 | 20 | jobs: 21 | geospatial: 22 | name: Geospatial 23 | runs-on: ubuntu-latest 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Set up environment 30 | uses: conda-incubator/setup-miniconda@v3 31 | with: 32 | miniforge-version: latest 33 | use-mamba: true 34 | condarc-file: ci/condarc 35 | python-version: "3.10" 36 | environment-file: ci/environment.yml 37 | conda-remove-defaults: "true" 38 | 39 | - name: Add geospatial dependencies 40 | run: mamba env update --file ci/environment-geospatial.yml 41 | 42 | - name: Upgrade dask to git tip 43 | run: mamba env update --file ci/environment-git-tip.yml 44 | 45 | - name: Add test dependencies 46 | run: mamba env update --file ci/environment-test.yml 47 | 48 | - name: Dump environment 49 | run: | 50 | # For debugging 51 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 52 | mamba env export | grep -E -v '^prefix:.*$' 53 | 54 | - name: Google auth 55 | uses: "google-github-actions/auth@v2" 56 | with: 57 | credentials_json: "${{ secrets.GCP_CREDENTIALS }}" 58 | 59 | - name: Run geospatial benchmarks 60 | env: 61 | DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} 62 | AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} 63 | AWS_DEFAULT_REGION: us-east-2 # this is needed for boto for some reason 64 | AWS_SECRET_ACCESS_KEY: ${{ secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} 65 | AZURE_STORAGE_ACCOUNT_NAME: ${{ secrets.AZURE_STORAGE_ACCOUNT_NAME}} 66 | AZURE_STORAGE_SAS_TOKEN: ${{ secrets.AZURE_STORAGE_SAS_TOKEN}} 67 | COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} 68 | DB_NAME: geospatial_${{ inputs.scale }}.db 69 | MEMRAY_PROFILE: "none" 70 | run: | 71 | pytest --benchmark \ 72 | tests/geospatial -m geo_execution \ 73 | -n 4 --dist loadscope \ 74 | --scale ${{ inputs.scale }} \ 75 | --memray ${{ env.MEMRAY_PROFILE }} \ 76 | 77 | - name: Upload benchmark results 78 | uses: actions/upload-artifact@v4 79 | if: always() 80 | with: 81 | name: geospatial-benchmark 82 | path: | 83 | geospatial_${{ inputs.scale }}.db 84 | mamba_env_export.yml 85 | -------------------------------------------------------------------------------- /cluster_kwargs.yaml: -------------------------------------------------------------------------------- 1 | # Static kwargs passed to coiled.Cluster 2 | # In A/B tests, these can be overridden by AB_environments/AB_.cluster.yaml 3 | 4 | # The override priority is as follows (bottom wins): 5 | # 1. default parameters of coiled.Cluster 6 | # 2. default section of this file 7 | # 3. default section of AB_environments/AB_.cluster.yaml 8 | # 4. specific sections of this file 9 | # 5. 
specific sections of AB_environments/AB_.cluster.yaml 10 | 11 | # The keys 'name', 'environ', and 'tags' must not be used. 12 | 13 | # Settings for all clusters, unless overridden below 14 | default: 15 | package_sync: true 16 | wait_for_workers: true 17 | scheduler_vm_types: [m6i.large] 18 | spot_policy: spot_with_fallback 19 | 20 | # For all tests using the small_client fixture 21 | small_cluster: 22 | n_workers: 10 23 | worker_vm_types: [m6i.large] # 2CPU, 8GiB 24 | 25 | # For tests/benchmarks/test_parquet.py 26 | parquet_cluster: 27 | n_workers: 15 28 | worker_vm_types: [m5.xlarge] # 4 CPU, 16 GiB 29 | 30 | # For tests/benchmarks/test_spill.py 31 | spill_cluster: 32 | n_workers: 5 33 | worker_disk_size: 64 34 | worker_vm_types: [m6i.large] # 2CPU, 8GiB 35 | 36 | # For tests/benchmarks/test_xarray.py 37 | group_reduction_cluster: 38 | n_workers: 20 39 | worker_vm_types: [m6i.xlarge] # 4CPU, 16GiB 40 | region: "us-east-1" # Same region as dataset 41 | 42 | # For tests/workflows/test_embarrassingly_parallel.py 43 | embarrassingly_parallel: 44 | n_workers: 100 45 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 46 | region: "us-east-1" # Same region as dataset 47 | 48 | # For tests/workflows/test_xgboost_optuna.py 49 | xgboost_optuna: 50 | n_workers: 50 51 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 52 | 53 | # For tests/workflows/test_uber_lyft.py 54 | uber_lyft: 55 | n_workers: 20 56 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 57 | 58 | uber_lyft_large: 59 | n_workers: 50 60 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 61 | 62 | # For tests/workflows/test_pytorch_optuna.py 63 | pytorch_optuna: 64 | n_workers: 10 65 | worker_vm_types: [g4dn.xlarge] # 1 GPU, 4 CPU, 16 GiB 66 | worker_options: 67 | # Making workers single-threaded to avoid GPU contention. See discussion in 68 | # https://github.com/coiled/benchmarks/pull/787#discussion_r1177004248 for 69 | # more details.
70 | nthreads: 1 71 | 72 | # For tests/workflows/test_snowflake.py 73 | snowflake: 74 | n_workers: 20 75 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 76 | 77 | 78 | # Specific tests 79 | test_work_stealing_on_scaling_up: 80 | n_workers: 1 81 | worker_vm_types: [t3.medium] 82 | 83 | test_work_stealing_on_straggling_worker: 84 | n_workers: 10 85 | worker_vm_types: [t3.medium] 86 | 87 | test_repeated_merge_spill: 88 | n_workers: 20 89 | worker_vm_types: [m6i.large] 90 | 91 | # For tests/workflows/test_from_csv_to_parquet.py 92 | from_csv_to_parquet: 93 | n_workers: 10 94 | worker_vm_types: [m6i.xlarge] # 4 CPU, 16 GiB (preferred default instance) 95 | region: "us-east-1" # Same region as dataset 96 | -------------------------------------------------------------------------------- /tests/tpch/generate_answers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | 4 | import botocore.session 5 | import click 6 | import coiled 7 | import duckdb 8 | import duckdb_queries 9 | import pyarrow as pa 10 | import pyarrow.parquet as pq 11 | from utils import ( 12 | get_answers_path, 13 | get_bucket_region, 14 | get_dataset_path, 15 | get_single_vm_spec, 16 | ) 17 | 18 | 19 | def generate(scale: int, path: str, local: bool) -> None: 20 | dataset_path = get_dataset_path(local, scale) 21 | use_coiled = False 22 | 23 | if path.startswith("s3"): 24 | use_coiled = True 25 | global REGION 26 | REGION = get_bucket_region(path) 27 | else: 28 | path = pathlib.Path(path) 29 | path.mkdir(parents=True, exist_ok=True) 30 | 31 | def connection(): 32 | con = duckdb.connect() 33 | 34 | if not local: # Setup s3 credentials 35 | session = botocore.session.Session() 36 | creds = session.get_credentials() 37 | con.install_extension("httpfs") 38 | con.load_extension("httpfs") 39 | con.sql( 40 | f""" 41 | SET s3_region='us-east-2'; 42 | SET s3_access_key_id='{creds.access_key}'; 43 | SET s3_secret_access_key='{creds.secret_key}'; 44 | SET s3_session_token='{creds.token}'; 45 | """ 46 | ) 47 | return con 48 | 49 | def generate_answer(query): 50 | table = getattr(duckdb_queries, f"query_{query}")( 51 | connection(), dataset_path, scale 52 | ) 53 | relaxed_schema = table.schema 54 | for i, field in enumerate(table.schema): 55 | if pa.types.is_decimal(field.type): 56 | relaxed_schema = relaxed_schema.set(i, field.with_type(pa.float64())) 57 | elif pa.types.is_date(field.type): 58 | relaxed_schema = relaxed_schema.set( 59 | i, field.with_type(pa.timestamp("ms")) 60 | ) 61 | table = table.cast(relaxed_schema) 62 | pq.write_table(table, os.path.join(str(path), f"answer_{query}.parquet")) 63 | 64 | if use_coiled: 65 | generate_answer = coiled.function( 66 | name=f"tpch-generate-answers-{scale}", **get_single_vm_spec(scale) 67 | )(generate_answer) 68 | for query in range(1, 23): 69 | generate_answer(query) 70 | 71 | print("Finished exporting all answers!") 72 | 73 | 74 | @click.command() 75 | @click.option( 76 | "--scale", default=10, help="Scale factor to use, roughly equal to number of GB" 77 | ) 78 | @click.option( 79 | "--path", 80 | help="Local or S3 base path, will affix '/answers' subdirectory to this path", 81 | ) 82 | @click.option( 83 | "--local", 84 | is_flag=True, 85 | default=False, 86 | help="Whether to generate the answers locally", 87 | ) 88 | def main( 89 | scale: int, 90 | path: str | None, 91 | local: bool, 92 | ): 93 | if path is None: 94 | path = get_answers_path(local, scale) 95 | generate(scale, path, local) 96 | 
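# --- Editor's aside (hypothetical usage sketch, not part of the original file) ---
# Assuming the module is run directly, the click options above translate to
# something like:
#
#   python generate_answers.py --scale 10 --path ./tpch-data --local
#
# With an S3 --path (and no --local), each answer is instead generated on a
# single Coiled VM sized by get_single_vm_spec(scale).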
97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /tests/tpch/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import boto3 4 | 5 | 6 | def get_dataset_path(local, scale): 7 | remote_paths = { 8 | 1: "s3://coiled-runtime-ci/tpc-h/snappy/scale-1/", 9 | 10: "s3://coiled-runtime-ci/tpc-h/snappy/scale-10/", 10 | 100: "s3://coiled-runtime-ci/tpc-h/snappy/scale-100/", 11 | 1000: "s3://coiled-runtime-ci/tpc-h/snappy/scale-1000/", 12 | 10000: "s3://coiled-runtime-ci/tpc-h/snappy/scale-10000/", 13 | } 14 | local_paths = { 15 | 1: "./tpch-data/scale-1/", 16 | 10: "./tpch-data/scale-10/", 17 | 100: "./tpch-data/scale-100/", 18 | } 19 | 20 | if local: 21 | return local_paths[scale] 22 | else: 23 | return remote_paths[scale] 24 | 25 | 26 | def get_answers_path(local, scale): 27 | if local: 28 | return f"./tpch-data/answers/scale-{scale}/" 29 | return f"s3://coiled-runtime-ci/tpc-h/answers/scale-{scale}/" 30 | 31 | 32 | def get_bucket_region(path: str): 33 | if not path.startswith("s3://"): 34 | raise ValueError(f"'{path}' is not an S3 path") 35 | bucket = path.replace("s3://", "").split("/")[0] 36 | resp = boto3.client("s3").get_bucket_location(Bucket=bucket) 37 | # Buckets in region 'us-east-1' results in None, b/c why not. 38 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/get_bucket_location.html#S3.Client.get_bucket_location 39 | return resp["LocationConstraint"] or "us-east-1" 40 | 41 | 42 | def get_cluster_spec(scale: int, shutdown_on_close: bool) -> dict[str, Any]: 43 | everywhere = dict( 44 | idle_timeout="1h", 45 | wait_for_workers=True, 46 | scheduler_vm_types=["m6i.2xlarge"], 47 | shutdown_on_close=shutdown_on_close, 48 | ) 49 | 50 | if scale == 1: 51 | return { 52 | "worker_vm_types": ["m6i.large"], 53 | "n_workers": 4, 54 | **everywhere, 55 | } 56 | if scale == 10: 57 | return { 58 | "worker_vm_types": ["m6i.large"], 59 | "n_workers": 8, 60 | **everywhere, 61 | } 62 | elif scale == 100: 63 | return { 64 | "worker_vm_types": ["m6i.large"], 65 | "n_workers": 16, 66 | **everywhere, 67 | } 68 | elif scale == 1000: 69 | return { 70 | "worker_vm_types": ["m6i.xlarge"], 71 | "n_workers": 32, 72 | "worker_disk_size": 128, 73 | **everywhere, 74 | } 75 | elif scale == 10000: 76 | return { 77 | "worker_vm_types": ["m6i.xlarge"], 78 | "n_workers": 32 * 10, 79 | "worker_disk_size": 100, 80 | **everywhere, 81 | } 82 | 83 | 84 | def get_single_vm_spec(scale): 85 | if scale == 1: 86 | return { 87 | "vm_type": "m6i.2xlarge", 88 | } 89 | if scale == 10: 90 | return { 91 | "vm_type": "m6i.4xlarge", 92 | } 93 | elif scale == 100: 94 | return { 95 | "vm_type": "m6i.8xlarge", 96 | } 97 | elif scale == 1000: 98 | return { 99 | "vm_type": "m6i.32xlarge", 100 | } 101 | elif scale == 10000: 102 | return { 103 | "vm_type": "m6i.32xlarge", 104 | "disk_size": 1000, 105 | } 106 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/satellite_filtering.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Literal 3 | 4 | import fsspec 5 | import geojson 6 | import odc.stac 7 | import planetary_computer 8 | import pystac_client 9 | import xarray as xr 10 | 11 | 12 | def harmonize_to_old(data: xr.Dataset) -> xr.Dataset: 13 | """ 14 | Harmonize new Sentinel-2 data to the old baseline. 
15 | 16 | Parameters 17 | ---------- 18 | data: 19 | A Dataset with various bands as data variables and three dimensions: time, y, x 20 | 21 | Returns 22 | ------- 23 | harmonized: xarray.Dataset 24 | A Dataset with all values harmonized to the old 25 | processing baseline. 26 | """ 27 | cutoff = datetime.datetime(2022, 1, 25) 28 | offset = 1000 29 | bands = [ 30 | "B01", 31 | "B02", 32 | "B03", 33 | "B04", 34 | "B05", 35 | "B06", 36 | "B07", 37 | "B08", 38 | "B8A", 39 | "B09", 40 | "B10", 41 | "B11", 42 | "B12", 43 | ] 44 | 45 | to_process = list(set(bands) & set(list(data.data_vars))) 46 | old = data.sel(time=slice(cutoff))[to_process] 47 | 48 | new = data.sel(time=slice(cutoff, None)).drop_vars(to_process) 49 | 50 | new_harmonized = data.sel(time=slice(cutoff, None))[to_process].clip(offset) 51 | new_harmonized -= offset 52 | 53 | new = xr.merge([new, new_harmonized]) 54 | return xr.concat([old, new], dim="time") 55 | 56 | 57 | def satellite_filtering( 58 | scale: Literal["small", "medium", "large"], 59 | storage_url: str, 60 | ): 61 | catalog = pystac_client.Client.open( 62 | "https://planetarycomputer.microsoft.com/api/stac/v1", 63 | modifier=planetary_computer.sign_inplace, 64 | ) 65 | 66 | # GeoJSON for region of interest is from https://github.com/isellsoap/deutschlandGeoJSON/tree/main/1_deutschland 67 | with fsspec.open( 68 | "https://raw.githubusercontent.com/isellsoap/deutschlandGeoJSON/main/1_deutschland/3_mittel.geo.json" 69 | ) as f: 70 | gj = geojson.load(f) 71 | 72 | # Flatten MultiPolygon to single Polygon 73 | coordinates = [] 74 | for x in gj.features[0]["geometry"]["coordinates"]: 75 | coordinates.extend(x) 76 | area_of_interest = { 77 | "type": "Polygon", 78 | "coordinates": coordinates, 79 | } 80 | 81 | # Get stack items 82 | if scale == "small": 83 | time_of_interest = "2024-01-01/2024-09-01" 84 | else: 85 | time_of_interest = "2015-01-01/2024-09-01" 86 | 87 | search = catalog.search( 88 | collections=["sentinel-2-l2a"], 89 | intersects=area_of_interest, 90 | datetime=time_of_interest, 91 | ) 92 | items = search.item_collection() 93 | 94 | # Construct Xarray Dataset from stack items 95 | ds = odc.stac.load( 96 | items, 97 | chunks={}, 98 | patch_url=planetary_computer.sign, 99 | resolution=40, 100 | crs="EPSG:3857", 101 | groupby="solar_day", 102 | ) 103 | # See https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a#Baseline-Change 104 | ds = harmonize_to_old(ds) 105 | 106 | # Compute humidity index 107 | humidity = (ds.B08 - ds.B11) / (ds.B08 + ds.B11) 108 | 109 | result = humidity.groupby("time.month").mean() 110 | return result.to_zarr(storage_url, compute=False) 111 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 
16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # the output encoding used when revision files 55 | # are written from script.py.mako 56 | # output_encoding = utf-8 57 | 58 | sqlalchemy.url = sqlite:///benchmark.db 59 | 60 | 61 | [post_write_hooks] 62 | # post_write_hooks defines scripts or Python functions that are run 63 | # on newly generated revision scripts. See the documentation for further 64 | # detail and examples 65 | 66 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 67 | # hooks = black 68 | # black.type = console_scripts 69 | # black.entrypoint = black 70 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 71 | 72 | # Logging configuration 73 | [loggers] 74 | keys = root,sqlalchemy,alembic 75 | 76 | [handlers] 77 | keys = console 78 | 79 | [formatters] 80 | keys = generic 81 | 82 | [logger_root] 83 | level = WARN 84 | handlers = console 85 | qualname = 86 | 87 | [logger_sqlalchemy] 88 | level = WARN 89 | handlers = 90 | qualname = sqlalchemy.engine 91 | 92 | [logger_alembic] 93 | level = INFO 94 | handlers = 95 | qualname = alembic 96 | 97 | [handler_console] 98 | class = StreamHandler 99 | args = (sys.stderr,) 100 | level = NOTSET 101 | formatter = generic 102 | 103 | [formatter_generic] 104 | format = %(levelname)-5.5s [%(name)s] %(message)s 105 | datefmt = %H:%M:%S 106 | -------------------------------------------------------------------------------- /tests/tpch/visualize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "0cdaec38-4a9e-4a25-b45e-1188903d219d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Plot TPC-H results\n", 9 | "\n", 10 | "This currently assumes that benchmarks have been run and have populated benchmark.db. 
It also assumes that that database has only those results and from only one run (this is usually a bad assumption).\n", 11 | "\n", 12 | "```\n", 13 | "rm benchmark.db\n", 14 | "pytest --benchmark tests/tpch\n", 15 | "```" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "c7ec3d43-3a70-4666-9552-04d82ac42a31", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "\n", 27 | "df = pd.read_sql_table(table_name=\"test_run\", con=\"sqlite:///../../benchmark.db\")\n", 28 | "\n", 29 | "df = df[\n", 30 | " (df.call_outcome == \"passed\")\n", 31 | " & (df.path.str.contains(\"^tpch/test_(?:dask|duckdb|polars|pyspark)\"))\n", 32 | " & df.cluster_name\n", 33 | "]\n", 34 | "df = df[[\"path\", \"name\", \"duration\", \"start\", \"cluster_name\"]]\n", 35 | "\n", 36 | "df[\"library\"] = df.path.map(lambda path: path.split(\"_\")[-1].split(\".\")[0])\n", 37 | "df[\"query\"] = df.name.map(lambda name: int(name.split(\"_\")[-1]))\n", 38 | "df[\"name\"] = df.cluster_name.map(lambda name: name.split(\"-\", 3)[-1])\n", 39 | "df[\"scale\"] = df.cluster_name.map(lambda name: int(name.split(\"-\")[2]))\n", 40 | "del df[\"path\"]\n", 41 | "del df[\"cluster_name\"]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "31fbdc8a-c782-4000-9e23-5488b4d04d14", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "92f926a6-cbeb-4765-b5c1-3e8ea7c71ff6", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = df.sort_values([\"query\", \"library\"])\n", 62 | "\n", 63 | "def recent(df):\n", 64 | " return df.sort_values(\"start\").iloc[-1]\n", 65 | "\n", 66 | "df = df.groupby([\"library\", \"query\"]).apply(recent).reset_index(drop=True)\n", 67 | "del df[\"start\"]\n", 68 | "df.head(10)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "34830787-2364-4541-8cf8-8adffbde9148", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "import altair as alt\n", 79 | "\n", 80 | "chart = alt.Chart(df).mark_bar().encode(\n", 81 | " x=\"query:N\",\n", 82 | " y=\"duration:Q\",\n", 83 | " xOffset=\"library:N\",\n", 84 | " color=alt.Color('library').scale(\n", 85 | " domain=[\"dask\", \"duckdb\", \"polars\", \"pyspark\"], \n", 86 | " range=[\"#5677a4\", \"#e68b39\", \"#d4605b\", \"green\"],\n", 87 | " ),\n", 88 | " tooltip=[\"library\", \"duration\"]\n", 89 | ").properties(\n", 90 | " title=f\"TPC-H -- scale:{df.scale.iloc[0]} name:{df.name.iloc[0]}\"\n", 91 | ").configure_title(\n", 92 | " fontSize=20,\n", 93 | "\n", 94 | ")\n", 95 | "chart" 96 | ] 97 | } 98 | ], 99 | "metadata": { 100 | "kernelspec": { 101 | "display_name": "Python 3 (ipykernel)", 102 | "language": "python", 103 | "name": "python3" 104 | }, 105 | "language_info": { 106 | "codemirror_mode": { 107 | "name": "ipython", 108 | "version": 3 109 | }, 110 | "file_extension": ".py", 111 | "mimetype": "text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.11.7" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 5 120 | } 121 | -------------------------------------------------------------------------------- /tests/benchmarks/test_join.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pytest 3 | 4 | from ..utils_test import cluster_memory, 
run_up_to_nthreads, timeseries_of_size, wait 5 | 6 | 7 | @pytest.mark.shuffle_p2p 8 | @run_up_to_nthreads("small_cluster", 40, reason="Does not finish") 9 | def test_join_big(small_client, memory_multiplier): 10 | memory = cluster_memory(small_client) # 76.66 GiB 11 | 12 | df1_big = timeseries_of_size( 13 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 14 | ) # 66.58 MiB partitions 15 | df1_big["predicate"] = df1_big["0"] * 1e9 16 | df1_big = df1_big.astype({"predicate": "int"}) 17 | 18 | df2_big = timeseries_of_size( 19 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 20 | ) # 66.58 MiB partitions 21 | 22 | # Control cardinality on column to join - this produces cardinality ~ to len(df) 23 | df2_big["predicate"] = df2_big["0"] * 1e9 24 | df2_big = df2_big.astype({"predicate": "int"}) 25 | 26 | join = df1_big.merge(df2_big, on="predicate", how="inner") 27 | # dask.dataframe will drop all columns except the Index for size 28 | # computations, which will optimize itself through merges, e.g. 29 | # shuffling a lot less data than what we want to test 30 | # map_partitions blocks those optimizations 31 | join = join.map_partitions(lambda x: x) 32 | result = join.size 33 | wait(result, small_client, 20 * 60) 34 | 35 | 36 | def test_join_big_small(small_client, memory_multiplier, configure_shuffling): 37 | if memory_multiplier == 0.1: 38 | raise pytest.skip(reason="Too noisy; not adding anything to multiplier=1") 39 | 40 | memory = cluster_memory(small_client) # 76.66 GiB 41 | 42 | df_big = timeseries_of_size( 43 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 44 | ) # 66.58 MiB partitions 45 | 46 | # Control cardinality on column to join - this produces cardinality ~ to len(df) 47 | df_big["predicate"] = df_big["0"] * 1e9 48 | df_big = df_big.astype({"predicate": "int"}) 49 | 50 | df_small = timeseries_of_size( 51 | "100 MB", dtypes={str(i): float for i in range(100)} 52 | ) # make it obviously small 53 | 54 | df_small["predicate"] = df_small["0"] * 1e9 55 | df_small_pd = df_small.astype({"predicate": "int"}).compute() 56 | 57 | join = df_big.merge(df_small_pd, on="predicate", how="inner") 58 | # dask.dataframe will drop all columns except the Index for size 59 | # computations, which will optimize itself through merges, e.g. 60 | # shuffling a lot less data than what we want to test 61 | # map_partitions blocks those optimizations 62 | join = join.map_partitions(lambda x: x) 63 | result = join.size 64 | wait(result, small_client, 20 * 60) 65 | 66 | 67 | @pytest.mark.shuffle_p2p 68 | @pytest.mark.parametrize("persist", [True, False]) 69 | def test_set_index(small_client, persist, memory_multiplier): 70 | memory = cluster_memory(small_client) # 76.66 GiB 71 | 72 | df_big = timeseries_of_size( 73 | memory * memory_multiplier, dtypes={str(i): float for i in range(100)} 74 | ) # 66.58 MiB partitions 75 | df_big["predicate"] = df_big["0"] * 1e9 76 | df_big = df_big.astype({"predicate": "int"}) 77 | if persist: 78 | df_big = df_big.persist() 79 | df_indexed = df_big.set_index("0") 80 | # dask.dataframe will drop all columns except the Index for size 81 | # computations, which will optimize itself through set_index, e.g. 
82 | # shuffling a lot less data than what we want to test 83 | # map_partitions blocks those optimizations 84 | df_indexed = df_indexed.map_partitions(lambda x: x) 85 | wait(df_indexed.size, small_client, 20 * 60) 86 | 87 | 88 | @pytest.mark.client("uber_lyft_large") 89 | def test_set_index_on_uber_lyft(client, configure_shuffling): 90 | df = dd.read_parquet( 91 | "s3://coiled-datasets/uber-lyft-tlc/", storage_options={"anon": True} 92 | ) 93 | result = df.set_index("PULocationID") 94 | wait(result, client, 20 * 60) 95 | -------------------------------------------------------------------------------- /.github/workflows/tpch.yml: -------------------------------------------------------------------------------- 1 | name: TPC-H Benchmarks 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | scale: 6 | description: 'Scale Factor' 7 | required: true 8 | default: 10000 9 | type: choice 10 | options: 11 | - 10000 12 | - 1000 13 | - 100 14 | - 10 15 | - 1 16 | dask: 17 | description: 'Dask' 18 | required: true 19 | default: true 20 | type: boolean 21 | duckdb: 22 | description: 'DuckDB' 23 | required: true 24 | default: true 25 | type: boolean 26 | polars: 27 | description: 'Polars' 28 | required: true 29 | default: false 30 | type: boolean 31 | pyspark: 32 | description: 'PySpark' 33 | required: true 34 | default: true 35 | type: boolean 36 | 37 | defaults: 38 | # Required shell entrypoint to have properly activated conda environments 39 | run: 40 | shell: bash -l {0} 41 | 42 | jobs: 43 | tpch: 44 | name: TPC-H 45 | runs-on: ubuntu-latest 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v4 50 | 51 | - name: Set up environment 52 | uses: conda-incubator/setup-miniconda@v3 53 | with: 54 | miniforge-version: latest 55 | use-mamba: true 56 | condarc-file: ci/condarc 57 | python-version: "3.10" 58 | environment-file: ci/environment.yml 59 | conda-remove-defaults: "true" 60 | 61 | - name: Add TPC-H non-dask dependencies 62 | run: mamba env update --file ci/environment-tpch-nondask.yml 63 | 64 | - name: Upgrade dask to git tip 65 | run: mamba env update --file ci/environment-git-tip.yml 66 | 67 | - name: Add test dependencies 68 | run: mamba env update --file ci/environment-test.yml 69 | 70 | - name: Dump environment 71 | run: | 72 | # For debugging 73 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 74 | mamba env export | grep -E -v '^prefix:.*$' 75 | 76 | - name: Add Dask to benchmark if enabled 77 | if: ${{ inputs.dask }} 78 | run: | 79 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_dask.py" >> $GITHUB_ENV 80 | 81 | - name: Add DuckDB to benchmark if enabled 82 | if: ${{ inputs.duckdb }} 83 | run: | 84 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_duckdb.py" >> $GITHUB_ENV 85 | 86 | - name: Add Polars to benchmark if enabled 87 | if: ${{ inputs.polars }} 88 | run: | 89 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_polars.py" >> $GITHUB_ENV 90 | 91 | - name: Add PySpark to benchmark if enabled 92 | if: ${{ inputs.pyspark }} 93 | run: | 94 | echo PYTEST_BENCHMARKS="${{ env.PYTEST_BENCHMARKS }} tests/tpch/test_pyspark.py" >> $GITHUB_ENV 95 | 96 | - name: Run TPC-H benchmarks (except polars) 97 | env: 98 | DASK_COILED__TOKEN: ${{ secrets.COILED_BENCHMARK_BOT_TOKEN }} 99 | AWS_ACCESS_KEY_ID: ${{ secrets.RUNTIME_CI_BOT_AWS_ACCESS_KEY_ID }} 100 | AWS_DEFAULT_REGION: us-east-2 # this is needed for boto for some reason 101 | AWS_SECRET_ACCESS_KEY: ${{ 
secrets.RUNTIME_CI_BOT_AWS_SECRET_ACCESS_KEY }} 102 | COILED_RUNTIME_VERSION: ${{ matrix.runtime-version }} 103 | DB_NAME: tpch_${{ inputs.scale }}.db 104 | DASK_DATAFRAME__QUERY_PLANNING: True 105 | run: | 106 | pytest --benchmark \ 107 | ${{ env.PYTEST_BENCHMARKS }} \ 108 | -n 4 --dist loadscope \ 109 | --scale ${{ inputs.scale }} \ 110 | 111 | - name: Upload benchmark results 112 | uses: actions/upload-artifact@v4 113 | if: always() 114 | with: 115 | name: tpch-benchmark 116 | path: | 117 | tpch_${{ inputs.scale }}.db 118 | mamba_env_export.yml 119 | -------------------------------------------------------------------------------- /tests/workflows/test_snowflake.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | 4 | import dask.dataframe as dd 5 | import pandas as pd 6 | import pytest 7 | 8 | pytestmark = pytest.mark.workflows 9 | 10 | pytest.skip( 11 | reason="https://github.com/coiled/benchmarks/issues/1341", allow_module_level=True 12 | ) 13 | 14 | pytest.importorskip("dask_snowflake") 15 | pytest.importorskip("sqlalchemy") 16 | 17 | from dask_snowflake import read_snowflake, to_snowflake # noqa: E402 18 | from snowflake.sqlalchemy import URL # noqa: E402 19 | from sqlalchemy import create_engine # noqa: E402 20 | 21 | 22 | @pytest.fixture(scope="module") 23 | def connection_kwargs(): 24 | return { 25 | "user": os.environ["SNOWFLAKE_USER"], 26 | "password": os.environ["SNOWFLAKE_PASSWORD"], 27 | "account": os.environ["SNOWFLAKE_ACCOUNT"], 28 | "warehouse": os.environ["SNOWFLAKE_WAREHOUSE"], 29 | "role": os.environ.get("SNOWFLAKE_ROLE", "public"), 30 | "database": os.environ.get("SNOWFLAKE_DATABASE") or "testdb", 31 | "schema": os.environ.get("SNOWFLAKE_SCHEMA") or "testschema", 32 | } 33 | 34 | 35 | @pytest.fixture 36 | def table(connection_kwargs): 37 | """Connect to snowflake and create table""" 38 | name = f"citibike_tripdata_{uuid.uuid4().hex}" 39 | engine = create_engine(URL(**connection_kwargs)) 40 | engine.execute(f"DROP TABLE IF EXISTS {name}") 41 | engine.execute( 42 | f"""create table if not exists {name} ( 43 | ride_id varchar not null unique, 44 | rideable_type varchar not null, 45 | started_at timestamp not null, 46 | ended_at timestamp not null, 47 | start_station_name varchar not null, 48 | start_station_id smallint not null, 49 | end_station_name varchar not null, 50 | end_station_id smallint not null, 51 | start_lat number, 52 | start_lng number, 53 | end_lat number, 54 | end_lng number, 55 | is_member boolean not null 56 | )""" 57 | ) 58 | yield name 59 | # after the data is written, delete table 60 | engine.execute(f"DROP TABLE IF EXISTS {name}") 61 | 62 | 63 | @pytest.mark.client("snowflake") 64 | def test_etl_into_snowflake(client, connection_kwargs, table): 65 | csv_paths = [ 66 | f"s3://tripdata/{ts.year}{ts.month:02}-*-*.csv.zip" 67 | for ts in pd.date_range(start="2022-01-01", end="2023-03-01", freq="MS") 68 | ] 69 | 70 | # preprocess data 71 | def safe_int(x): 72 | """Some station IDs are not correct integers""" 73 | try: 74 | return int(float(x)) 75 | except Exception: 76 | # if station ID is not an int, return -1 77 | return -1 78 | 79 | ddf = dd.read_csv( 80 | csv_paths, 81 | compression="zip", 82 | blocksize=None, 83 | converters={"start_station_id": safe_int, "end_station_id": safe_int}, 84 | storage_options={"anon": True}, 85 | ) 86 | 87 | # filter out incorrect station IDs 88 | ddf = ddf[(ddf.start_station_id != -1) & (ddf.end_station_id != -1)].reset_index( 89 | drop=True 90 | ) 91 | 
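    # --- Editor's aside (illustrative, not part of the original workflow) ---
    # safe_int() above tolerates float-formatted and malformed station IDs, e.g.
    #   safe_int("6432.08") -> 6432
    #   safe_int("SYS016")  -> -1   (such rows are dropped by the filter above)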
92 | # create boolean is_member and drop member_casual 93 | ddf["is_member"] = ddf.member_casual == "member" 94 | 95 | ddf = ddf.drop(columns="member_casual") 96 | 97 | # repartition to ensure even chunks 98 | ddf = ddf.repartition(partition_size="100Mb") 99 | 100 | # save data to Snowflake 101 | to_snowflake(ddf, name=table, connection_kwargs=connection_kwargs) 102 | 103 | 104 | @pytest.mark.client("snowflake") 105 | def test_read(client, connection_kwargs): 106 | """Read and explore NYC bike dataset from Snowflake""" 107 | table = "citibike_tripdata" # persistent table 108 | 109 | df = read_snowflake( 110 | f"SELECT * FROM {table}", 111 | connection_kwargs=connection_kwargs, 112 | partition_size="100MiB", 113 | ) 114 | df["IS_MEMBER"].mean().compute() 115 | -------------------------------------------------------------------------------- /tests/tpch/test_dask.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from tests.tpch.utils import get_dataset_path 6 | 7 | pytestmark = pytest.mark.tpch_dask 8 | 9 | dd = pytest.importorskip("dask.dataframe") 10 | 11 | 12 | from . import dask_queries # noqa: E402 13 | 14 | 15 | @pytest.fixture(scope="session") 16 | def dataset_path(local, scale): 17 | if local: 18 | # FIXME: pyarrow local fs is a bit odd. dask.dataframe should deal with this 19 | return "file://" + os.path.abspath(get_dataset_path(local, scale)) + "/" 20 | else: 21 | return get_dataset_path(local, scale) 22 | 23 | 24 | @pytest.mark.shuffle_p2p 25 | def test_query_01(client, dataset_path, fs, scale): 26 | dask_queries.query_01(dataset_path, fs, scale).compute() 27 | 28 | 29 | @pytest.mark.shuffle_p2p 30 | def test_query_02(client, dataset_path, fs, scale): 31 | dask_queries.query_02(dataset_path, fs, scale).compute() 32 | 33 | 34 | @pytest.mark.shuffle_p2p 35 | def test_query_03(client, dataset_path, fs, scale): 36 | dask_queries.query_03(dataset_path, fs, scale).compute() 37 | 38 | 39 | @pytest.mark.shuffle_p2p 40 | def test_query_04(client, dataset_path, fs, scale): 41 | dask_queries.query_04(dataset_path, fs, scale).compute() 42 | 43 | 44 | @pytest.mark.shuffle_p2p 45 | def test_query_05(client, dataset_path, fs, scale): 46 | dask_queries.query_05(dataset_path, fs, scale).compute() 47 | 48 | 49 | def test_query_06(client, dataset_path, fs, scale): 50 | dask_queries.query_06(dataset_path, fs, scale).compute() 51 | 52 | 53 | @pytest.mark.shuffle_p2p 54 | def test_query_07(client, dataset_path, fs, scale): 55 | dask_queries.query_07(dataset_path, fs, scale).compute() 56 | 57 | 58 | @pytest.mark.shuffle_p2p 59 | def test_query_08(client, dataset_path, fs, scale): 60 | dask_queries.query_08(dataset_path, fs, scale).compute() 61 | 62 | 63 | @pytest.mark.shuffle_p2p 64 | def test_query_09(client, dataset_path, fs, scale): 65 | dask_queries.query_09(dataset_path, fs, scale).compute() 66 | 67 | 68 | @pytest.mark.shuffle_p2p 69 | def test_query_10(client, dataset_path, fs, scale): 70 | dask_queries.query_10(dataset_path, fs, scale).compute() 71 | 72 | 73 | @pytest.mark.shuffle_p2p 74 | def test_query_11(client, dataset_path, fs, scale): 75 | dask_queries.query_11(dataset_path, fs, scale).compute() 76 | 77 | 78 | @pytest.mark.shuffle_p2p 79 | def test_query_12(client, dataset_path, fs, scale): 80 | dask_queries.query_12(dataset_path, fs, scale).compute() 81 | 82 | 83 | @pytest.mark.shuffle_p2p 84 | def test_query_13(client, dataset_path, fs, scale): 85 | dask_queries.query_13(dataset_path, fs, scale).compute() 86 | 
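# --- Editor's aside (sketch only, not part of the original file) ---
# The 22 per-query tests in this module all follow the same pattern; a single
# parametrized test would be roughly equivalent (modulo per-query marks such as
# shuffle_p2p, which test_query_06 omits), e.g.:
#
#   @pytest.mark.parametrize("query", range(1, 23))
#   def test_query(query, client, dataset_path, fs, scale):
#       getattr(dask_queries, f"query_{query:02d}")(dataset_path, fs, scale).compute()
#
# The explicit functions are presumably kept for readable, stable test names.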
87 | 88 | @pytest.mark.shuffle_p2p 89 | def test_query_14(client, dataset_path, fs, scale): 90 | dask_queries.query_14(dataset_path, fs, scale).compute() 91 | 92 | 93 | @pytest.mark.shuffle_p2p 94 | def test_query_15(client, dataset_path, fs, scale): 95 | dask_queries.query_15(dataset_path, fs, scale).compute() 96 | 97 | 98 | @pytest.mark.shuffle_p2p 99 | def test_query_16(client, dataset_path, fs, scale): 100 | dask_queries.query_16(dataset_path, fs, scale).compute() 101 | 102 | 103 | @pytest.mark.shuffle_p2p 104 | def test_query_17(client, dataset_path, fs, scale): 105 | dask_queries.query_17(dataset_path, fs, scale).compute() 106 | 107 | 108 | @pytest.mark.shuffle_p2p 109 | def test_query_18(client, dataset_path, fs, scale): 110 | dask_queries.query_18(dataset_path, fs, scale).compute() 111 | 112 | 113 | @pytest.mark.shuffle_p2p 114 | def test_query_19(client, dataset_path, fs, scale): 115 | dask_queries.query_19(dataset_path, fs, scale).compute() 116 | 117 | 118 | @pytest.mark.shuffle_p2p 119 | def test_query_20(client, dataset_path, fs, scale): 120 | dask_queries.query_20(dataset_path, fs, scale).compute() 121 | 122 | 123 | @pytest.mark.shuffle_p2p 124 | def test_query_21(client, dataset_path, fs, scale): 125 | dask_queries.query_21(dataset_path, fs, scale).compute() 126 | 127 | 128 | @pytest.mark.shuffle_p2p 129 | def test_query_22(client, dataset_path, fs, scale): 130 | dask_queries.query_22(dataset_path, fs, scale).compute() 131 | -------------------------------------------------------------------------------- /tests/workflows/test_from_csv_to_parquet.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import dask.dataframe as dd 4 | import pytest 5 | 6 | pytestmark = pytest.mark.workflows 7 | 8 | 9 | SCHEMA = OrderedDict( 10 | [ 11 | ("GlobalEventID", "Int64"), 12 | ("Day", "Int64"), 13 | ("MonthYear", "Int64"), 14 | ("Year", "Int64"), 15 | ("FractionDate", "float64"), 16 | ("Actor1Code", "string[pyarrow]"), 17 | ("Actor1Name", "string[pyarrow]"), 18 | ("Actor1CountryCode", "string[pyarrow]"), 19 | ("Actor1KnownGroupCode", "string[pyarrow]"), 20 | ("Actor1EthnicCode", "string[pyarrow]"), 21 | ("Actor1Religion1Code", "string[pyarrow]"), 22 | ("Actor1Religion2Code", "string[pyarrow]"), 23 | ("Actor1Type1Code", "string[pyarrow]"), 24 | ("Actor1Type2Code", "string[pyarrow]"), 25 | ("Actor1Type3Code", "string[pyarrow]"), 26 | ("Actor2Code", "string[pyarrow]"), 27 | ("Actor2Name", "string[pyarrow]"), 28 | ("Actor2CountryCode", "string[pyarrow]"), 29 | ("Actor2KnownGroupCode", "string[pyarrow]"), 30 | ("Actor2EthnicCode", "string[pyarrow]"), 31 | ("Actor2Religion1Code", "string[pyarrow]"), 32 | ("Actor2Religion2Code", "string[pyarrow]"), 33 | ("Actor2Type1Code", "string[pyarrow]"), 34 | ("Actor2Type2Code", "string[pyarrow]"), 35 | ("Actor2Type3Code", "string[pyarrow]"), 36 | ("IsRootEvent", "Int64"), 37 | ("EventCode", "string[pyarrow]"), 38 | ("EventBaseCode", "string[pyarrow]"), 39 | ("EventRootCode", "string[pyarrow]"), 40 | ("QuadClass", "Int64"), 41 | ("GoldsteinScale", "float64"), 42 | ("NumMentions", "Int64"), 43 | ("NumSources", "Int64"), 44 | ("NumArticles", "Int64"), 45 | ("AvgTone", "float64"), 46 | ("Actor1Geo_Type", "Int64"), 47 | ("Actor1Geo_Fullname", "string[pyarrow]"), 48 | ("Actor1Geo_CountryCode", "string[pyarrow]"), 49 | ("Actor1Geo_ADM1Code", "string[pyarrow]"), 50 | ("Actor1Geo_Lat", "float64"), 51 | ("Actor1Geo_Long", "float64"), 52 | ("Actor1Geo_FeatureID", "string[pyarrow]"), 53 | 
("Actor2Geo_Type", "Int64"), 54 | ("Actor2Geo_Fullname", "string[pyarrow]"), 55 | ("Actor2Geo_CountryCode", "string[pyarrow]"), 56 | ("Actor2Geo_ADM1Code", "string[pyarrow]"), 57 | ("Actor2Geo_Lat", "float64"), 58 | ("Actor2Geo_Long", "float64"), 59 | ("Actor2Geo_FeatureID", "string[pyarrow]"), 60 | ("ActionGeo_Type", "Int64"), 61 | ("ActionGeo_Fullname", "string[pyarrow]"), 62 | ("ActionGeo_CountryCode", "string[pyarrow]"), 63 | ("ActionGeo_ADM1Code", "string[pyarrow]"), 64 | ("ActionGeo_Lat", "float64"), 65 | ("ActionGeo_Long", "float64"), 66 | ("ActionGeo_FeatureID", "string[pyarrow]"), 67 | ("DATEADDED", "Int64"), 68 | ("SOURCEURL", "string[pyarrow]"), 69 | ] 70 | ) 71 | 72 | 73 | @pytest.mark.client("from_csv_to_parquet") 74 | def test_from_csv_to_parquet(client, s3_factory, s3_url): 75 | s3 = s3_factory(anon=True) 76 | files = s3.ls("s3://gdelt-open-data/events/")[:1000] 77 | files = [f"s3://{f}" for f in files] 78 | 79 | df = dd.read_csv( 80 | files, 81 | sep="\t", 82 | names=SCHEMA.keys(), 83 | # 'dtype' and 'converters' cannot overlap 84 | dtype={col: dtype for col, dtype in SCHEMA.items() if dtype != "float64"}, 85 | storage_options=s3.storage_options, 86 | on_bad_lines="skip", 87 | # Some bad files have '#' in float values 88 | converters={ 89 | col: lambda v: float(v.replace("#", "") or "NaN") 90 | for col, dtype in SCHEMA.items() 91 | if dtype == "float64" 92 | }, 93 | ) 94 | 95 | # Now we can safely convert the float columns 96 | df = df.astype({col: dtype for col, dtype in SCHEMA.items() if dtype == "float64"}) 97 | 98 | df = df.map_partitions( 99 | lambda xdf: xdf.drop_duplicates(subset=["SOURCEURL"], keep="first") 100 | ) 101 | df["national_paper"] = df.SOURCEURL.str.contains( 102 | "washingtonpost|nytimes", regex=True 103 | ) 104 | df = df[df["national_paper"]] 105 | df.to_parquet(f"{s3_url}/from-csv-to-parquet/", write_index=False) 106 | -------------------------------------------------------------------------------- /tests/benchmarks/test_work_stealing.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import dask.array as da 4 | import distributed 5 | import numpy as np 6 | import pytest 7 | from coiled import Cluster 8 | from dask import delayed, utils 9 | from distributed import Client 10 | from packaging.version import Version 11 | from tornado.ioloop import PeriodicCallback 12 | 13 | from ..utils_test import run_up_to_nthreads 14 | 15 | 16 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 17 | def test_trivial_workload_should_not_cause_work_stealing(small_client): 18 | root = delayed(lambda n: "x" * n)(utils.parse_bytes("1MiB"), dask_key_name="root") 19 | results = [delayed(lambda *args: None)(root, i) for i in range(10000)] 20 | futs = small_client.compute(results) 21 | small_client.gather(futs) 22 | 23 | 24 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 25 | def test_work_stealing_on_inhomogeneous_workload(small_client): 26 | np.random.seed(42) 27 | delays = np.random.lognormal(1, 1.3, 500) 28 | 29 | @delayed 30 | def clog(n): 31 | time.sleep(min(n, 60)) 32 | return n 33 | 34 | results = [clog(i) for i in delays] 35 | futs = small_client.compute(results) 36 | small_client.gather(futs) 37 | 38 | 39 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 40 | @pytest.mark.xfail( 41 | Version(distributed.__version__) < Version("2022.6.1"), 42 | reason="https://github.com/dask/distributed/issues/6624", 43 | ) 44 | def test_work_stealing_on_scaling_up( 45 | 
test_name_uuid, 46 | benchmark_all, 47 | cluster_kwargs, 48 | dask_env_variables, 49 | github_cluster_tags, 50 | ): 51 | with Cluster( 52 | name=test_name_uuid, 53 | environ=dask_env_variables, 54 | tags=github_cluster_tags, 55 | **cluster_kwargs["test_work_stealing_on_scaling_up"], 56 | ) as cluster: 57 | with Client(cluster) as client: 58 | # FIXME https://github.com/coiled/platform/issues/103 59 | client.wait_for_workers(1, timeout=300) 60 | with benchmark_all(client): 61 | # Slow task. 62 | def func1(chunk): 63 | if sum(chunk.shape) != 0: # Make initialization fast 64 | time.sleep(5) 65 | return chunk 66 | 67 | def func2(chunk): 68 | return chunk 69 | 70 | data = da.zeros((30, 30, 30), chunks=5) 71 | result = data.map_overlap(func1, depth=1, dtype=data.dtype) 72 | result = result.map_overlap(func2, depth=1, dtype=data.dtype) 73 | future = client.compute(result) 74 | 75 | print("started computation") 76 | 77 | time.sleep(11) 78 | # print('scaling to 4 workers') 79 | # client.cluster.scale(4) 80 | 81 | time.sleep(5) 82 | print("scaling to 20 workers") 83 | cluster.scale(20) 84 | 85 | _ = future.result() 86 | 87 | 88 | @run_up_to_nthreads("small_cluster", 100, reason="fixed dataset") 89 | def test_work_stealing_on_straggling_worker( 90 | test_name_uuid, 91 | benchmark_all, 92 | cluster_kwargs, 93 | dask_env_variables, 94 | github_cluster_tags, 95 | ): 96 | kwargs = cluster_kwargs["test_work_stealing_on_straggling_worker"] 97 | with Cluster( 98 | name=test_name_uuid, 99 | environ=dask_env_variables, 100 | tags=github_cluster_tags, 101 | **kwargs, 102 | ) as cluster: 103 | with Client(cluster) as client: 104 | # FIXME https://github.com/coiled/platform/issues/103 105 | client.wait_for_workers(kwargs["n_workers"], timeout=600) 106 | with benchmark_all(client): 107 | 108 | def clog(): 109 | time.sleep(1) 110 | 111 | @delayed 112 | def slowinc(i, delay): 113 | time.sleep(delay) 114 | return i + 1 115 | 116 | def install_clogging_callback(dask_worker): 117 | pc = PeriodicCallback(clog, 1500) 118 | dask_worker.periodic_callbacks["clog"] = pc 119 | pc.start() 120 | 121 | straggler = list(client.scheduler_info()["workers"].keys())[0] 122 | client.run(install_clogging_callback, workers=[straggler]) 123 | results = [slowinc(i, delay=1) for i in range(1000)] 124 | futs = client.compute(results) 125 | client.gather(futs) 126 | -------------------------------------------------------------------------------- /benchmark_schema.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Boolean, Column, DateTime, Float, Integer, String 2 | from sqlalchemy.orm import declarative_base 3 | 4 | Base = declarative_base() 5 | 6 | 7 | class TestRun(Base): 8 | __tablename__ = "test_run" 9 | 10 | # unique run ID 11 | id = Column(Integer, primary_key=True) 12 | 13 | # pytest data 14 | session_id = Column(String, nullable=False) 15 | name = Column(String, nullable=False) 16 | originalname = Column(String, nullable=False) 17 | path = Column(String, nullable=True) 18 | setup_outcome = Column(String, nullable=True) 19 | call_outcome = Column(String, nullable=True) 20 | teardown_outcome = Column(String, nullable=True) 21 | 22 | # Runtime data 23 | coiled_runtime_version = Column(String, nullable=True) 24 | coiled_software_name = Column(String, nullable=True) 25 | dask_version = Column(String, nullable=True) 26 | dask_expr_version = Column(String, nullable=True) 27 | distributed_version = Column(String, nullable=True) 28 | python_version = Column(String, nullable=True) 
29 | platform = Column(String, nullable=True) 30 | 31 | # CI runner data 32 | ci_run_url = Column(String, nullable=True) 33 | 34 | # Wall clock data 35 | start = Column(DateTime, nullable=True) 36 | end = Column(DateTime, nullable=True) 37 | duration = Column(Float, nullable=True) 38 | 39 | # Memory data 40 | average_memory = Column(Float, nullable=True) 41 | peak_memory = Column(Float, nullable=True) 42 | 43 | # Durations data 44 | compute_time = Column(Float, nullable=True) 45 | disk_spill_time = Column(Float, nullable=True) 46 | serializing_time = Column(Float, nullable=True) 47 | transfer_time = Column(Float, nullable=True) 48 | 49 | # Scheduler 50 | scheduler_cpu_avg = Column(Float, nullable=True) 51 | scheduler_memory_max = Column(Float, nullable=True) 52 | 53 | # Event Loop 54 | worker_max_tick = Column(Float, nullable=True) 55 | scheduler_max_tick = Column(Float, nullable=True) 56 | 57 | # Cluster name/id/details_url 58 | cluster_name = Column(String, nullable=True) 59 | cluster_id = Column(Integer, nullable=True) 60 | cluster_details_url = Column(String, nullable=True) 61 | 62 | # Artifacts 63 | performance_report_url = Column(String, nullable=True) # Not yet collected 64 | cluster_dump_url = Column(String, nullable=True) 65 | memray_profiles_url = Column(String, nullable=True) 66 | py_spy_profiles_url = Column(String, nullable=True) 67 | 68 | 69 | class TPCHRun(Base): 70 | __tablename__ = "tpch_run" 71 | 72 | # unique run ID 73 | id = Column(Integer, primary_key=True) 74 | 75 | # pytest data 76 | session_id = Column(String, nullable=False) 77 | name = Column(String, nullable=False) 78 | originalname = Column(String, nullable=False) 79 | path = Column(String, nullable=True) 80 | setup_outcome = Column(String, nullable=True) 81 | call_outcome = Column(String, nullable=True) 82 | teardown_outcome = Column(String, nullable=True) 83 | 84 | # Runtime data 85 | dask_version = Column(String, nullable=True) 86 | dask_expr_version = Column(String, nullable=True) 87 | distributed_version = Column(String, nullable=True) 88 | duckdb_version = Column(String, nullable=True) 89 | pyspark_version = Column(String, nullable=True) 90 | polars_version = Column(String, nullable=True) 91 | 92 | python_version = Column(String, nullable=True) 93 | platform = Column(String, nullable=True) 94 | 95 | # CI runner data 96 | ci_run_url = Column(String, nullable=True) 97 | 98 | # Wall clock data 99 | start = Column(DateTime, nullable=True) 100 | end = Column(DateTime, nullable=True) 101 | duration = Column(Float, nullable=True) 102 | 103 | # Memory data 104 | average_memory = Column(Float, nullable=True) 105 | peak_memory = Column(Float, nullable=True) 106 | 107 | # Cluster name/id/details_url 108 | cluster_name = Column(String, nullable=True) 109 | cluster_id = Column(Integer, nullable=True) 110 | cluster_details_url = Column(String, nullable=True) 111 | 112 | scale = Column(Integer, nullable=False) 113 | query = Column(Integer, nullable=False) 114 | local = Column(Boolean, nullable=False) 115 | 116 | compression = Column(String, nullable=True) 117 | partition_size = Column(String, nullable=True) 118 | partition_size = Column(String, nullable=True) 119 | 120 | n_workers = Column(Integer, nullable=True) 121 | worker_vm_type = Column(String, nullable=True) 122 | cluster_disk_size = Column(Integer, nullable=True) 123 | -------------------------------------------------------------------------------- /tests/tpch/test_correctness.py: -------------------------------------------------------------------------------- 1 
| import os 2 | 3 | import coiled 4 | import dask 5 | import pandas as pd 6 | import pytest 7 | from distributed import LocalCluster 8 | 9 | from .utils import get_answers_path, get_cluster_spec, get_dataset_path 10 | 11 | pytestmark = pytest.mark.tpch_correctness 12 | 13 | 14 | @pytest.fixture(params=[1, 10, 100], scope="session") 15 | def scale(request): 16 | scale = request.param 17 | if scale != 100: 18 | pytest.skip(reason="Don't test everything by default") 19 | return scale 20 | 21 | 22 | # Override identical fixture in conftest.py to use different scale 23 | @pytest.fixture(scope="session") 24 | def dataset_path(local, scale): 25 | return get_dataset_path(local, scale) 26 | 27 | 28 | @pytest.fixture(scope="session") 29 | def answers_path(local, scale): 30 | return get_answers_path(local, scale) 31 | 32 | 33 | # Override identical fixture in conftest.py to use different scale 34 | @pytest.fixture(scope="session") 35 | def cluster_spec(scale, shutdown_on_close): 36 | return get_cluster_spec(scale=scale, shutdown_on_close=shutdown_on_close) 37 | 38 | 39 | @pytest.fixture(scope="module") 40 | def cluster( 41 | local, 42 | scale, 43 | module, 44 | dask_env_variables, 45 | cluster_spec, 46 | github_cluster_tags, 47 | name, 48 | make_chart, 49 | ): 50 | if local: 51 | with LocalCluster() as cluster: 52 | yield cluster 53 | else: 54 | kwargs = dict( 55 | name=f"tpch-{module}-{scale}-{name}", 56 | environ=dask_env_variables, 57 | tags=github_cluster_tags, 58 | region="us-east-2", 59 | **cluster_spec, 60 | ) 61 | with dask.config.set({"distributed.scheduler.worker-saturation": "inf"}): 62 | with coiled.Cluster(**kwargs) as cluster: 63 | yield cluster 64 | 65 | 66 | @pytest.fixture 67 | def client( 68 | request, 69 | cluster, 70 | cluster_kwargs, 71 | get_cluster_info, 72 | performance_report, 73 | benchmark_time, 74 | restart, 75 | local, 76 | query, 77 | scale, 78 | ): 79 | with cluster.get_client() as client: 80 | if restart: 81 | client.restart() 82 | client.run(lambda: None) 83 | 84 | with get_cluster_info(cluster), performance_report, benchmark_time: 85 | yield client 86 | 87 | 88 | def get_expected_answer(query: int, answers_path: str, s3_storage_options): 89 | answer = pd.read_parquet( 90 | os.path.join(answers_path, f"answer_{query}.parquet"), 91 | storage_options=s3_storage_options, 92 | ) 93 | answer = answer.rename(columns=lambda x: x.strip()) 94 | if "o_orderdate" in answer.columns: 95 | answer["o_orderdate"] = pd.to_datetime(answer["o_orderdate"]) 96 | if "cntrycode" in answer.columns: 97 | answer["cntrycode"] = answer["cntrycode"].astype(str) 98 | 99 | return answer 100 | 101 | 102 | def verify_result( 103 | result: pd.DataFrame, query: int, answers_path: str, s3_storage_options 104 | ): 105 | expected = get_expected_answer(query, answers_path, s3_storage_options) 106 | 107 | for column, dtype in expected.dtypes.items(): 108 | if pd.api.types.is_object_dtype(dtype): 109 | result[column] = result[column].astype("str") 110 | expected[column] = expected[column].astype("str") 111 | # Some DuckDB results appear to be stripped, so strip them all for better comparison. 
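            # (Editor's note, illustrative: e.g. a fixed-width value such as
            # "FRANCE                   " on one side only would otherwise fail
            # the exact frame comparison below.)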
112 | result[column] = result[column].str.strip() 113 | expected[column] = expected[column].str.strip() 114 | 115 | # Query 11 is not deterministically sorted, there may be several 'ps_partkey' with the same 'value' 116 | if query == 11: 117 | assert result["value"].is_monotonic_decreasing 118 | assert expected["value"].is_monotonic_decreasing 119 | result = result.sort_values(["value", "ps_partkey"], ascending=[False, True]) 120 | expected = expected.sort_values( 121 | ["value", "ps_partkey"], ascending=[False, True] 122 | ) 123 | 124 | result = result.reset_index(drop=True) 125 | expected = expected.reset_index(drop=True) 126 | 127 | pd.testing.assert_frame_equal(result, expected, check_dtype=False, atol=1e-2) 128 | 129 | 130 | @pytest.mark.tpch_correctness 131 | @pytest.mark.parametrize( 132 | "query", 133 | [ 134 | 1, 135 | 2, 136 | 3, 137 | 4, 138 | 5, 139 | 6, 140 | 7, 141 | 8, 142 | 9, 143 | 10, 144 | 11, 145 | 12, 146 | 13, 147 | 14, 148 | 15, 149 | 16, 150 | 17, 151 | 18, 152 | 19, 153 | 20, 154 | 21, 155 | 22, 156 | ], 157 | ) 158 | def test_dask_results( 159 | query, scale, local, dataset_path, answers_path, s3_storage_options, client 160 | ): 161 | from . import dask_queries 162 | 163 | func = getattr(dask_queries, f"query_{query:02d}") 164 | result = func(dataset_path, None, scale).compute() 165 | verify_result(result, query, answers_path, s3_storage_options) 166 | -------------------------------------------------------------------------------- /tests/benchmarks/test_rechunk.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask 4 | import dask.array as da 5 | import pytest 6 | 7 | from ..conftest import requires_p2p_memory, requires_p2p_rechunk 8 | from ..utils_test import cluster_memory, scaled_array_shape, wait 9 | 10 | 11 | @pytest.fixture(params=["8.5 MiB", "auto"]) 12 | def input_chunk_size(request): 13 | return request.param 14 | 15 | 16 | @pytest.fixture( 17 | params=[ 18 | pytest.param("tasks", marks=pytest.mark.shuffle_tasks), 19 | pytest.param("p2p-disk", marks=[pytest.mark.shuffle_p2p, requires_p2p_rechunk]), 20 | pytest.param( 21 | "p2p-memory", marks=[pytest.mark.shuffle_p2p, requires_p2p_memory] 22 | ), 23 | ] 24 | ) 25 | def configure_rechunking_in_memory(request): 26 | if request.param == "tasks": 27 | with dask.config.set({"array.rechunk.method": "tasks"}): 28 | yield 29 | else: 30 | disk = "disk" in request.param 31 | with dask.config.set( 32 | { 33 | "array.rechunk.method": "p2p", 34 | "distributed.p2p.disk": disk, 35 | } 36 | ): 37 | yield 38 | 39 | 40 | @pytest.fixture( 41 | params=[ 42 | pytest.param("tasks", marks=pytest.mark.shuffle_tasks), 43 | pytest.param("p2p", marks=[pytest.mark.shuffle_p2p, requires_p2p_rechunk]), 44 | ] 45 | ) 46 | def configure_rechunking_out_of_core(request): 47 | if request.param == "tasks": 48 | with dask.config.set({"array.rechunk.method": "tasks"}): 49 | yield 50 | else: 51 | with dask.config.set( 52 | { 53 | "array.rechunk.method": "p2p", 54 | "distributed.p2p.disk": True, 55 | } 56 | ): 57 | yield 58 | 59 | 60 | def test_tiles_to_rows( 61 | # Order matters: don't initialize client when skipping test 62 | input_chunk_size, 63 | configure_rechunking_in_memory, 64 | small_client, 65 | ): 66 | """2D array sliced into square tiles becomes sliced by columns. 67 | This use case can be broken down into N independent problems. 68 | In task rechunk, this generates O(N) intermediate tasks and graph edges.
69 | """ 70 | memory = cluster_memory(small_client) 71 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 72 | 73 | a = da.random.random(shape, chunks=input_chunk_size) 74 | a = a.rechunk((-1, "auto")).sum() 75 | wait(a, small_client, timeout=600) 76 | 77 | 78 | def test_swap_axes_in_memory( 79 | # Order matters: don't initialize client when skipping test 80 | input_chunk_size, 81 | configure_rechunking_in_memory, 82 | small_client, 83 | ): 84 | """2D array sliced by columns becomes sliced by rows. 85 | This is an N-to-N problem, so grouping into sub-problems is impossible. 86 | In task rechunk, this generates O(N^2) intermediate tasks and graph edges. 87 | """ 88 | memory = cluster_memory(small_client) 89 | shape = scaled_array_shape(memory * 0.5, ("x", "x")) 90 | 91 | a = da.random.random(shape, chunks=(-1, input_chunk_size)) 92 | a = a.rechunk(("auto", -1)).sum() 93 | wait(a, small_client, timeout=600) 94 | 95 | 96 | def test_swap_axes_out_of_core( 97 | # Order matters: don't initialize client when skipping test 98 | configure_rechunking_out_of_core, 99 | small_client, 100 | ): 101 | """2D array sliced by columns becomes sliced by rows. 102 | This is an N-to-N problem, so grouping into sub-problems is impossible. 103 | In task rechunk, this generates O(N^2) intermediate tasks and graph edges. 104 | """ 105 | memory = cluster_memory(small_client) 106 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 107 | 108 | a = da.random.random(shape, chunks=(-1, "auto")) 109 | a = a.rechunk(("auto", -1)).sum() 110 | wait(a, small_client, timeout=600) 111 | 112 | 113 | def test_adjacent_groups( 114 | # Order matters: don't initialize client when skipping test 115 | input_chunk_size, 116 | configure_rechunking_in_memory, 117 | small_client, 118 | ): 119 | """M-to-N use case, where each input task feeds into a localized but substantial 120 | subset of the output tasks, with partial interaction between adjacent zones. 121 | """ 122 | memory = cluster_memory(small_client) 123 | shape = scaled_array_shape(memory * 1.5, ("x", 10, 10_000)) 124 | 125 | a = da.random.random(shape, chunks=(input_chunk_size, 2, 5_000)) 126 | a = a.rechunk(("auto", 5, 10_000)).sum() 127 | wait(a, small_client, timeout=600) 128 | 129 | 130 | def test_heal_oversplit( 131 | # Order matters: don't initialize client when skipping test 132 | configure_rechunking_in_memory, 133 | small_client, 134 | ): 135 | """rechunk() is used to heal a situation where chunks are too small. 136 | This is a trivial N-to-1 reduction step that gets no benefit from p2p rechunking. 137 | """ 138 | memory = cluster_memory(small_client) 139 | shape = scaled_array_shape(memory * 1.5, ("x", "x")) 140 | # Avoid exact n:1 rechunking, which would be a simpler special case. 141 | # Dask should be smart enough to avoid splitting input chunks out to multiple output 142 | # chunks. 
143 | a = da.random.random(shape, chunks="8.5 MiB") 144 | a = a.rechunk("128 MiB").sum() 145 | wait(a, small_client, timeout=600) 146 | -------------------------------------------------------------------------------- /detect_regressions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from distutils.util import strtobool 4 | 5 | import pandas as pd 6 | import sqlalchemy 7 | 8 | 9 | def detect_regressions(database_file, is_pr=False): 10 | engine = sqlalchemy.create_engine(f"sqlite:///{database_file}") 11 | 12 | # regression analysis only on tests that passed 13 | df = pd.read_sql( 14 | "select * from test_run where platform = 'linux' and call_outcome = 'passed'", 15 | engine, 16 | ) 17 | 18 | # join runtime + py version 19 | df = df.assign( 20 | runtime=( 21 | "coiled-" 22 | + df.coiled_runtime_version 23 | + "-py" 24 | + df.python_version.str.split(".", n=2).str[:2].str.join(".") 25 | ), 26 | category=df.path.str.split("/", n=1).str[0], 27 | ) 28 | 29 | reg_df = pd.DataFrame( 30 | columns=[ 31 | "category", 32 | "type", 33 | "mean", 34 | "last", 35 | "last-1", 36 | "last-2", 37 | "threshold", 38 | "str_report", 39 | ] 40 | ) 41 | if is_pr: 42 | # Only include last run in detection regression 43 | n_last = 1 44 | n_std = 3 # be a bit more aggressive on PRs 45 | else: 46 | n_last = 3 47 | n_std = 3 48 | 49 | runtimes = list(df.runtime.unique()) 50 | for runtime in runtimes: 51 | by_test = df[(df.runtime == runtime)].groupby("name") 52 | 53 | test_names = list(by_test.groups.keys()) 54 | for name in test_names: 55 | df_test = by_test.get_group(name) 56 | 57 | # check the test is not obsolete. 58 | if pd.Timestamp(df_test.start.iloc[-1]) < ( 59 | pd.Timestamp.now() - pd.Timedelta(days=7) 60 | ): 61 | # the latest run was 7+ days ago, test is obsolete 62 | pass 63 | else: 64 | for metric in ["duration", "average_memory", "peak_memory"]: 65 | # check that we have enough data to do some stats (last three plus previous ten) 66 | if len(df_test.loc[df_test[metric].notna()]) > (10 + n_last): 67 | category = df_test.category.unique()[0] 68 | 69 | if metric in ["average_memory", "peak_memory"]: 70 | units_norm = 1 / (1024**3) # to GiB to match dashboard 71 | u = "[GiB]" 72 | else: 73 | units_norm = 1 74 | u = "[s]" 75 | 76 | metric_threshold = df_test[metric][ 77 | -(10 + n_last) : -n_last 78 | ].mean() + max( 79 | n_std * df_test[metric][-(10 + n_last) : -n_last].std(), 80 | 1 / units_norm, 81 | ) 82 | 83 | if (df_test[metric].iloc[-n_last:] >= metric_threshold).all(): 84 | last_three = ( 85 | df_test[metric].iloc[-1] * units_norm, 86 | df_test[metric].iloc[-2] * units_norm, 87 | df_test[metric].iloc[-3] * units_norm, 88 | ) 89 | reg = ( 90 | f"{runtime=}, {name=}, {category=}, " 91 | f"last_three_{metric} {u} = " 92 | f"{last_three}, " 93 | f"{metric}_threshold {u} = {metric_threshold * units_norm} \n" 94 | ) 95 | 96 | # ["category", "type", "mean", "last", "last-1", "last-2", "threshold"]) 97 | reg_df.loc[f"{(runtime, name, metric)} {u}"] = [ 98 | category, 99 | metric, 100 | df_test[metric][-(10 + n_last) : -n_last].mean() 101 | * units_norm, 102 | df_test[metric].iloc[-1] * units_norm, 103 | df_test[metric].iloc[-2] * units_norm, 104 | df_test[metric].iloc[-3] * units_norm, 105 | metric_threshold * units_norm, 106 | reg, 107 | ] 108 | 109 | return reg_df 110 | 111 | 112 | def regressions_report(reg_df): 113 | # write reg_df to markdown for GHA summary 114 | cols_for_report = [ 115 | "category", 116 | "type", 117 | "mean", 
118 | "last", 119 | "last-1", 120 | "last-2", 121 | "threshold", 122 | ] 123 | reg_df[cols_for_report].to_markdown("regressions_summary.md") 124 | 125 | if not reg_df.empty: 126 | # Raise exception to cause CI job to fail if we detected regressions 127 | raise Exception( 128 | f"\x1b[31m {len(reg_df)} regressions detected: \n{''.join(reg_df.str_report.values)} \x1b[0m" 129 | ) 130 | else: 131 | return 132 | 133 | 134 | if __name__ == "__main__": 135 | DB_FILE = pathlib.Path("./benchmark.db") 136 | 137 | IS_PR = strtobool(os.environ.get("IS_PR", "false")) 138 | regressions_df = detect_regressions(DB_FILE, is_pr=IS_PR) 139 | 140 | regressions_report(regressions_df) 141 | -------------------------------------------------------------------------------- /tests/tpch/test_duckdb.py: -------------------------------------------------------------------------------- 1 | import botocore.session 2 | import pytest 3 | 4 | pytestmark = pytest.mark.tpch_nondask 5 | 6 | duckdb = pytest.importorskip("duckdb") 7 | 8 | from . import duckdb_queries # noqa: E402 9 | 10 | 11 | @pytest.fixture(autouse=True) 12 | def add_duckdb_version(tpch_database_table_schema): 13 | tpch_database_table_schema.duckdb_version = duckdb.__version__ 14 | 15 | 16 | @pytest.fixture(autouse=True) 17 | def add_cluster_spec_to_db(tpch_database_table_schema, machine_spec, local): 18 | if not local: 19 | tpch_database_table_schema.n_workers = 1 20 | tpch_database_table_schema.worker_vm_type = machine_spec["vm_type"] 21 | tpch_database_table_schema.cluster_disk_size = machine_spec.get( 22 | "worker_disk_size" 23 | ) 24 | 25 | 26 | @pytest.fixture 27 | def connection(local, restart): 28 | def _(): 29 | con = duckdb.connect() 30 | 31 | if not local: # Setup s3 credentials 32 | session = botocore.session.Session() 33 | creds = session.get_credentials() 34 | con.install_extension("httpfs") 35 | con.load_extension("httpfs") 36 | con.sql( 37 | f""" 38 | SET s3_region='us-east-2'; 39 | SET s3_access_key_id='{creds.access_key}'; 40 | SET s3_secret_access_key='{creds.secret_key}'; 41 | SET s3_session_token='{creds.token}'; 42 | """ 43 | ) 44 | return con 45 | 46 | return _ 47 | 48 | 49 | def test_query_01(run, connection, dataset_path, scale): 50 | def _(): 51 | duckdb_queries.query_01(connection(), dataset_path, scale) 52 | 53 | run(_) 54 | 55 | 56 | def test_query_02(run, connection, dataset_path, scale): 57 | def _(): 58 | duckdb_queries.query_02(connection(), dataset_path, scale) 59 | 60 | run(_) 61 | 62 | 63 | def test_query_03(run, connection, dataset_path, scale): 64 | def _(): 65 | duckdb_queries.query_03(connection(), dataset_path, scale) 66 | 67 | run(_) 68 | 69 | 70 | def test_query_04(run, connection, dataset_path, scale): 71 | def _(): 72 | duckdb_queries.query_04(connection(), dataset_path, scale) 73 | 74 | run(_) 75 | 76 | 77 | def test_query_05(run, connection, dataset_path, scale): 78 | def _(): 79 | duckdb_queries.query_05(connection(), dataset_path, scale) 80 | 81 | run(_) 82 | 83 | 84 | def test_query_06(run, connection, dataset_path, scale): 85 | def _(): 86 | duckdb_queries.query_06(connection(), dataset_path, scale) 87 | 88 | run(_) 89 | 90 | 91 | def test_query_07(run, connection, dataset_path, scale): 92 | def _(): 93 | duckdb_queries.query_07(connection(), dataset_path, scale) 94 | 95 | run(_) 96 | 97 | 98 | def test_query_08(run, connection, dataset_path, scale): 99 | def _(): 100 | duckdb_queries.query_08(connection(), dataset_path, scale) 101 | 102 | run(_) 103 | 104 | 105 | def test_query_09(run, connection, 
dataset_path, scale): 106 | def _(): 107 | duckdb_queries.query_09(connection(), dataset_path, scale) 108 | 109 | run(_) 110 | 111 | 112 | def test_query_10(run, connection, dataset_path, scale): 113 | def _(): 114 | duckdb_queries.query_10(connection(), dataset_path, scale) 115 | 116 | run(_) 117 | 118 | 119 | def test_query_11(run, connection, dataset_path, scale): 120 | def _(): 121 | duckdb_queries.query_11(connection(), dataset_path, scale) 122 | 123 | run(_) 124 | 125 | 126 | def test_query_12(run, connection, dataset_path, scale): 127 | def _(): 128 | duckdb_queries.query_12(connection(), dataset_path, scale) 129 | 130 | run(_) 131 | 132 | 133 | def test_query_13(run, connection, dataset_path, scale): 134 | def _(): 135 | duckdb_queries.query_13(connection(), dataset_path, scale) 136 | 137 | run(_) 138 | 139 | 140 | def test_query_14(run, connection, dataset_path, scale): 141 | def _(): 142 | duckdb_queries.query_14(connection(), dataset_path, scale) 143 | 144 | run(_) 145 | 146 | 147 | def test_query_15(run, connection, dataset_path, scale): 148 | def _(): 149 | duckdb_queries.query_15(connection(), dataset_path, scale) 150 | 151 | run(_) 152 | 153 | 154 | def test_query_16(run, connection, dataset_path, scale): 155 | def _(): 156 | duckdb_queries.query_16(connection(), dataset_path, scale) 157 | 158 | run(_) 159 | 160 | 161 | def test_query_17(run, connection, dataset_path, scale): 162 | def _(): 163 | duckdb_queries.query_17(connection(), dataset_path, scale) 164 | 165 | run(_) 166 | 167 | 168 | def test_query_18(run, connection, dataset_path, scale): 169 | def _(): 170 | duckdb_queries.query_18(connection(), dataset_path, scale) 171 | 172 | run(_) 173 | 174 | 175 | def test_query_19(run, connection, dataset_path, scale): 176 | def _(): 177 | duckdb_queries.query_19(connection(), dataset_path, scale) 178 | 179 | run(_) 180 | 181 | 182 | def test_query_20(run, connection, dataset_path, scale): 183 | def _(): 184 | duckdb_queries.query_20(connection(), dataset_path, scale) 185 | 186 | run(_) 187 | 188 | 189 | def test_query_21(run, connection, dataset_path, scale): 190 | def _(): 191 | duckdb_queries.query_21(connection(), dataset_path, scale) 192 | 193 | run(_) 194 | 195 | 196 | def test_query_22(run, connection, dataset_path, scale): 197 | def _(): 198 | duckdb_queries.query_22(connection(), dataset_path, scale) 199 | 200 | run(_) 201 | -------------------------------------------------------------------------------- /tests/benchmarks/test_parquet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parquet-related benchmarks. 
3 | """ 4 | import io 5 | import uuid 6 | 7 | import boto3 8 | import dask.dataframe as dd 9 | import dask.datasets 10 | import distributed 11 | import fsspec 12 | import pandas 13 | import pytest 14 | from coiled import Cluster 15 | from packaging.version import Version 16 | 17 | from ..conftest import dump_cluster_kwargs 18 | from ..utils_test import run_up_to_nthreads, wait 19 | 20 | try: 21 | import pyarrow 22 | 23 | HAS_PYARROW12 = Version(pyarrow.__version__) >= Version("12.0.0") 24 | except ImportError: 25 | HAS_PYARROW12 = False 26 | 27 | 28 | @pytest.fixture(scope="module") 29 | def parquet_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 30 | kwargs = dict( 31 | name=f"parquet-{uuid.uuid4().hex[:8]}", 32 | environ=dask_env_variables, 33 | tags=github_cluster_tags, 34 | **cluster_kwargs["parquet_cluster"], 35 | ) 36 | dump_cluster_kwargs(kwargs, "parquet") 37 | 38 | with Cluster(**kwargs) as cluster: 39 | yield cluster 40 | 41 | 42 | @pytest.fixture 43 | def parquet_client(parquet_cluster, cluster_kwargs, benchmark_all, wait_for_workers): 44 | n_workers = cluster_kwargs["parquet_cluster"]["n_workers"] 45 | with distributed.Client(parquet_cluster) as client: 46 | parquet_cluster.scale(n_workers) 47 | wait_for_workers(client, n_workers, timeout=600) 48 | client.restart() 49 | with benchmark_all(client): 50 | yield client 51 | 52 | 53 | @pytest.mark.skipif( 54 | HAS_PYARROW12, 55 | reason="50x slower than PyArrow 11; https://github.com/coiled/benchmarks/issues/998", 56 | ) 57 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 58 | def test_read_spark_generated_data(parquet_client): 59 | """ 60 | Read a ~15 GB subset of a ~800 GB spark-generated 61 | open dataset on AWS. 62 | 63 | The dataset was copied from AWS open data on 2022-05-25 64 | https://registry.opendata.aws/1000-genomes-data-lakehouse-ready/ 65 | Citation: https://www.nature.com/articles/s41467-018-08148-z 66 | """ 67 | ddf = dd.read_parquet( 68 | "s3://coiled-runtime-ci/thousandgenomes_dagen/NA21**.parquet", 69 | engine="pyarrow", 70 | index="sample_id", 71 | ) 72 | coll = ddf.groupby(ddf.index).first() 73 | wait(coll, parquet_client, 500) 74 | 75 | 76 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 77 | def test_read_hive_partitioned_data(parquet_client): 78 | """ 79 | Read a dataset partitioned by year and quarter. 
80 | 81 | The dataset was copied from AWS open data on 2022-05-25 82 | https://registry.opendata.aws/speedtest-global-performance/ 83 | """ 84 | ddf = dd.read_parquet( 85 | "s3://coiled-runtime-ci/ookla-open-data/type=fixed/*/*/*.parquet", 86 | engine="pyarrow", 87 | ) 88 | coll = ddf.groupby(["year", "quarter"]).first() 89 | wait(coll, parquet_client, 100) 90 | 91 | 92 | @run_up_to_nthreads("parquet_cluster", 100, reason="fixed dataset") 93 | def test_write_wide_data(parquet_client, s3_url): 94 | # Write a ~700 partition, ~200 GB dataset with a lot of columns 95 | ddf = dask.datasets.timeseries( 96 | dtypes={ 97 | **{f"name-{i}": str for i in range(25)}, 98 | **{f"price-{i}": float for i in range(25)}, 99 | **{f"id-{i}": int for i in range(25)}, 100 | **{f"cat-{i}": "category" for i in range(25)}, 101 | }, 102 | start="2021-01-01", 103 | end="2021-02-01", 104 | freq="10ms", 105 | partition_freq="1H", 106 | ) 107 | ddf.to_parquet(s3_url + "/wide-data/") 108 | 109 | 110 | @run_up_to_nthreads("parquet_cluster", 60, reason="fixed dataset") 111 | @pytest.mark.parametrize("kind", ["boto3", "s3fs", "pandas", "pandas+boto3", "dask"]) 112 | def test_download_throughput(parquet_client, kind): 113 | """Test throughput for downloading and parsing a single 563 MB parquet file. 114 | 115 | Note 116 | ---- 117 | I/O performance on S3 is heavily dependent on how many times the same file has been 118 | requested over the last few seconds. In A/B tests, this could lead to a false 119 | impression that test cases later in this list are faster than the earlier ones. 120 | Read more: https://github.com/coiled/benchmarks/issues/821 121 | """ 122 | path = ( 123 | "s3://coiled-runtime-ci/ookla-open-data/" 124 | "type=fixed/year=2022/quarter=1/2022-01-01_performance_fixed_tiles.parquet" 125 | ) 126 | 127 | def boto3_load(path): 128 | s3 = boto3.client("s3") 129 | _, _, bucket_name, key = path.split("/", maxsplit=3) 130 | response = s3.get_object(Bucket=bucket_name, Key=key) 131 | return response["Body"].read() 132 | 133 | if kind == "boto3": 134 | fut = parquet_client.submit(boto3_load, path) 135 | 136 | elif kind == "s3fs": 137 | 138 | def load(path): 139 | with fsspec.open(path) as f: 140 | return f.read() 141 | 142 | fut = parquet_client.submit(load, path) 143 | 144 | elif kind == "pandas": 145 | fut = parquet_client.submit(pandas.read_parquet, path, engine="pyarrow") 146 | 147 | elif kind == "pandas+boto3": 148 | 149 | def load(path): 150 | raw = boto3_load(path) 151 | buf = io.BytesIO(raw) 152 | return pandas.read_parquet(buf, engine="pyarrow") 153 | 154 | fut = parquet_client.submit(load, path) 155 | 156 | elif kind == "dask": 157 | fut = dd.read_parquet(path, engine="pyarrow") 158 | 159 | wait(fut, parquet_client, timeout=60) 160 | -------------------------------------------------------------------------------- /tests/benchmarks/test_h2o.py: -------------------------------------------------------------------------------- 1 | """ 2 | h2o-ai benchmark groupby part running on coiled. 3 | 4 | Note: Only holistic aggregations (median and groupby-apply) use a shuffle with the 5 | default split_out=1. 
6 | """ 7 | import os 8 | 9 | import dask.dataframe as dd 10 | import pandas as pd 11 | import pytest 12 | 13 | from ..utils_test import run_up_to_nthreads 14 | 15 | DATASETS = { 16 | "0.5 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2/*.csv", 17 | "5 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2/*.csv", 18 | "50 GB (csv)": "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2/*.csv", 19 | "0.5 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e7_K_1e2_parquet/*.parquet", 20 | "5 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e8_K_1e2_parquet/*.parquet", 21 | "50 GB (parquet)": "s3://coiled-datasets/h2o-benchmark/N_1e9_K_1e2_parquet/*.parquet", 22 | "5 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e8_K_1e2/*.parquet", 23 | "50 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e9_K_1e2/*.parquet", 24 | "500 GB (parquet+pyarrow)": "s3://coiled-datasets/h2o-benchmark/pyarrow_strings/N_1e10_K_1e2/*.parquet", 25 | } 26 | 27 | enabled_datasets = os.getenv("H2O_DATASETS") 28 | if enabled_datasets is not None: 29 | enabled_datasets = {k.strip() for k in enabled_datasets.split(",")} 30 | if unknown_datasets := enabled_datasets - DATASETS.keys(): 31 | raise ValueError("Unknown h2o dataset(s): ", unknown_datasets) 32 | else: 33 | enabled_datasets = { 34 | "5 GB (parquet)", 35 | } 36 | 37 | 38 | @pytest.fixture(autouse=True) 39 | def client(small_client): 40 | yield small_client 41 | 42 | 43 | @pytest.fixture(params=sorted(enabled_datasets), scope="module") 44 | def ddf(request): 45 | n_gib = float(request.param.split(" GB ")[0]) 46 | # 0.5 GB datasets are broken in 5~10 files 47 | # 5 GB -> 100 files 48 | # 50 GB -> 1000 files 49 | # 500 GB -> 10,000 files 50 | max_threads = max(20, int(n_gib * 20)) 51 | run_up_to_nthreads( 52 | "small_cluster", max_threads, reason="fixed data size", as_decorator=False 53 | ) 54 | 55 | uri = DATASETS[request.param] 56 | 57 | if uri.endswith("csv"): 58 | yield dd.read_csv( 59 | uri, 60 | dtype={ 61 | "id1": "category", 62 | "id2": "category", 63 | "id3": "category", 64 | "id4": "Int32", 65 | "id5": "Int32", 66 | "id6": "Int32", 67 | "v1": "Int32", 68 | "v2": "Int32", 69 | "v3": "float64", 70 | }, 71 | storage_options={"anon": True}, 72 | ) 73 | else: 74 | yield dd.read_parquet(uri, engine="pyarrow", storage_options={"anon": True}) 75 | 76 | 77 | def test_q1(ddf): 78 | ddf = ddf[["id1", "v1"]] 79 | ddf.groupby("id1", dropna=False, observed=True).agg({"v1": "sum"}).compute() 80 | 81 | 82 | def test_q2(ddf): 83 | ddf = ddf[["id1", "id2", "v1"]] 84 | ( 85 | ddf.groupby(["id1", "id2"], dropna=False, observed=True) 86 | .agg({"v1": "sum"}) 87 | .compute() 88 | ) 89 | 90 | 91 | def test_q3(ddf): 92 | ddf = ddf[["id3", "v1", "v3"]] 93 | ( 94 | ddf.groupby("id3", dropna=False, observed=True) 95 | .agg({"v1": "sum", "v3": "mean"}) 96 | .compute() 97 | ) 98 | 99 | 100 | def test_q4(ddf): 101 | ddf = ddf[["id4", "v1", "v2", "v3"]] 102 | ( 103 | ddf.groupby("id4", dropna=False, observed=True) 104 | .agg({"v1": "mean", "v2": "mean", "v3": "mean"}) 105 | .compute() 106 | ) 107 | 108 | 109 | def test_q5(ddf): 110 | ddf = ddf[["id6", "v1", "v2", "v3"]] 111 | ( 112 | ddf.groupby("id6", dropna=False, observed=True) 113 | .agg( 114 | {"v1": "sum", "v2": "sum", "v3": "sum"}, 115 | ) 116 | .compute() 117 | ) 118 | 119 | 120 | def test_q6(ddf, shuffle_method): 121 | # Median aggregation uses an explicitly-set shuffle 122 | ddf = ddf[["id4", "id5", "v3"]] 123 | ( 124 | ddf.groupby(["id4", "id5"], 
dropna=False, observed=True) 125 | .agg({"v3": ["median", "std"]}, shuffle=shuffle_method) 126 | .compute() # requires shuffle arg to be set explicitly 127 | ) 128 | 129 | 130 | def test_q7(ddf): 131 | ddf = ddf[["id3", "v1", "v2"]] 132 | ( 133 | ddf.groupby("id3", dropna=False, observed=True) 134 | .agg({"v1": "max", "v2": "min"}) 135 | .assign(range_v1_v2=lambda x: x["v1"] - x["v2"])[["range_v1_v2"]] 136 | .compute() 137 | ) 138 | 139 | 140 | def test_q8(ddf, configure_shuffling): 141 | # .groupby(...).apply(...) uses a shuffle to transfer data before applying the function 142 | ddf = ddf[["id6", "v1", "v2", "v3"]] 143 | ( 144 | ddf[~ddf["v3"].isna()][["id6", "v3"]] 145 | .groupby("id6", dropna=False, observed=True) 146 | .apply( 147 | lambda x: x.nlargest(2, columns="v3"), 148 | meta={"id6": "Int64", "v3": "float64"}, 149 | )[["v3"]] 150 | .compute() 151 | ) 152 | 153 | 154 | def test_q9(ddf, configure_shuffling): 155 | # .groupby(...).apply(...) uses a shuffle to transfer data before applying the function 156 | ddf = ddf[["id2", "id4", "v1", "v2"]] 157 | ( 158 | ddf[["id2", "id4", "v1", "v2"]] 159 | .groupby(["id2", "id4"], dropna=False, observed=True) 160 | .apply( 161 | lambda x: pd.Series({"r2": x.corr(numeric_only=True)["v1"]["v2"] ** 2}), 162 | meta={"r2": "float64"}, 163 | ) 164 | .compute() 165 | ) 166 | -------------------------------------------------------------------------------- /tests/benchmarks/test_xarray.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | import dask.array as da 4 | import fsspec 5 | import numpy as np 6 | import pytest 7 | from coiled import Cluster 8 | from dask.utils import parse_bytes 9 | from distributed import Client 10 | 11 | from tests.conftest import dump_cluster_kwargs 12 | 13 | from ..utils_test import ( 14 | cluster_memory, 15 | print_size_info, 16 | run_up_to_nthreads, 17 | scaled_array_shape, 18 | wait, 19 | ) 20 | 21 | xr = pytest.importorskip("xarray") 22 | pytest.importorskip("flox") 23 | 24 | 25 | @pytest.fixture(scope="module") 26 | def group_reduction_cluster(dask_env_variables, cluster_kwargs, github_cluster_tags): 27 | kwargs = dict( 28 | name=f"xarray-group-reduction-{uuid.uuid4().hex[:8]}", 29 | environ=dask_env_variables, 30 | tags=github_cluster_tags, 31 | **cluster_kwargs["group_reduction_cluster"], 32 | ) 33 | dump_cluster_kwargs(kwargs, "group_reduction_cluster") 34 | with Cluster(**kwargs) as cluster: 35 | yield cluster 36 | 37 | 38 | @pytest.fixture 39 | def group_reduction_client( 40 | group_reduction_cluster, cluster_kwargs, benchmark_all, wait_for_workers 41 | ): 42 | n_workers = cluster_kwargs["group_reduction_cluster"]["n_workers"] 43 | with Client(group_reduction_cluster) as client: 44 | group_reduction_cluster.scale(n_workers) 45 | wait_for_workers(client, n_workers, timeout=600) 46 | client.restart() 47 | with benchmark_all(client): 48 | yield client 49 | 50 | 51 | @pytest.mark.parametrize( 52 | "func", 53 | [ 54 | pytest.param( 55 | lambda x: x.groupby("time.month").mean(method="cohorts"), id="cohorts" 56 | ), 57 | pytest.param( 58 | lambda x: x.groupby("time.month").mean(method="map-reduce"), id="map-reduce" 59 | ), 60 | pytest.param( 61 | lambda x: x.chunk(time=xr.groupers.TimeResampler("ME")) 62 | .groupby("time.month") 63 | .mean(method="cohorts"), 64 | id="chunked-cohorts", 65 | ), 66 | ], 67 | ) 68 | def test_xarray_groupby_reduction(group_reduction_client, func): 69 | ds = xr.open_zarr( 70 | fsspec.get_mapper( 71 | 
"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr", anon=True 72 | ), 73 | consolidated=True, 74 | ) 75 | # slice dataset properly to keep runtime in check 76 | subset = ds.zwattablrt.sel(time=slice("2001", "2002")) 77 | subset = subset.isel(x=slice(0, 350 * 8), y=slice(0, 350 * 8)) 78 | result = func(subset) 79 | wait(result, group_reduction_client, 10 * 60) 80 | 81 | 82 | @run_up_to_nthreads("small_cluster", 50, reason="fixed dataset") 83 | @pytest.mark.parametrize("backend", ["dataframe", "array"]) 84 | def test_quadratic_mean(small_client, backend): 85 | # https://github.com/pangeo-data/distributed-array-examples/issues/2 86 | # See https://github.com/dask/dask/issues/10384 87 | size = 5000 88 | ds = xr.Dataset( 89 | dict( 90 | anom_u=( 91 | ["time", "face", "j", "i"], 92 | da.random.random((size, 1, 987, 1920), chunks=(10, 1, -1, -1)), 93 | ), 94 | anom_v=( 95 | ["time", "face", "j", "i"], 96 | da.random.random((size, 1, 987, 1920), chunks=(10, 1, -1, -1)), 97 | ), 98 | ) 99 | ) 100 | 101 | quad = ds**2 102 | quad["uv"] = ds.anom_u * ds.anom_v 103 | mean = quad.mean("time") 104 | if backend == "dataframe": 105 | mean = mean.to_dask_dataframe() 106 | 107 | wait(mean, small_client, 10 * 60) 108 | 109 | 110 | def test_anom_mean(small_client, new_array): 111 | """From https://github.com/dask/distributed/issues/2602#issuecomment-498718651""" 112 | 113 | memory = cluster_memory(small_client) # 76.66 GiB 114 | target_nbytes = memory // 2 115 | data = new_array( 116 | scaled_array_shape(target_nbytes, ("x", "10MiB")), 117 | chunks=(1, parse_bytes("10MiB") // 8), 118 | ) 119 | print_size_info(memory, target_nbytes, data) 120 | # 38.32 GiB - 3925 10.00 MiB chunks 121 | 122 | ngroups = data.shape[0] // 100 123 | arr = xr.DataArray( 124 | data, 125 | dims=["time", "x"], 126 | coords={"day": ("time", np.arange(data.shape[0]) % ngroups)}, 127 | ) 128 | with xr.set_options(use_flox=False): 129 | clim = arr.groupby("day").mean(dim="time") 130 | anom = arr.groupby("day") - clim 131 | anom_mean = anom.mean(dim="time") 132 | 133 | wait(anom_mean, small_client, 10 * 60) 134 | 135 | 136 | @pytest.mark.skip( 137 | "fails in actual CI; see https://github.com/coiled/benchmarks/issues/253" 138 | ) 139 | def test_climatic_mean(small_client, new_array): 140 | """From https://github.com/dask/distributed/issues/2602#issuecomment-535009454""" 141 | 142 | memory = cluster_memory(small_client) # 76.66 GiB 143 | target_nbytes = memory * 2 144 | chunks = (1, 1, 96, 21, 90, 144) 145 | shape = (28, "x", 96, 21, 90, 144) 146 | data = new_array(scaled_array_shape(target_nbytes, shape), chunks=chunks) 147 | print_size_info(memory, target_nbytes, data) 148 | # 152.62 GiB - 784 199.34 MiB chunks 149 | 150 | array = xr.DataArray( 151 | data, 152 | dims=["ensemble", "init_date", "lat", "lead_time", "level", "lon"], 153 | # coords={"init_date": pd.date_range(start="1960", periods=arr.shape[1])}, 154 | coords={"init_date": np.arange(data.shape[1]) % 10}, 155 | ) 156 | # arr_clim = array.groupby("init_date.month").mean(dim="init_date") 157 | with xr.set_options(use_flox=False): 158 | arr_clim = array.groupby("init_date").mean(dim="init_date") 159 | 160 | wait(arr_clim, small_client, 15 * 60) 161 | -------------------------------------------------------------------------------- /tests/geospatial/workloads/climatology.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal 2 | 3 | import numpy as np 4 | import xarray as xr 5 | from dask.delayed import Delayed 6 | 7 
| 8 | def compute_hourly_climatology( 9 | ds: xr.Dataset, 10 | ) -> xr.Dataset: 11 | hours = xr.DataArray(range(0, 24, 6), dims=["hour"]) 12 | window_weights = create_window_weights(61) 13 | return xr.concat( 14 | [compute_rolling_mean(select_hour(ds, hour), window_weights) for hour in hours], 15 | dim=hours, 16 | ) 17 | 18 | 19 | def compute_rolling_mean(ds: xr.Dataset, window_weights: xr.DataArray) -> xr.Dataset: 20 | window_size = len(window_weights) 21 | half_window_size = window_size // 2 # For padding 22 | ds = xr.concat( 23 | [ 24 | replace_time_with_doy(ds.sel(time=str(y))) 25 | for y in np.unique(ds.time.dt.year) 26 | ], 27 | dim="year", 28 | ) 29 | ds = ds.fillna(ds.sel(dayofyear=365)) 30 | ds = ds.pad(pad_width={"dayofyear": half_window_size}, mode="wrap") 31 | ds = ds.rolling(dayofyear=window_size, center=True).construct("window") 32 | ds = ds.weighted(window_weights).mean(dim=("window", "year")) 33 | return ds.isel(dayofyear=slice(half_window_size, -half_window_size)) 34 | 35 | 36 | def create_window_weights(window_size: int) -> xr.DataArray: 37 | """Create linearly decaying window weights.""" 38 | assert window_size % 2 == 1, "Window size must be odd." 39 | half_window_size = window_size // 2 40 | window_weights = np.concatenate( 41 | [ 42 | np.linspace(0, 1, half_window_size + 1), 43 | np.linspace(1, 0, half_window_size + 1)[1:], 44 | ] 45 | ) 46 | window_weights = window_weights / window_weights.mean() 47 | window_weights = xr.DataArray(window_weights, dims=["window"]) 48 | return window_weights 49 | 50 | 51 | def replace_time_with_doy(ds: xr.Dataset) -> xr.Dataset: 52 | """Replace time coordinate with days of year.""" 53 | return ds.assign_coords({"time": ds.time.dt.dayofyear}).rename( 54 | {"time": "dayofyear"} 55 | ) 56 | 57 | 58 | def select_hour(ds: xr.Dataset, hour: int) -> xr.Dataset: 59 | """Select given hour of day from dataset.""" 60 | # Select hour 61 | ds = ds.isel(time=ds.time.dt.hour == hour) 62 | # Adjust time dimension 63 | ds = ds.assign_coords({"time": ds.time.astype("datetime64[D]")}) 64 | return ds 65 | 66 | 67 | def rechunk_map_blocks( 68 | scale: Literal["small", "medium", "large"], 69 | storage_url: str, 70 | storage_options: dict[str, Any], 71 | ) -> Delayed: 72 | # Load dataset 73 | ds = xr.open_zarr( 74 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721.zarr", 75 | ) 76 | 77 | if scale == "small": 78 | # 101.83 GiB (small) 79 | time_range = slice("2020-01-01", "2022-12-31") 80 | variables = ["sea_surface_temperature"] 81 | elif scale == "medium": 82 | # 2.12 TiB (medium) 83 | time_range = slice("1959-01-01", "2022-12-31") 84 | variables = ["sea_surface_temperature"] 85 | else: 86 | # 4.24 TiB (large) 87 | # This currently doesn't complete successfully. 
88 | time_range = slice("1959-01-01", "2022-12-31") 89 | variables = ["sea_surface_temperature", "snow_depth"] 90 | ds = ds[variables].sel(time=time_range) 91 | original_chunks = ds.chunks 92 | 93 | ds = ds.drop_vars([k for k, v in ds.items() if "time" not in v.dims]) 94 | pencil_chunks = {"time": -1, "longitude": "auto", "latitude": "auto"} 95 | 96 | working = ds.chunk(pencil_chunks) 97 | hours = xr.DataArray(range(0, 24, 6), dims=["hour"]) 98 | daysofyear = xr.DataArray(range(1, 367), dims=["dayofyear"]) 99 | template = ( 100 | working.isel(time=0) 101 | .drop_vars("time") 102 | .expand_dims(hour=hours, dayofyear=daysofyear) 103 | .assign_coords(hour=hours, dayofyear=daysofyear) 104 | ) 105 | working = working.map_blocks(compute_hourly_climatology, template=template) 106 | 107 | pancake_chunks = { 108 | "hour": 1, 109 | "dayofyear": 1, 110 | "latitude": original_chunks["latitude"], 111 | "longitude": original_chunks["longitude"], 112 | } 113 | result = working.chunk(pancake_chunks) 114 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 115 | 116 | 117 | def highlevel_api( 118 | scale: Literal["small", "medium", "large"], 119 | storage_url: str, 120 | storage_options: dict[str, Any], 121 | ) -> Delayed: 122 | # Load dataset 123 | ds = xr.open_zarr( 124 | "gs://weatherbench2/datasets/era5/1959-2023_01_10-wb13-6h-1440x721.zarr", 125 | ) 126 | 127 | if scale == "small": 128 | # 101.83 GiB (small) 129 | time_range = slice("2020-01-01", "2022-12-31") 130 | variables = ["sea_surface_temperature"] 131 | elif scale == "medium": 132 | # 2.12 TiB (medium) 133 | time_range = slice("1959-01-01", "2022-12-31") 134 | variables = ["sea_surface_temperature"] 135 | else: 136 | # 4.24 TiB (large) 137 | # This currently doesn't complete successfully. 138 | time_range = slice("1959-01-01", "2022-12-31") 139 | variables = ["sea_surface_temperature", "snow_depth"] 140 | ds = ds[variables].sel(time=time_range) 141 | original_chunks = ds.chunks 142 | 143 | # Drop all static variables 144 | ds = ds.drop_vars([k for k, v in ds.items() if "time" not in v.dims]) 145 | 146 | # Split time dimension into three dimensions 147 | ds["dayofyear"] = ds.time.dt.dayofyear 148 | ds["hour"] = ds.time.dt.hour 149 | ds["year"] = ds.time.dt.year 150 | ds = ds.set_index(time=["year", "dayofyear", "hour"]).unstack() 151 | 152 | # Fill empty values for non-leap years 153 | ds = ds.ffill(dim="dayofyear", limit=1) 154 | 155 | # Calculate climatology 156 | window_size = 61 157 | window_weights = create_window_weights(window_size) 158 | half_window_size = window_size // 2 159 | ds = ds.pad(pad_width={"dayofyear": half_window_size}, mode="wrap") 160 | ds = ds.rolling(dayofyear=window_size, center=True).construct("window") 161 | ds = ds.weighted(window_weights).mean(dim=("window", "year")) 162 | ds = ds.isel(dayofyear=slice(half_window_size, -half_window_size)) 163 | 164 | pancake_chunks = { 165 | "hour": 1, 166 | "dayofyear": 1, 167 | "latitude": original_chunks["latitude"], 168 | "longitude": original_chunks["longitude"], 169 | } 170 | result = ds.chunk(pancake_chunks) 171 | return result.to_zarr(storage_url, storage_options=storage_options, compute=False) 172 | --------------------------------------------------------------------------------
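Both climatology workloads above lean on create_window_weights plus the rolling construct/weighted-mean pattern in compute_rolling_mean. A minimal standalone sketch of that pattern follows; the toy data and the window size of 5 are illustrative assumptions, not values used by the workloads, which use a 61-day window over real ERA5 data.

import numpy as np
import xarray as xr

# Same construction as create_window_weights(5) above: a linear ramp up to the
# center and back down, normalized so the weights' mean is 1.
half = 5 // 2
w = np.concatenate([np.linspace(0, 1, half + 1), np.linspace(1, 0, half + 1)[1:]])
weights = xr.DataArray(w / w.mean(), dims=["window"])  # [0.0, 1.25, 2.5, 1.25, 0.0]

# Apply the same rolling-window + weighted-mean pattern used in
# compute_rolling_mean, here along a tiny toy "dayofyear" axis.
data = xr.DataArray(np.arange(10.0), dims=["dayofyear"])
rolled = data.rolling(dayofyear=5, center=True).construct("window")
smoothed = rolled.weighted(weights).mean(dim="window")
print(smoothed.values)

Because the weights are symmetric around the center, interior points of a linear series are unchanged by the smoothing; the 61-day window in the workloads simply widens the same ramp.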