├── dask ├── py.typed ├── tests │ ├── __init__.py │ ├── warning_aliases.py │ ├── test_compatibility.py │ ├── test_ml.py │ ├── test_backends.py │ ├── test_docs.py │ ├── test_hashing.py │ ├── test_datasets.py │ ├── test_ci.py │ ├── test_context.py │ ├── test_system.py │ ├── test_utils_test.py │ ├── test_cache.py │ └── test_callbacks.py ├── array │ ├── tests │ │ ├── __init__.py │ │ ├── test_testing.py │ │ ├── test_cupy_gufunc.py │ │ ├── test_numpy_compat.py │ │ ├── test_xarray.py │ │ ├── test_image.py │ │ ├── test_cupy_reductions.py │ │ ├── test_wrap.py │ │ ├── test_svg.py │ │ └── test_cupy_percentile.py │ ├── lib │ │ ├── __init__.py │ │ └── stride_tricks.py │ ├── dispatch.py │ ├── NUMPY_LICENSE.txt │ ├── image.py │ └── cupy_entry_point.py ├── bag │ ├── tests │ │ └── __init__.py │ ├── utils.py │ ├── chunk.py │ └── __init__.py ├── bytes │ ├── tests │ │ ├── __init__.py │ │ └── test_compression.py │ ├── __init__.py │ └── utils.py ├── dataframe │ ├── tests │ │ ├── __init__.py │ │ ├── test_methods.py │ │ ├── test_boolean.py │ │ ├── test_optimize_dataframe.py │ │ ├── test_extensions.py │ │ ├── test_numeric.py │ │ ├── test_hashing.py │ │ └── test_hyperloglog.py │ ├── io │ │ ├── tests │ │ │ └── __init__.py │ │ ├── orc │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ ├── parquet │ │ │ └── __init__.py │ │ └── __init__.py │ ├── tseries │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── extensions.py │ ├── numeric.py │ ├── __init__.py │ ├── _pyarrow_compat.py │ ├── _dtypes.py │ └── hyperloglog.py ├── diagnostics │ ├── tests │ │ └── __init__.py │ └── __init__.py ├── widgets │ ├── tests │ │ ├── templates │ │ │ ├── example.html.j2 │ │ │ ├── bytes.html.j2 │ │ │ └── custom_filter.html.j2 │ │ └── test_widgets.py │ ├── templates │ │ ├── dataframe.html.j2 │ │ ├── array.html.j2 │ │ ├── highlevelgraph_layer.html.j2 │ │ └── highlevelgraph.html.j2 │ ├── __init__.py │ └── widgets.py ├── __main__.py ├── ml.py ├── __init__.py ├── compatibility.py ├── _compatibility.py ├── distributed.py ├── system.py ├── context.py ├── cache.py ├── dask.yaml └── hashing.py ├── docs ├── source │ ├── daskcheatsheet.pdf │ ├── images │ │ ├── reshape.png │ │ ├── gputester-msg.png │ │ ├── merge_chunks.png │ │ ├── order-failure.png │ │ ├── order-success.png │ │ ├── scaling-edges.png │ │ ├── scaling-nodes.png │ │ ├── simple-dask.png │ │ ├── dashboard_link.png │ │ ├── reshape_problem.png │ │ ├── HHMI_Janelia_Color.png │ │ ├── async-embarrassing.gif │ │ ├── dashboard_memory.png │ │ ├── dashboard_progress.png │ │ ├── dashboard_status.png │ │ ├── merge_chunks_false.png │ │ ├── reshape_rechunked.png │ │ ├── 10_minutes_bag_graph.png │ │ ├── dashboard_jupyterlab.png │ │ ├── dashboard_memory_new.gif │ │ ├── growth_of_languages.png │ │ ├── growth_of_libraries.png │ │ ├── map_blocks_drop_axis.png │ │ ├── 10_minutes_array_graph.png │ │ ├── transpose-hlg-html-repr.png │ │ ├── dashboard_task_processing.png │ │ ├── 10_minutes_dataframe_graph.png │ │ ├── concurrent-futures-threaded.webp │ │ ├── dashboard_taskstream_healthy.png │ │ ├── transpose-hlg-hovertooltip.png │ │ ├── dashboard_task_stream_unhealthy.png │ │ ├── dask_icon_black.svg │ │ ├── dask_icon.svg │ │ ├── dask_icon_on_pink.svg │ │ ├── dask_icon_white.svg │ │ ├── unoverlapping-neighbors.svg │ │ ├── optimize_dask5.svg │ │ └── dask_horizontal.svg │ ├── _static │ │ ├── dask-simple.png │ │ ├── theme_overrides.css │ │ ├── style.css │ │ └── main-page.css │ ├── _templates │ │ └── layout.html │ ├── cheatsheet.rst │ ├── internals.rst │ ├── debugging-performance.rst │ ├── how-to │ │ ├── index.rst │ │ 
├── setup-prometheus.rst │ │ └── extend-sizeof.rst │ ├── logos.rst │ ├── dashboard-progress-script.py │ ├── array-stats.rst │ ├── delayed-collections.rst │ ├── deploying-ssh.rst │ ├── delayed-api.rst │ ├── understanding-performance.rst │ ├── bag-api.rst │ ├── array-stack.rst │ ├── graph_manipulation.rst │ ├── deploying-docker.rst │ ├── array-gufunc.rst │ ├── deploying-cloud.rst │ ├── deploying-python.rst │ └── array-assignment.rst ├── requirements-docs.txt └── README.rst ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── CONTRIBUTING.md ├── dependabot.yml ├── workflows │ ├── label-prs.yml │ ├── label-all.yml │ ├── pre-commit.yml │ ├── stale-bot.yaml │ ├── additional.yml │ ├── conda.yml │ ├── upstream.yml │ └── update-gpuci.yml ├── labeler.yml └── release.yml ├── continuous_integration ├── gpuci │ ├── axis.yaml │ └── build.sh ├── scripts │ ├── run_tests.sh │ ├── test_imports.sh │ └── install.sh ├── environment-mindeps-non-optional.yaml ├── environment-mindeps-array.yaml ├── environment-mindeps-dataframe.yaml ├── environment-mindeps-distributed.yaml ├── recipe │ └── meta.yaml ├── environment-mindeps-optional.yaml ├── environment-3.9.yaml ├── environment-3.10.yaml └── environment-3.11.yaml ├── setup.py ├── CONTRIBUTING.md ├── .readthedocs.yaml ├── MANIFEST.in ├── .gitignore ├── .git-blame-ignore-revs ├── codecov.yml ├── .flake8 ├── README.rst ├── LICENSE.txt ├── .pre-commit-config.yaml └── conftest.py /dask/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/array/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/bag/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/bytes/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/io/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tseries/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/diagnostics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tseries/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/example.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | Hello {{ foo }}! 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/bytes.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | {{ foo | format_bytes }} 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/custom_filter.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | {{ foo | custom_filter }} 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.bytes.core import read_bytes 4 | -------------------------------------------------------------------------------- /docs/source/daskcheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/daskcheatsheet.pdf -------------------------------------------------------------------------------- /docs/source/images/reshape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape.png -------------------------------------------------------------------------------- /dask/array/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.array.lib import stride_tricks 4 | -------------------------------------------------------------------------------- /docs/source/_static/dask-simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/_static/dask-simple.png -------------------------------------------------------------------------------- /docs/source/images/gputester-msg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/gputester-msg.png -------------------------------------------------------------------------------- /docs/source/images/merge_chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks.png -------------------------------------------------------------------------------- /docs/source/images/order-failure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-failure.png -------------------------------------------------------------------------------- /docs/source/images/order-success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-success.png -------------------------------------------------------------------------------- /docs/source/images/scaling-edges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-edges.png -------------------------------------------------------------------------------- /docs/source/images/scaling-nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-nodes.png -------------------------------------------------------------------------------- /docs/source/images/simple-dask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/simple-dask.png -------------------------------------------------------------------------------- 
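The dask/array/lib/__init__.py module listed above only re-exports stride_tricks, whose sliding_window_view mirrors numpy.lib.stride_tricks.sliding_window_view but returns a lazy dask array. A minimal usage sketch; the input values and window size are arbitrary illustrative choices, and da.arange/np.testing come from the wider NumPy/Dask APIs rather than from this listing:

    import numpy as np
    import dask.array as da
    from dask.array.lib.stride_tricks import sliding_window_view

    x = da.arange(10, chunks=4)             # lazy 1-D array
    windows = sliding_window_view(x, 3)     # lazy array of shape (8, 3)

    expected = np.lib.stride_tricks.sliding_window_view(np.arange(10), 3)
    np.testing.assert_array_equal(windows.compute(), expected)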
/docs/source/images/dashboard_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_link.png -------------------------------------------------------------------------------- /docs/source/images/reshape_problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_problem.png -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | - [ ] Closes #xxxx 2 | - [ ] Tests added / passed 3 | - [ ] Passes `pre-commit run --all-files` 4 | -------------------------------------------------------------------------------- /docs/source/images/HHMI_Janelia_Color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/HHMI_Janelia_Color.png -------------------------------------------------------------------------------- /docs/source/images/async-embarrassing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/async-embarrassing.gif -------------------------------------------------------------------------------- /docs/source/images/dashboard_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_progress.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_status.png -------------------------------------------------------------------------------- /docs/source/images/merge_chunks_false.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks_false.png -------------------------------------------------------------------------------- /docs/source/images/reshape_rechunked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_rechunked.png -------------------------------------------------------------------------------- /dask/dataframe/io/orc/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io.orc.core import read_orc, to_orc 4 | -------------------------------------------------------------------------------- /docs/source/images/10_minutes_bag_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_bag_graph.png 
-------------------------------------------------------------------------------- /docs/source/images/dashboard_jupyterlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_jupyterlab.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_memory_new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory_new.gif -------------------------------------------------------------------------------- /docs/source/images/growth_of_languages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_languages.png -------------------------------------------------------------------------------- /docs/source/images/growth_of_libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_libraries.png -------------------------------------------------------------------------------- /docs/source/images/map_blocks_drop_axis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/map_blocks_drop_axis.png -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | See [developer documentation](https://docs.dask.org/en/latest/develop.html) 2 | for tips on how to get started. 
3 | -------------------------------------------------------------------------------- /docs/source/images/10_minutes_array_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_array_graph.png -------------------------------------------------------------------------------- /docs/source/images/transpose-hlg-html-repr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-html-repr.png -------------------------------------------------------------------------------- /dask/array/lib/stride_tricks.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.array.overlap import sliding_window_view # noqa: F401 4 | -------------------------------------------------------------------------------- /docs/source/images/dashboard_task_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_processing.png -------------------------------------------------------------------------------- /docs/source/images/10_minutes_dataframe_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_dataframe_graph.png -------------------------------------------------------------------------------- /docs/source/images/concurrent-futures-threaded.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/concurrent-futures-threaded.webp -------------------------------------------------------------------------------- /docs/source/images/dashboard_taskstream_healthy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_taskstream_healthy.png -------------------------------------------------------------------------------- /docs/source/images/transpose-hlg-hovertooltip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-hovertooltip.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_task_stream_unhealthy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_stream_unhealthy.png -------------------------------------------------------------------------------- /dask/widgets/templates/dataframe.html.j2: -------------------------------------------------------------------------------- 1 |
Dask DataFrame Structure:
2 | {{ data }} 3 |
Dask Name: {{ name | key_split }}, {{ layers }}
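This dataframe.html.j2 template relies on the key_split and format_bytes Jinja filters that dask.widgets registers when building its template environment (see dask/widgets/widgets.py, not included in this excerpt). As a rough standalone sketch, the same two filters can be wired into a plain Jinja2 environment with the public dask.utils helpers; the template string and example values below are illustrative only:

    import jinja2
    from dask.utils import format_bytes, key_split

    env = jinja2.Environment()
    env.filters["format_bytes"] = format_bytes
    env.filters["key_split"] = key_split

    template = env.from_string(
        "Dask Name: {{ name | key_split }}, {{ nbytes | format_bytes }}"
    )
    print(template.render(name="sum-aggregate-1234", nbytes=2**20))
    # roughly: "Dask Name: sum-aggregate, 1.00 MiB" (exact formatting may vary by version)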
4 | -------------------------------------------------------------------------------- /dask/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.cli import run_cli 4 | 5 | 6 | def main(): 7 | run_cli() 8 | 9 | 10 | if __name__ == "__main__": 11 | main() 12 | -------------------------------------------------------------------------------- /continuous_integration/gpuci/axis.yaml: -------------------------------------------------------------------------------- 1 | PYTHON_VER: 2 | - "3.9" 3 | - "3.10" 4 | 5 | CUDA_VER: 6 | - "11.5" 7 | 8 | LINUX_VER: 9 | - ubuntu18.04 10 | 11 | RAPIDS_VER: 12 | - "23.10" 13 | 14 | excludes: 15 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% set css_files = css_files + ["_static/style.css"] %} 3 | {% set script_files = script_files + ["_static/yaml.min.js", "_static/config_converter.js"] %} 4 | -------------------------------------------------------------------------------- /dask/dataframe/io/parquet/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io.parquet.core import ( 4 | create_metadata_file, 5 | read_parquet, 6 | read_parquet_part, 7 | to_parquet, 8 | ) 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import annotations 4 | 5 | import versioneer 6 | from setuptools import setup 7 | 8 | setup( 9 | version=versioneer.get_version(), 10 | cmdclass=versioneer.get_cmdclass(), 11 | ) 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html. 4 | -------------------------------------------------------------------------------- /docs/source/cheatsheet.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Dask Cheat Sheet 4 | ================ 5 | 6 | The 300KB pdf :download:`Dask cheat sheet ` 7 | is a single page summary about using Dask. 8 | It is commonly distributed at conferences and trade shows. 
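dask/dataframe/io/parquet/__init__.py, listed a little earlier, re-exports read_parquet and to_parquet, which form the usual round-trip API. A minimal sketch, assuming a Parquet engine such as pyarrow or fastparquet is installed; the output directory name is a hypothetical placeholder:

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"x": range(8), "y": list("abcdefgh")})
    ddf = dd.from_pandas(pdf, npartitions=2)

    ddf.to_parquet("example_parquet/")           # hypothetical local path
    roundtrip = dd.read_parquet("example_parquet/")
    assert roundtrip.x.sum().compute() == pdf.x.sum()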
9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Set update schedule for GitHub Actions 2 | 3 | version: 2 4 | updates: 5 | - package-ecosystem: "github-actions" 6 | directory: "/" 7 | schedule: 8 | # Check for updates to GitHub Actions every weekday 9 | interval: "weekly" 10 | -------------------------------------------------------------------------------- /dask/tests/warning_aliases.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from sqlalchemy.exc import RemovedIn20Warning 5 | except ImportError: 6 | 7 | class _RemovedIn20Warning(Warning): 8 | pass 9 | 10 | RemovedIn20Warning = _RemovedIn20Warning 11 | -------------------------------------------------------------------------------- /.github/workflows/label-prs.yml: -------------------------------------------------------------------------------- 1 | name: "PR Labeler" 2 | on: 3 | - pull_request_target 4 | 5 | jobs: 6 | label: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/labeler@main 10 | with: 11 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 12 | sync-labels: false 13 | -------------------------------------------------------------------------------- /dask/bag/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def assert_eq(a, b, scheduler="sync"): 5 | if hasattr(a, "compute"): 6 | a = a.compute(scheduler=scheduler) 7 | if hasattr(b, "compute"): 8 | b = b.compute(scheduler=scheduler) 9 | 10 | assert a == b 11 | -------------------------------------------------------------------------------- /dask/diagnostics/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.callbacks import Callback 4 | from dask.diagnostics.profile import CacheProfiler, Profiler, ResourceProfiler 5 | from dask.diagnostics.profile_visualize import visualize 6 | from dask.diagnostics.progress import ProgressBar 7 | -------------------------------------------------------------------------------- /.github/workflows/label-all.yml: -------------------------------------------------------------------------------- 1 | name: "Issue and PR Labeler" 2 | on: 3 | pull_request: 4 | types: [opened] 5 | issues: 6 | types: [opened, reopened] 7 | jobs: 8 | label-all-on-open: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: andymckay/labeler@1.0.4 12 | with: 13 | add-labels: "needs triage" 14 | ignore-if-labeled: false 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | version: 2 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | fail_on_warning: true 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements-docs.txt 15 | - method: pip 16 | path: . 
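The dask/diagnostics/__init__.py entry above exposes the local-scheduler diagnostics (ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize). A small sketch of the two that need no optional dependencies; the array shape and chunking are arbitrary example values, and the exact fields of each profiler record may differ between versions:

    import dask.array as da
    from dask.diagnostics import ProgressBar, Profiler

    x = da.random.random((2000, 2000), chunks=(500, 500))

    with ProgressBar(), Profiler() as prof:
        x.dot(x.T).mean().compute()

    # prof.results holds one timing record per executed task
    print(prof.results[:3])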
17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask *.py 2 | recursive-include dask *.j2 3 | recursive-include docs/source * 4 | include docs/Makefile docs/make.bat 5 | 6 | include setup.py 7 | include README.rst 8 | include MANIFEST.in 9 | include dask/dask.yaml 10 | include dask/dask-schema.yaml 11 | include dask/py.typed 12 | 13 | include versioneer.py 14 | include dask/_version.py 15 | 16 | include conftest.py 17 | -------------------------------------------------------------------------------- /continuous_integration/scripts/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $PARALLEL == 'true' ]]; then 6 | export XTRATESTARGS="-n4 $XTRATESTARGS" 7 | fi 8 | 9 | if [[ $COVERAGE == 'true' ]]; then 10 | export XTRATESTARGS="--cov=dask --cov-report=xml $XTRATESTARGS" 11 | fi 12 | 13 | echo "py.test dask --runslow $XTRATESTARGS" 14 | py.test dask --runslow $XTRATESTARGS 15 | 16 | set +e 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .hypothesis 2 | *.py[cod] 3 | __pycache__/ 4 | *.egg-info 5 | .mypy_cache 6 | dask-worker-space/ 7 | docs/build 8 | docs/source/generated 9 | build/ 10 | dist/ 11 | .idea/ 12 | log.* 13 | log 14 | .pytest_cache/ 15 | .coverage 16 | .coverage.* 17 | coverage.xml 18 | .DS_Store 19 | *.sqlite 20 | *.swp 21 | *.swo 22 | .cache/ 23 | hdfs-initialized-indicator 24 | .ipynb_checkpoints 25 | .vscode/ 26 | .history 27 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: main 6 | pull_request: 7 | branches: main 8 | 9 | jobs: 10 | checks: 11 | name: pre-commit hooks 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3.5.3 15 | - uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.9' 18 | - uses: pre-commit/action@v3.0.0 19 | -------------------------------------------------------------------------------- /docs/source/internals.rst: -------------------------------------------------------------------------------- 1 | Dask Internals 2 | ============== 3 | 4 | This section is intended for contributors and power users who are interested in 5 | learning more about how Dask works internally. 6 | 7 | .. 
toctree:: 8 | :maxdepth: 1 9 | 10 | user-interfaces.rst 11 | understanding-performance.rst 12 | phases-of-computation.rst 13 | order.rst 14 | caching.rst 15 | shared.rst 16 | scheduling-policy.rst 17 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx>=4.0.0 3 | dask-sphinx-theme>=3.0.0 4 | sphinx-click 5 | sphinx-copybutton 6 | sphinx-remove-toctrees 7 | sphinx_autosummary_accessors 8 | sphinx-tabs 9 | sphinx-design 10 | jupyter_sphinx 11 | toolz 12 | cloudpickle>=1.5.0 13 | pandas>=1.4.0 14 | git+https://github.com/dask/distributed 15 | fsspec 16 | scipy 17 | pytest 18 | pytest-check-links 19 | requests-cache 20 | ipython 21 | ipykernel<6.22.0 22 | -------------------------------------------------------------------------------- /docs/source/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | @media screen and (min-width: 767px) { 3 | 4 | .wy-table-responsive table td { 5 | /* !important prevents the common CSS stylesheets from overriding 6 | this as on RTD they are loaded after this stylesheet */ 7 | white-space: normal !important; 8 | } 9 | 10 | .wy-table-responsive { 11 | overflow: visible !important; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /docs/source/debugging-performance.rst: -------------------------------------------------------------------------------- 1 | Debugging and Performance 2 | ========================== 3 | 4 | This section contains resources to help you debug and understand performance. 5 | 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | how-to/debug.rst 11 | Visualize task graphs 12 | Dashboard 13 | diagnostics-local.rst 14 | diagnostics-distributed.rst 15 | Phases of computation 16 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | dataframe: 2 | - dask/dataframe/* 3 | - dask/dataframe/**/* 4 | 5 | array: 6 | - dask/array/* 7 | - dask/array/**/* 8 | 9 | io: 10 | - dask/dataframe/io/* 11 | - dask/dataframe/io/**/* 12 | 13 | documentation: 14 | - docs/* 15 | - docs/**/* 16 | 17 | dispatch: 18 | - dask/array/backends.py 19 | - dask/array/dispatch.py 20 | - dask/dataframe/backends.py 21 | - dask/dataframe/dispatch.py 22 | - dask/dataframe/extensions.py 23 | -------------------------------------------------------------------------------- /dask/tests/test_compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from dask._compatibility import entry_points 6 | 7 | 8 | def test_deprecation(): 9 | with pytest.warns(DeprecationWarning): 10 | from dask.compatibility import _EMSCRIPTEN # noqa 11 | 12 | 13 | def test_entry_points(): 14 | with pytest.warns(DeprecationWarning): 15 | assert "pytest" in [ep.name for ep in entry_points(group="console_scripts")] 16 | -------------------------------------------------------------------------------- /dask/tests/test_ml.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def test_basic(): 5 | try: 6 | import dask_ml # noqa: F401 7 | except ImportError: 8 | try: 9 | from 
dask.ml.model_selection import GridSearchCV # noqa: F401 10 | except ImportError as e: 11 | assert "conda install dask-ml" in str(e) 12 | else: 13 | assert False 14 | else: 15 | from dask.ml.model_selection import GridSearchCV # noqa: F401 16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-non-optional.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # test dependencies 16 | - pre-commit 17 | - pytest 18 | - pytest-cov 19 | - pytest-rerunfailures 20 | - pytest-xdist 21 | -------------------------------------------------------------------------------- /docs/source/how-to/index.rst: -------------------------------------------------------------------------------- 1 | How To... 2 | ========= 3 | 4 | This section contains snippets and suggestions about how to perform different actions 5 | using Dask. If you have an idea of a how-to that we should add, please 6 | `make a suggestion `_! 7 | 8 | .. Articles in this section should be short and not contain much explanation. 9 | 10 | .. toctree:: 11 | :caption: How To... 12 | :maxdepth: 1 13 | :glob: 14 | 15 | * 16 | Use GPUs <../gpu.rst> 17 | -------------------------------------------------------------------------------- /dask/ml.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def __getattr__(value): 5 | try: 6 | import dask_ml 7 | except ImportError as e: 8 | msg = ( 9 | "Dask-ML is not installed.\n\n" 10 | "Please either conda or pip install dask-ml:\n\n" 11 | " conda install dask-ml # either conda install\n" 12 | " python -m pip install dask-ml --upgrade # or pip install" 13 | ) 14 | raise ImportError(msg) from e 15 | return getattr(dask_ml, value) 16 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_methods.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import dask.dataframe.methods as methods 7 | from dask.dataframe._compat import PANDAS_GE_140 8 | 9 | 10 | def test_assign_not_modifying_array_inplace(): 11 | df = pd.DataFrame({"a": [1, 2, 3], "b": 1.5}) 12 | result = methods.assign(df, "a", 5) 13 | assert not np.shares_memory(df["a"].values, result["a"].values) 14 | if PANDAS_GE_140: 15 | assert np.shares_memory(df["b"].values, result["b"].values) 16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-array.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[array] 16 | - numpy=1.21 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - 
pytest-xdist 23 | -------------------------------------------------------------------------------- /dask/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask import config, datasets 4 | from dask._version import get_versions 5 | from dask.base import ( 6 | annotate, 7 | compute, 8 | get_annotations, 9 | is_dask_collection, 10 | optimize, 11 | persist, 12 | visualize, 13 | ) 14 | from dask.core import istask 15 | from dask.delayed import delayed 16 | from dask.local import get_sync as get 17 | 18 | versions = get_versions() 19 | __version__ = versions["version"] 20 | __git_revision__ = versions["full-revisionid"] 21 | del get_versions, versions 22 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # .github/release.yml 2 | 3 | changelog: 4 | categories: 5 | - title: New Features 6 | labels: 7 | - feature 8 | - title: Enhancements 9 | labels: 10 | - enhancement 11 | - title: Bug Fixes 12 | labels: 13 | - bug 14 | - title: Deprecations 15 | labels: 16 | - deprecation 17 | - title: Documentation 18 | labels: 19 | - documentation 20 | - title: Maintenance 21 | labels: 22 | - tests 23 | - hygiene 24 | - title: Misc 25 | labels: 26 | - "*" 27 | -------------------------------------------------------------------------------- /dask/compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | from dask._compatibility import EMSCRIPTEN as _EMSCRIPTEN # noqa 6 | from dask._compatibility import PY_VERSION as _PY_VERSION # noqa 7 | from dask._compatibility import entry_points, parse_version # noqa 8 | 9 | warnings.warn( 10 | "`dask.compatibility` is not intended for external use and has been renamed to `dask._compatibility`. " 11 | "This backward-compatible shim will be removed in a future release. Please find an alternative.", 12 | DeprecationWarning, 13 | stacklevel=2, 14 | ) 15 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | To build a local copy of the Dask documentation, install the packages in 2 | ``requirements-docs.txt`` and run ``make html``. 3 | 4 | Optionally create and activate a ``conda`` environment first:: 5 | 6 | conda create -n daskdocs -c conda-forge python=3.11 7 | conda activate daskdocs 8 | 9 | Install the dependencies with ``pip``:: 10 | 11 | python -m pip install -r requirements-docs.txt 12 | 13 | After running ``make html`` the generated HTML documentation can be found in 14 | the ``build/html`` directory. Open ``build/html/index.html`` to view the home 15 | page for the documentation. 
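The top-level dask/__init__.py shown earlier re-exports the core public API (delayed, compute, persist, visualize, annotate, ...). A quick sketch of the delayed/compute pair; the toy functions here are purely illustrative:

    import dask
    from dask import delayed

    @delayed
    def inc(x):
        return x + 1

    total = delayed(sum)([inc(i) for i in range(5)])

    # compute() returns a tuple with one element per collection passed in
    assert dask.compute(total) == (15,)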
16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-dataframe.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[dataframe] 16 | - numpy=1.21 17 | - pandas=1.3 18 | # test dependencies 19 | - pre-commit 20 | - pytest 21 | - pytest-cov 22 | - pytest-rerunfailures 23 | - pytest-xdist 24 | -------------------------------------------------------------------------------- /dask/dataframe/io/orc/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | class ORCEngine: 5 | """The API necessary to provide a new ORC reader/writer""" 6 | 7 | @classmethod 8 | def read_metadata( 9 | cls, fs, paths, columns, index, split_stripes, aggregate_files, **kwargs 10 | ): 11 | raise NotImplementedError() 12 | 13 | @classmethod 14 | def read_partition(cls, fs, part, columns, **kwargs): 15 | raise NotImplementedError() 16 | 17 | @classmethod 18 | def write_partition(cls, df, path, fs, filename, **kwargs): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /dask/array/dispatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dispatch in dask.array. 3 | 4 | Also see backends.py 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from dask.utils import Dispatch 10 | 11 | concatenate_lookup = Dispatch("concatenate") 12 | tensordot_lookup = Dispatch("tensordot") 13 | einsum_lookup = Dispatch("einsum") 14 | empty_lookup = Dispatch("empty") 15 | divide_lookup = Dispatch("divide") 16 | percentile_lookup = Dispatch("percentile") 17 | numel_lookup = Dispatch("numel") 18 | nannumel_lookup = Dispatch("nannumel") 19 | to_numpy_dispatch = Dispatch("to_numpy_dispatch") 20 | to_cupy_dispatch = Dispatch("to_cupy_dispatch") 21 | -------------------------------------------------------------------------------- /dask/bytes/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import bz2 4 | import gzip 5 | import io 6 | import lzma 7 | import zipfile 8 | 9 | 10 | def zip_compress(data): 11 | """Write data into zipfile and return the bytes""" 12 | out = io.BytesIO() 13 | with zipfile.ZipFile(file=out, mode="w") as z: 14 | with z.open("myfile", "w") as zf: 15 | zf.write(data) 16 | out.seek(0) 17 | return out.read() 18 | 19 | 20 | compress = { 21 | "gzip": gzip.compress, 22 | "bz2": bz2.compress, 23 | None: lambda x: x, 24 | "xz": lzma.compress, 25 | "zip": zip_compress, 26 | } 27 | -------------------------------------------------------------------------------- /dask/array/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import dask.array as da 9 | from dask.array.utils import assert_eq 10 | 11 | 12 | @pytest.mark.skipif(bool(sys.flags.optimize), reason="Assertions disabled.") 13 | def test_assert_eq_checks_scalars(): 14 | # 
https://github.com/dask/dask/issues/2680 15 | with pytest.raises(AssertionError): 16 | assert_eq(np.array(0), np.array(1)) 17 | 18 | a = da.from_array(np.array([0]), 1)[0] 19 | b = np.array([1])[0] 20 | with pytest.raises(AssertionError): 21 | assert_eq(a, b) 22 | -------------------------------------------------------------------------------- /docs/source/_static/style.css: -------------------------------------------------------------------------------- 1 | .configTextArea { 2 | font-family: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace; 3 | margin-bottom: 24px; 4 | } 5 | 6 | .classifier::before { 7 | content: ": "; 8 | } 9 | 10 | /* options for jupyter-sphinx extension */ 11 | div.jupyter_container { 12 | box-shadow: None; 13 | font-family: var(--pst-font-family-monospace); 14 | border-radius: 0.4em; 15 | } 16 | 17 | .jupyter_container div.code_cell { 18 | padding: 10px; 19 | max-width: None !important; 20 | } 21 | 22 | .jupyter_container .output { 23 | font-size: 16px; 24 | padding: 10px 25 | } 26 | -------------------------------------------------------------------------------- /dask/bytes/tests/test_compression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from io import BytesIO 4 | 5 | import pytest 6 | from fsspec.compression import compr 7 | 8 | from dask.bytes.utils import compress 9 | 10 | 11 | @pytest.mark.parametrize("fmt,File", compr.items()) 12 | def test_files(fmt, File): 13 | if fmt not in compress: 14 | pytest.skip("compression function not provided") 15 | if fmt is None: 16 | return 17 | data = b"1234" * 1000 18 | compressed = compress[fmt](data) 19 | 20 | b = BytesIO(compressed) 21 | g = File(b, mode="rb") 22 | data2 = g.read() 23 | g.close() 24 | assert data == data2 25 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-distributed.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[distributed] 16 | - pip 17 | - pip: 18 | - git+https://github.com/dask/distributed 19 | # test dependencies 20 | - pre-commit 21 | - pytest 22 | - pytest-cov 23 | - pytest-rerunfailures 24 | - pytest-timeout 25 | - pytest-xdist 26 | -------------------------------------------------------------------------------- /dask/dataframe/extensions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support for pandas ExtensionArray in dask.dataframe. 3 | 4 | See :ref:`extensionarrays` for more. 
5 | """ 6 | from __future__ import annotations 7 | 8 | from dask.dataframe.accessor import ( 9 | register_dataframe_accessor, 10 | register_index_accessor, 11 | register_series_accessor, 12 | ) 13 | from dask.utils import Dispatch 14 | 15 | make_array_nonempty = Dispatch("make_array_nonempty") 16 | make_scalar = Dispatch("make_scalar") 17 | 18 | 19 | __all__ = [ 20 | "make_array_nonempty", 21 | "make_scalar", 22 | "register_dataframe_accessor", 23 | "register_index_accessor", 24 | "register_series_accessor", 25 | ] 26 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_gufunc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | pytestmark = pytest.mark.gpu 7 | 8 | import dask.array as da 9 | from dask.array.gufunc import apply_gufunc 10 | from dask.array.utils import assert_eq 11 | 12 | cupy = pytest.importorskip("cupy") 13 | 14 | 15 | def test_apply_gufunc_axis(): 16 | def mydiff(x): 17 | return np.diff(x) 18 | 19 | a = cupy.random.default_rng().standard_normal((3, 6, 4)) 20 | da_ = da.from_array(a, chunks=2, asarray=False) 21 | 22 | m = np.diff(a, axis=1) 23 | dm = apply_gufunc( 24 | mydiff, "(i)->(i)", da_, axis=1, output_sizes={"i": 5}, allow_rechunk=True 25 | ) 26 | assert_eq(m, dm) 27 | -------------------------------------------------------------------------------- /dask/_compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import warnings 5 | 6 | from importlib_metadata import entry_points as _entry_points 7 | from packaging.version import parse as parse_version 8 | 9 | PY_VERSION = parse_version(".".join(map(str, sys.version_info[:3]))) 10 | 11 | EMSCRIPTEN = sys.platform == "emscripten" 12 | 13 | 14 | def entry_points(group=None): 15 | warnings.warn( 16 | "`dask._compatibility.entry_points` has been replaced by `importlib_metadata.entry_points` and will be removed " 17 | "in a future version. 
Please use `importlib_metadata.entry_points` instead.", 18 | DeprecationWarning, 19 | stacklevel=2, 20 | ) 21 | return _entry_points(group=group) 22 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # .git-blame-ignore-revs 2 | # absolufy-imports - No relative - PEP8 (#8796) 3 | cccb9d8d8e33a891396b1275c2448c352ef40c27 4 | 5 | # Update `pre-commit` version (#8691) 6 | 510bbc380531cbf56a409f1ae68e6fd84a9599e6 7 | 8 | # Run pyupgrade in CI (#8246) 9 | 80a82008d5b02a08f6ff59d802defcc43247eb1a 10 | 11 | # Bump pre-commit hook versions (#7676) 12 | d6bbbb08c92652eae2820e93edc2f3fe502391d3 13 | 14 | # Start adding isort (#7370) 15 | a31c0fc72e1cc59b8b0254965824abb0718c5f56 16 | 17 | # Rerun with latest black release (#6568) 18 | 64e2a9b3b9992503221a074a547827501927d1fa 19 | 20 | # LINT: Fixup black string normalization (#5227) 21 | d92f4015a1da3da10c04c682ed2acae8469e9576 22 | 23 | # Apply Black formatting (#4983) 24 | 7e4beffb339c69278091d4e305c2ae18ddf8c74f 25 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | # codecov pushes a failing status update to github actions before all the 4 | # test runs have completed (this is later updated to passing after more test 5 | # runs pass, but the initial red X is annoying). As far as I can tell from 6 | # https://docs.codecov.com/docs/merging-reports this shouldn't be happening, 7 | # but it is. Here we set a minimum number of builds before notifying in the 8 | # hopes that it will stop this behavior. 9 | notify: 10 | after_n_builds: 10 11 | 12 | coverage: 13 | precision: 2 14 | round: down 15 | range: "90...100" 16 | 17 | status: 18 | project: 19 | default: 20 | target: 90% 21 | threshold: 1% 22 | patch: no 23 | changes: no 24 | 25 | comment: off 26 | -------------------------------------------------------------------------------- /.github/workflows/stale-bot.yaml: -------------------------------------------------------------------------------- 1 | name: 'Label stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * 1' # runs once a week 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v8 11 | with: 12 | stale-issue-message: '' # no comment left if string is empty 13 | stale-pr-message: '' # no comment left if string is empty 14 | days-before-stale: 30 15 | days-before-close: -1 16 | stale-issue-label: 'needs attention' 17 | stale-pr-label: 'needs attention' 18 | exempt-issue-labels: 'good intro to dask,good first issue,Good First Issue,good second issue,feature request' 19 | exempt-draft-pr: true 20 | start-date: '2020-04-18T00:00:00Z' # ignore before this date, ISO 8601 or RFC 2822 21 | -------------------------------------------------------------------------------- /dask/distributed.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from __future__ import annotations 4 | 5 | _import_error_message = ( 6 | "dask.distributed is not installed.\n\n" 7 | "Please either conda or pip install distributed:\n\n" 8 | " conda install dask distributed # either conda install\n" 9 | ' python -m pip install "dask[distributed]" --upgrade # or pip install' 10 | ) 11 | 12 | try: 13 | from distributed import * 14 | except ImportError as e: 15 | if 
e.msg == "No module named 'distributed'": 16 | raise ImportError(_import_error_message) from e 17 | else: 18 | raise 19 | 20 | 21 | def __getattr__(value): 22 | try: 23 | import distributed 24 | except ImportError as e: 25 | raise ImportError(_import_error_message) from e 26 | return getattr(distributed, value) 27 | -------------------------------------------------------------------------------- /dask/dataframe/io/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io import demo 4 | from dask.dataframe.io.csv import read_csv, read_fwf, read_table, to_csv 5 | from dask.dataframe.io.hdf import read_hdf, to_hdf 6 | from dask.dataframe.io.io import ( 7 | from_array, 8 | from_dask_array, 9 | from_delayed, 10 | from_dict, 11 | from_map, 12 | from_pandas, 13 | to_backend, 14 | to_bag, 15 | to_records, 16 | ) 17 | from dask.dataframe.io.json import read_json, to_json 18 | from dask.dataframe.io.sql import read_sql, read_sql_query, read_sql_table, to_sql 19 | 20 | try: 21 | from dask.dataframe.io.parquet import read_parquet, to_parquet 22 | except ImportError: 23 | pass 24 | 25 | try: 26 | from dask.dataframe.io.orc import read_orc, to_orc 27 | except ImportError: 28 | pass 29 | -------------------------------------------------------------------------------- /dask/tests/test_backends.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | 8 | @pytest.mark.gpu 9 | @pytest.mark.parametrize("backend", ["pandas", "cudf"]) 10 | def test_CreationDispatch_error_informative_message(backend): 11 | # Check that an informative error is emitted when a backend dispatch 12 | # method fails 13 | pytest.importorskip(backend) 14 | dd = pytest.importorskip("dask.dataframe") 15 | data = {"a": [1, 2, 3, 4], "B": [10, 11, 12, 13]} 16 | with dask.config.set({"dataframe.backend": backend}): 17 | with pytest.raises(TypeError) as excinfo: 18 | dd.from_dict(data, npartitions=2, unsupported_kwarg=True) 19 | 20 | msg = str(excinfo.value) 21 | assert "error occurred while calling the from_dict method" in msg 22 | assert backend in msg 23 | -------------------------------------------------------------------------------- /dask/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from dask.widgets.widgets import ( 5 | FILTERS, 6 | TEMPLATE_PATHS, 7 | get_environment, 8 | get_template, 9 | ) 10 | 11 | except ImportError as e: 12 | msg = ( 13 | "Dask diagnostics requirements are not installed.\n\n" 14 | "Please either conda or pip install as follows:\n\n" 15 | " conda install dask # either conda install\n" 16 | ' python -m pip install "dask[diagnostics]" --upgrade # or python -m pip install' 17 | ) 18 | exception = e # Explicit reference for e as it will be lost outside the try block 19 | FILTERS = {} 20 | TEMPLATE_PATHS = [] 21 | 22 | def get_environment(): 23 | raise ImportError(msg) from exception 24 | 25 | def get_template(name: str): 26 | raise ImportError(msg) from exception 27 | -------------------------------------------------------------------------------- /dask/bag/chunk.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def barrier(*args): 5 | return None 6 | 7 | 8 | def getitem(x, key): 9 | """Like 
:func:`operator.getitem`, but allows setting key using partial 10 | ``partial(chunk.getitem, key=key) 11 | """ 12 | return x[key] 13 | 14 | 15 | def foldby_combine2(combine, acc, x): 16 | return combine(acc, x[1]) 17 | 18 | 19 | def groupby_tasks_group_hash(x, hash, grouper): 20 | return hash(grouper(x)), x 21 | 22 | 23 | def var_chunk(seq): 24 | squares, total, n = 0.0, 0.0, 0 25 | for x in seq: 26 | squares += x**2 27 | total += x 28 | n += 1 29 | return squares, total, n 30 | 31 | 32 | def var_aggregate(x, ddof): 33 | squares, totals, counts = list(zip(*x)) 34 | x2, x, n = float(sum(squares)), float(sum(totals)), sum(counts) 35 | result = (x2 / n) - (x / n) ** 2 36 | return result * n / (n - ddof) 37 | -------------------------------------------------------------------------------- /docs/source/how-to/setup-prometheus.rst: -------------------------------------------------------------------------------- 1 | .. When modifying the contents of this page, please adjust the corresponding page in the dask.distributed documentation accordingly. 2 | 3 | Setup Prometheus monitoring 4 | =========================== 5 | 6 | Prometheus_ is a widely popular tool for monitoring and alerting a wide variety of 7 | systems. A distributed cluster offers a number of Prometheus metrics if the 8 | prometheus_client_ package is installed. The metrics are exposed in Prometheus' 9 | text-based format at the ``/metrics`` endpoint on both schedulers and workers. 10 | 11 | 12 | Available metrics 13 | ----------------- 14 | 15 | Apart from the metrics exposed per default by the prometheus_client_, schedulers and 16 | workers expose a number of Dask-specific metrics. 17 | See the `dask.distributed documentation 18 | `_ for details. 19 | 20 | 21 | .. _Prometheus: https://prometheus.io 22 | .. _prometheus_client: https://github.com/prometheus/client_python 23 | -------------------------------------------------------------------------------- /dask/bag/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from dask.bag.avro import read_avro 5 | from dask.bag.core import Bag, Item 6 | from dask.bag.core import bag_map as map 7 | from dask.bag.core import bag_range as range 8 | from dask.bag.core import bag_zip as zip 9 | from dask.bag.core import ( 10 | concat, 11 | from_delayed, 12 | from_sequence, 13 | from_url, 14 | map_partitions, 15 | to_textfiles, 16 | ) 17 | from dask.bag.text import read_text 18 | from dask.bag.utils import assert_eq 19 | from dask.base import compute 20 | except ImportError as e: 21 | msg = ( 22 | "Dask bag requirements are not installed.\n\n" 23 | "Please either conda or pip install as follows:\n\n" 24 | " conda install dask # either conda install\n" 25 | ' python -m pip install "dask[bag]" --upgrade # or python -m pip install' 26 | ) 27 | raise ImportError(str(e) + "\n\n" + msg) from e 28 | -------------------------------------------------------------------------------- /docs/source/logos.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Images and Logos 4 | ================ 5 | 6 | Here are some commonly used Dask icons and logos 7 | (see the `Dask style guide `_ for more details). 8 | 9 | .. image:: images/dask_icon.svg 10 | :alt: Primary Dask icon. 11 | 12 | .. image:: images/dask_icon_black.svg 13 | :alt: Dask icon in black. 14 | 15 | .. image:: images/dask_icon_white.svg 16 | :alt: Dask icon in white. 17 | 18 | .. 
image:: images/dask_icon_on_pink.svg 19 | :alt: Dask icon to use on a pink background. 20 | 21 | .. image:: images/dask_horizontal.svg 22 | :alt: Primary Dask logo. 23 | 24 | .. image:: images/dask_horizontal_black.svg 25 | :alt: Dask logo in black. 26 | 27 | .. image:: images/dask_horizontal_white.svg 28 | :alt: Dask logo in white. 29 | 30 | .. image:: images/dask_horizontal_on_pink.svg 31 | :alt: Dask logo to use on a pink background. 32 | 33 | .. image:: images/dask_horizontal_on_blue.svg 34 | :alt: Dask logo to use on a blue background. 35 | -------------------------------------------------------------------------------- /dask/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | 8 | def test_development_guidelines_matches_ci(): 9 | """When the environment.yaml changes in CI, make sure to change it in the docs as well""" 10 | root_dir = Path(__file__).parent.parent.parent 11 | 12 | if not (root_dir / ".github" / "workflows").exists(): 13 | pytest.skip("Test can only be run on an editable install") 14 | 15 | development_doc_file = root_dir / "docs" / "source" / "develop.rst" 16 | additional_ci_file = root_dir / ".github" / "workflows" / "additional.yml" 17 | upstream_ci_file = root_dir / ".github" / "workflows" / "upstream.yml" 18 | latest_env = "environment-3.10.yaml" 19 | 20 | for filename in [development_doc_file, additional_ci_file, upstream_ci_file]: 21 | with open(filename, encoding="utf8") as f: 22 | assert any( 23 | latest_env in line for line in f 24 | ), f"{latest_env} not found in {filename}" 25 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_boolean.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | import dask.dataframe as dd 6 | 7 | 8 | def test_meta(): 9 | values = pd.array([True, False, None], dtype="boolean") 10 | ds = dd.from_pandas(pd.Series(values), 2) 11 | assert ds.dtype == pd.BooleanDtype() 12 | 13 | dd.utils.assert_eq(ds._meta_nonempty, pd.Series([True, pd.NA], dtype="boolean")) 14 | 15 | ddf = dd.from_pandas(pd.DataFrame({"A": values}), 2) 16 | assert ddf.dtypes["A"] == pd.BooleanDtype() 17 | 18 | dd.utils.assert_eq( 19 | ddf._meta_nonempty, 20 | pd.DataFrame({"A": pd.array([True, pd.NA], dtype="boolean")}), 21 | ) 22 | 23 | 24 | def test_ops(): 25 | s1 = pd.Series(pd.array([True, False, None] * 3, dtype="boolean")) 26 | s2 = pd.Series(pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")) 27 | 28 | ds1 = dd.from_pandas(s1, 2) 29 | ds2 = dd.from_pandas(s2, 2) 30 | 31 | dd.utils.assert_eq(ds1 | ds2, s1 | s2) 32 | dd.utils.assert_eq(ds1 & ds2, s1 & s2) 33 | dd.utils.assert_eq(ds1 ^ ds2, s1 ^ s2) 34 | -------------------------------------------------------------------------------- /docs/source/dashboard-progress-script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script was run to produce some of the screenshots on https://docs.dask.org/en/stable/dashboard.html 3 | """ 4 | from __future__ import annotations 5 | 6 | import time 7 | 8 | from dask import delayed 9 | from dask.distributed import Client, wait 10 | 11 | 12 | @delayed 13 | def inc(x): 14 | time.sleep(0.1) 15 | return x + 1 16 | 17 | 18 | @delayed 19 | def double(x): 20 | time.sleep(0.1) 21 | return 2 * x 22 | 23 | 24 | 
@delayed 25 | def add(x, y): 26 | time.sleep(0.1) 27 | return x + y 28 | 29 | 30 | if __name__ == "__main__": 31 | with Client(n_workers=4, threads_per_worker=2, memory_limit="4 GiB") as client: 32 | while True: 33 | data = list(range(1000)) 34 | output = [] 35 | for x in data: 36 | a = inc(x) 37 | b = double(x) 38 | c = add(a, b) 39 | output.append(c) 40 | 41 | total = delayed(sum)(output) 42 | total = total.persist() 43 | wait(total) 44 | time.sleep(5) 45 | del total 46 | time.sleep(2) 47 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | # References: 4 | # https://flake8.readthedocs.io/en/latest/user/configuration.html 5 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html 6 | # https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes 7 | exclude = __init__.py 8 | ignore = 9 | # Extra space in brackets 10 | E20 11 | # Multiple spaces around "," 12 | E231,E241 13 | # Comments 14 | E26 15 | # Import formatting 16 | E4 17 | # Comparing types instead of isinstance 18 | E721 19 | # Assigning lambda expression 20 | E731 21 | # Ambiguous variable names 22 | E741 23 | # Line break before binary operator 24 | W503 25 | # Line break after binary operator 26 | W504 27 | # Redefinition of unused 'loop' from line 10 28 | F811 29 | # No explicit stacklevel in warnings.warn. FIXME we should correct this in the code 30 | B028 31 | 32 | max-line-length = 120 33 | per-file-ignores = 34 | *_test.py: 35 | # Do not call assert False since python -O removes these calls 36 | B011, 37 | **/tests/*: 38 | # Do not call assert False since python -O removes these calls 39 | B011, 40 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_optimize_dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | import dask 6 | import dask.dataframe as dd 7 | 8 | dsk = { 9 | ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]), 10 | ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]), 11 | ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]), 12 | } 13 | dfs = list(dsk.values()) 14 | 15 | 16 | def test_fuse_ave_width(): 17 | df = pd.DataFrame({"x": range(10)}) 18 | df = dd.from_pandas(df, npartitions=5) 19 | 20 | s = (df.x + 1) + (df.x + 2) 21 | 22 | with dask.config.set({"optimization.fuse.ave-width": 4}): 23 | a = s.__dask_optimize__(s.dask, s.__dask_keys__()) 24 | 25 | b = s.__dask_optimize__(s.dask, s.__dask_keys__()) 26 | 27 | assert len(a) <= 15 28 | assert len(b) <= 15 29 | 30 | 31 | def test_optimize_blockwise(): 32 | from dask.array.optimization import optimize_blockwise 33 | 34 | df = pd.DataFrame({"x": range(10), "y": range(10)}) 35 | ddf = dd.from_pandas(df, npartitions=2) 36 | 37 | for _ in range(10): 38 | ddf["x"] = ddf.x + 1 + ddf.y 39 | 40 | graph = optimize_blockwise(ddf.dask) 41 | 42 | assert len(graph) <= 4 43 | -------------------------------------------------------------------------------- /dask/array/tests/test_numpy_compat.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | import dask.array as da 7 | from 
dask.array.utils import assert_eq 8 | 9 | 10 | @pytest.fixture( 11 | params=[ 12 | [("A", ("f4", (3, 2))), ("B", ("f4", 3)), ("C", ("f8", 3))], 13 | [("A", ("i4", (3, 2))), ("B", ("f4", 3)), ("C", ("S4", 3))], 14 | ] 15 | ) 16 | def dtype(request): 17 | return np.dtype(request.param) 18 | 19 | 20 | @pytest.fixture(params=[["A"], ["A", "B"], ["A", "B", "C"]]) 21 | def index(request): 22 | return request.param 23 | 24 | 25 | def test_basic(): 26 | # sanity check 27 | dtype = [("a", "f8"), ("b", "f8"), ("c", "f8")] 28 | x = np.ones((5, 3), dtype=dtype) 29 | dx = da.ones((5, 3), dtype=dtype, chunks=3) 30 | result = dx[["a", "b"]] 31 | expected = x[["a", "b"]] 32 | assert_eq(result, expected) 33 | 34 | 35 | def test_min_max_round_funcs(): 36 | # Regression test for gh-5031 37 | image = da.from_array(np.array([[0, 1], [1, 2]]), chunks=(1, 2)) 38 | # These use __array_function__ (and min/max/round are aliased, 39 | # to amin/amax/round_ in numpy) 40 | assert int(np.min(image)) == 0 41 | assert int(np.max(image)) == 2 42 | assert np.round(image)[1, 1] == 2 43 | -------------------------------------------------------------------------------- /dask/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from dask.hashing import hash_buffer, hash_buffer_hex, hashers 6 | 7 | np = pytest.importorskip("numpy") 8 | 9 | buffers = [ 10 | b"abc", 11 | bytearray(b"123"), 12 | memoryview(b"456"), 13 | np.array(42), 14 | np.ones((100, 100)), 15 | np.zeros((100, 100), dtype=[("a", "i4"), ("b", "i2")]), 16 | np.ones(10000, dtype=np.int8)[1:], # unaligned 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("x", buffers) 21 | def test_hash_buffer(x): 22 | for hasher in [None] + hashers: 23 | h = hash_buffer(x, hasher=hasher) 24 | assert isinstance(h, bytes) 25 | assert 8 <= len(h) < 32 26 | assert h == hash_buffer(x, hasher=hasher) 27 | 28 | 29 | @pytest.mark.parametrize("x", buffers) 30 | def test_hash_buffer_hex(x): 31 | for hasher in [None] + hashers: 32 | h = hash_buffer_hex(x, hasher=hasher) 33 | assert isinstance(h, str) 34 | assert 16 <= len(h) < 64 35 | assert h == hash_buffer_hex(x, hasher=hasher) 36 | 37 | 38 | @pytest.mark.parametrize("hasher", hashers) 39 | def test_hashers(hasher): 40 | # Sanity check 41 | x = b"x" 42 | h = hasher(x) 43 | assert isinstance(h, bytes) 44 | assert 8 <= len(h) < 32 45 | -------------------------------------------------------------------------------- /dask/widgets/widgets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | import html 5 | import os.path 6 | 7 | from jinja2 import Environment, FileSystemLoader, Template 8 | from jinja2.exceptions import TemplateNotFound 9 | 10 | from dask.utils import format_bytes, format_time, format_time_ago, key_split, typename 11 | 12 | FILTERS = { 13 | "datetime_from_timestamp": datetime.datetime.fromtimestamp, 14 | "format_bytes": format_bytes, 15 | "format_time": format_time, 16 | "format_time_ago": format_time_ago, 17 | "html_escape": html.escape, 18 | "key_split": key_split, 19 | "type": type, 20 | "typename": typename, 21 | } 22 | 23 | TEMPLATE_PATHS = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates")] 24 | 25 | 26 | def get_environment() -> Environment: 27 | loader = FileSystemLoader(TEMPLATE_PATHS) 28 | environment = Environment(loader=loader) 29 | environment.filters.update(FILTERS) 30 | 31 | return 
environment 32 | 33 | 34 | def get_template(name: str) -> Template: 35 | try: 36 | return get_environment().get_template(name) 37 | except TemplateNotFound as e: 38 | raise TemplateNotFound( 39 | f"Unable to find {name} in dask.widgets.TEMPLATE_PATHS {TEMPLATE_PATHS}" 40 | ) from e 41 | -------------------------------------------------------------------------------- /docs/source/array-stats.rst: -------------------------------------------------------------------------------- 1 | Stats 2 | ===== 3 | 4 | Dask Array implements a subset of the `scipy.stats`_ package. 5 | 6 | Statistical Functions 7 | --------------------- 8 | 9 | You can calculate various measures of an array including skewness, kurtosis, and arbitrary moments. 10 | 11 | .. code-block:: python 12 | 13 | >>> from dask.array import stats 14 | >>> rng = da.random.default_rng() 15 | >>> x = rng.beta(1, 1, size=(1000,), chunks=10) 16 | >>> k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)] 17 | >>> dask.compute(k, s, m) 18 | (1.7612340817172787, -0.064073498030693302, -0.00054523780628304799) 19 | 20 | 21 | Statistical Tests 22 | ----------------- 23 | 24 | You can perform basic statistical tests on Dask arrays. 25 | Each of these tests return a ``dask.delayed`` wrapping one of the scipy ``namedtuple`` 26 | results. 27 | 28 | 29 | .. code-block:: python 30 | 31 | >>> rng = da.random.default_rng() 32 | >>> a = rng.uniform(size=(50,), chunks=(25,)) 33 | >>> b = a + rng.uniform(low=-0.15, high=0.15, size=(50,), chunks=(25,)) 34 | >>> result = stats.ttest_rel(a, b) 35 | >>> result.compute() 36 | Ttest_relResult(statistic=-1.5102104380013242, pvalue=0.13741197274874514) 37 | 38 | .. _scipy.stats: https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html 39 | -------------------------------------------------------------------------------- /dask/tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | 8 | def test_mimesis(): 9 | pytest.importorskip("mimesis") 10 | 11 | b = dask.datasets.make_people() 12 | assert b.take(5) 13 | 14 | assert b.take(3) == b.take(3) 15 | 16 | 17 | def test_full_dataset(): 18 | pytest.importorskip("mimesis") 19 | b = dask.datasets.make_people(npartitions=2, records_per_partition=10) 20 | assert b.count().compute() == 20 21 | 22 | 23 | def test_make_dataset_with_processes(): 24 | pytest.importorskip("mimesis") 25 | b = dask.datasets.make_people(npartitions=2) 26 | try: 27 | b.compute(scheduler="processes") 28 | except TypeError: 29 | pytest.fail("Failed to execute make_people using processes") 30 | 31 | 32 | def test_no_mimesis(): 33 | try: 34 | import mimesis # noqa: F401 35 | except ImportError: 36 | with pytest.raises(Exception) as info: 37 | dask.datasets.make_people() 38 | 39 | assert "python -m pip install mimesis" in str(info.value) 40 | 41 | 42 | def test_deterministic(): 43 | pytest.importorskip("mimesis") 44 | 45 | a = dask.datasets.make_people(seed=123) 46 | b = dask.datasets.make_people(seed=123) 47 | 48 | assert a.take(1)[0]["name"] == b.take(1)[0]["name"] 49 | -------------------------------------------------------------------------------- /dask/tests/test_ci.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import importlib_metadata 6 | import pytest 7 | from packaging.version import Version 8 | 9 | 10 | 
@pytest.mark.xfail(reason="https://github.com/dask/dask/issues/9735", strict=False) 11 | @pytest.mark.skipif( 12 | not os.environ.get("UPSTREAM_DEV", False), 13 | reason="Only check for dev packages in `upstream` CI build", 14 | ) 15 | def test_upstream_packages_installed(): 16 | # List of packages should match those specified in 17 | # `continuous_integration/scripts/install.sh` 18 | 19 | # FIXME: This test isn't sensitive to projects that use git tags 20 | # to determine versions (e.g. versioneer) when installed 21 | # directly from GitHub as the latest `main` branch can sometimes 22 | # be pointing to a released version of the project. 23 | packages = [ 24 | "bokeh", 25 | # "dask", 26 | # "distributed", 27 | # "fastparquet", 28 | # "fsspec", 29 | "numpy", 30 | "pandas", 31 | # "partd", 32 | "pyarrow", 33 | # "s3fs", 34 | "scipy", 35 | # "sparse", 36 | # "zarr", 37 | # "zict", 38 | ] 39 | for package in packages: 40 | v = Version(importlib_metadata.version(package)) 41 | assert v.is_prerelease or v.local is not None, (package, str(v)) 42 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Dask 2 | ==== 3 | 4 | |Build Status| |Coverage| |Doc Status| |Discourse| |Version Status| |NumFOCUS| 5 | 6 | Dask is a flexible parallel computing library for analytics. See 7 | documentation_ for more information. 8 | 9 | 10 | LICENSE 11 | ------- 12 | 13 | New BSD. See `License File `__. 14 | 15 | .. _documentation: https://dask.org 16 | .. |Build Status| image:: https://github.com/dask/dask/actions/workflows/tests.yml/badge.svg 17 | :target: https://github.com/dask/dask/actions/workflows/tests.yml 18 | .. |Coverage| image:: https://codecov.io/gh/dask/dask/branch/main/graph/badge.svg 19 | :target: https://codecov.io/gh/dask/dask/branch/main 20 | :alt: Coverage status 21 | .. |Doc Status| image:: https://readthedocs.org/projects/dask/badge/?version=latest 22 | :target: https://dask.org 23 | :alt: Documentation Status 24 | .. |Discourse| image:: https://img.shields.io/discourse/users?logo=discourse&server=https%3A%2F%2Fdask.discourse.group 25 | :alt: Discuss Dask-related things and ask for help 26 | :target: https://dask.discourse.group 27 | .. |Version Status| image:: https://img.shields.io/pypi/v/dask.svg 28 | :target: https://pypi.python.org/pypi/dask/ 29 | .. 
|NumFOCUS| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A 30 | :target: https://www.numfocus.org/ 31 | -------------------------------------------------------------------------------- /dask/tests/test_context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | from dask.context import globalmethod 7 | 8 | 9 | def test_with_get(): 10 | da = pytest.importorskip("dask.array") 11 | var = [0] 12 | 13 | def myget(dsk, keys, **kwargs): 14 | var[0] = var[0] + 1 15 | return dask.get(dsk, keys, **kwargs) 16 | 17 | x = da.ones(10, chunks=(5,)) 18 | 19 | assert x.sum().compute() == 10 20 | assert var[0] == 0 21 | 22 | with dask.config.set(scheduler=myget): 23 | assert x.sum().compute() == 10 24 | assert var[0] == 1 25 | 26 | # Make sure we've cleaned up 27 | assert x.sum().compute() == 10 28 | assert var[0] == 1 29 | 30 | 31 | def foo(): 32 | return "foo" 33 | 34 | 35 | def bar(): 36 | return "bar" 37 | 38 | 39 | class Foo: 40 | @globalmethod(key="f") 41 | def f(): # type: ignore 42 | return 1 43 | 44 | g = globalmethod(foo, key="g", falsey=bar) 45 | 46 | 47 | def test_globalmethod(): 48 | x = Foo() 49 | 50 | assert x.f() == 1 51 | 52 | with dask.config.set(f=lambda: 2): 53 | assert x.f() == 2 54 | 55 | with dask.config.set(f=foo): 56 | assert x.f is foo 57 | assert x.f() == "foo" 58 | 59 | assert x.g is foo 60 | assert x.g() == "foo" 61 | 62 | with dask.config.set(g=False): 63 | assert x.g is bar 64 | assert x.g() == "bar" 65 | -------------------------------------------------------------------------------- /dask/widgets/templates/array.html.j2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 36 | 39 | 40 |
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | {% if nbytes %} 14 | 15 | 16 | 17 | 18 | 19 | {% endif %} 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
Array Chunk
Bytes {{ nbytes }} {{ cbytes }}
Shape {{ array.shape }} {{ array.chunksize }}
Dask graph {{ array.npartitions }} chunks in {{ layers }}
Data type {{ array.dtype }} {{ array._meta | type | typename }}
35 |
37 | {{grid}} 38 |
41 | -------------------------------------------------------------------------------- /continuous_integration/scripts/test_imports.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | 4 | 5 | test_import () { 6 | echo "Create environment: python=$PYTHON_VERSION $1" 7 | # Create an empty environment 8 | mamba create -q -y -n test-imports -c conda-forge python=$PYTHON_VERSION packaging pyyaml fsspec toolz partd click cloudpickle importlib-metadata $1 9 | conda activate test-imports 10 | if [[ $1 =~ "distributed" ]]; then 11 | # dask[distributed] depends on the latest version of distributed 12 | python -m pip install git+https://github.com/dask/distributed 13 | fi 14 | python -m pip install -e . 15 | mamba list 16 | echo "python -c '$2'" 17 | python -c "$2" 18 | # Ensure that no non-deterministic objects are tokenized at init time, 19 | # which can prevent the library from being imported at all. 20 | echo "python -c '$2' (ensure deterministic)" 21 | DASK_TOKENIZE__ENSURE_DETERMINISTIC=True python -c "$2" 22 | conda deactivate 23 | mamba env remove -n test-imports 24 | } 25 | 26 | test_import "" "import dask, dask.base, dask.multiprocessing, dask.threaded, dask.optimization, dask.bag, dask.delayed, dask.graph_manipulation, dask.layers" 27 | test_import "numpy" "import dask.array" 28 | test_import "pandas" "import dask.dataframe" 29 | test_import "bokeh" "import dask.diagnostics" 30 | test_import "distributed" "import dask.distributed" 31 | -------------------------------------------------------------------------------- /continuous_integration/recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set major_minor_patch = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').split('.') %} 2 | {% set new_patch = major_minor_patch[2] | int + 1 %} 3 | {% set version = (major_minor_patch[:2] + [new_patch]) | join('.') + environ.get('VERSION_SUFFIX', '') %} 4 | 5 | 6 | package: 7 | name: dask-core 8 | version: {{ version }} 9 | 10 | source: 11 | git_url: ../.. 12 | 13 | build: 14 | number: {{ GIT_DESCRIBE_NUMBER }} 15 | noarch: python 16 | string: py_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} 17 | script: {{ PYTHON }} -m pip install . 
-vv 18 | entry_points: 19 | - dask = dask.__main__:main 20 | 21 | requirements: 22 | host: 23 | - python >=3.9 24 | - pip 25 | - versioneer =0.28 26 | - tomli # [py<311] 27 | 28 | run: 29 | - python >=3.9 30 | - click >=8.0 31 | - cloudpickle >=1.5.0 32 | - fsspec >=2021.09.0 33 | - packaging >=20.0 34 | - partd >=1.2.0 35 | - pyyaml >=5.3.1 36 | - toolz >=0.10.0 37 | - importlib_metadata >=4.13.0 38 | 39 | test: 40 | imports: 41 | - dask 42 | commands: 43 | - pip check 44 | - dask docs --help 45 | - dask info --help 46 | - dask info versions --help 47 | requires: 48 | - pip 49 | 50 | about: 51 | home: https://github.com/dask/dask/ 52 | license: BSD-3-Clause 53 | license_file: 54 | - LICENSE.txt 55 | - dask/array/NUMPY_LICENSE.txt 56 | summary: Parallel Python with task scheduling 57 | doc_url: https://dask.org/ 58 | dev_url: https://github.com/dask/dask 59 | -------------------------------------------------------------------------------- /dask/array/tests/test_xarray.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask.array as da 6 | from dask.array.utils import assert_eq 7 | 8 | xr = pytest.importorskip("xarray") 9 | 10 | 11 | def test_mean(): 12 | y = da.mean(xr.DataArray([1, 2, 3.0])) 13 | assert isinstance(y, da.Array) 14 | assert_eq(y, y) 15 | 16 | 17 | def test_asarray(): 18 | y = da.asarray(xr.DataArray([1, 2, 3.0])) 19 | assert isinstance(y, da.Array) 20 | assert_eq(y, y) 21 | 22 | 23 | def test_asanyarray(): 24 | y = da.asanyarray(xr.DataArray([1, 2, 3.0])) 25 | assert isinstance(y, da.Array) 26 | assert_eq(y, y) 27 | 28 | 29 | def test_asarray_xarray_intersphinx_workaround(): 30 | # test that the intersphinx workaround in https://github.com/pydata/xarray/issues/4279 works 31 | module = xr.DataArray.__module__ 32 | try: 33 | xr.DataArray.__module__ = "xarray" 34 | y = da.asarray(xr.DataArray([1, 2, 3.0])) 35 | assert isinstance(y, da.Array) 36 | assert type(y._meta).__name__ == "ndarray" 37 | assert_eq(y, y) 38 | finally: 39 | xr.DataArray.__module__ = module 40 | 41 | 42 | def test_fft(): 43 | # Regression test for https://github.com/dask/dask/issues/9679 44 | coord = da.arange(8, chunks=-1) 45 | data = da.random.random((8, 8), chunks=-1) + 1 46 | x = xr.DataArray(data, coords={"x": coord, "y": coord}, dims=["x", "y"]) 47 | result = da.fft.fft(x) 48 | expected = da.fft.fft(x.data) 49 | assert_eq(result, expected) 50 | -------------------------------------------------------------------------------- /dask/widgets/tests/test_widgets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os.path 4 | 5 | import pytest 6 | 7 | jinja2 = pytest.importorskip("jinja2") 8 | 9 | from dask.utils import format_bytes 10 | from dask.widgets import FILTERS, TEMPLATE_PATHS, get_environment, get_template 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def setup_testing(): 15 | TEMPLATE_PATHS.append( 16 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates") 17 | ) 18 | FILTERS["custom_filter"] = lambda x: "baz" 19 | 20 | 21 | def test_widgets(): 22 | template = get_template("example.html.j2") 23 | assert isinstance(template, jinja2.Template) 24 | rendered = template.render(foo="bar") 25 | assert "Hello bar" in rendered 26 | 27 | 28 | def test_environment(): 29 | environment = get_environment() 30 | assert isinstance(environment, jinja2.Environment) 31 | 32 | 33 | def 
test_unknown_template(): 34 | with pytest.raises(jinja2.TemplateNotFound) as e: 35 | get_template("does_not_exist.html.j2") 36 | 37 | # The error should contain all the registered template directories to help the user 38 | # understand where jinja2 is looking. Including the one we registered in the fixture. 39 | assert os.path.dirname(os.path.abspath(__file__)) in str(e) 40 | 41 | 42 | def test_filters(): 43 | template = get_template("bytes.html.j2") 44 | assert format_bytes in FILTERS.values() 45 | assert format_bytes(2e9) in template.render(foo=2e9) 46 | 47 | template = get_template("custom_filter.html.j2") 48 | assert "baz" in template.render(foo=None) 49 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014, Anaconda, Inc. and contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-optional.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[array,dataframe] 16 | - numpy=1.21 17 | - pandas=1.3 18 | # optional dependencies pulled in by pip install dask[diagnostics] 19 | - bokeh=2.4.2 20 | - jinja2=2.10.3 21 | # optional dependencies pulled in by pip install dask[complete] 22 | - pyarrow=7.0 23 | - lz4=4.3.2 24 | # optional dependencies used by dask 25 | - cachey=0.1.1 26 | - crick=0.0.3 27 | - cytoolz=0.11.0 28 | - dask-ml=1.4.0 29 | - fastavro=1.1.0 30 | - fastparquet=0.8.2 31 | - h5py=2.10.0 32 | - ipycytoscape=1.0.1 33 | - IPython=7.16.1 34 | - matplotlib=3.4.1 35 | - mimesis=5.3.0 36 | - mmh3=2.5.1 37 | - psutil=5.7.2 38 | - python-cityhash=0.4.6 39 | - python-graphviz=0.8.4 40 | - python-snappy=0.5.4 41 | - python-xxhash=2.0.0 42 | - s3fs=2021.9.0 43 | - scikit-image=0.17.2 44 | - scipy=1.5.2 45 | - sparse=0.12.0 46 | - sqlalchemy=1.4.16 47 | - tblib=1.6.0 48 | - tiledb-py=0.8.1 49 | - zarr=2.12.0 50 | - pip 51 | - pip: 52 | # optional dependencies pulled in by pip install dask[distributed] 53 | - git+https://github.com/dask/distributed 54 | # test dependencies 55 | - pre-commit 56 | - pytest 57 | - pytest-cov 58 | - pytest-rerunfailures 59 | - pytest-xdist 60 | -------------------------------------------------------------------------------- /dask/tests/test_system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import builtins 4 | import io 5 | import os 6 | import sys 7 | 8 | import pytest 9 | 10 | from dask.system import cpu_count 11 | 12 | psutil = pytest.importorskip("psutil") 13 | 14 | 15 | def test_cpu_count(): 16 | count = cpu_count() 17 | assert isinstance(count, int) 18 | assert count <= os.cpu_count() 19 | assert count >= 1 20 | 21 | 22 | @pytest.mark.parametrize("dirname", ["cpuacct,cpu", "cpu,cpuacct", None]) 23 | def test_cpu_count_cgroups(dirname, monkeypatch): 24 | def mycpu_count(): 25 | # Absurdly high, unlikely to match real value 26 | return 250 27 | 28 | monkeypatch.setattr(os, "cpu_count", mycpu_count) 29 | 30 | class MyProcess: 31 | def cpu_affinity(self): 32 | # No affinity set 33 | return [] 34 | 35 | monkeypatch.setattr(psutil, "Process", MyProcess) 36 | 37 | if dirname: 38 | paths = { 39 | "/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname: io.StringIO("2005"), 40 | "/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname: io.StringIO("10"), 41 | } 42 | builtin_open = builtins.open 43 | 44 | def myopen(path, *args, **kwargs): 45 | if path in paths: 46 | return paths.get(path) 47 | return builtin_open(path, *args, **kwargs) 48 | 49 | monkeypatch.setattr(builtins, "open", myopen) 50 | monkeypatch.setattr(sys, "platform", "linux") 51 | 52 | count = cpu_count() 53 | if dirname: 54 | # Rounds up 55 | assert count == 201 56 | else: 57 | assert count == 250 58 | -------------------------------------------------------------------------------- /dask/array/NUMPY_LICENSE.txt: -------------------------------------------------------------------------------- 1 | 
Copyright (c) 2005-2015, NumPy Developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /docs/source/delayed-collections.rst: -------------------------------------------------------------------------------- 1 | Working with Collections 2 | ======================== 3 | 4 | Often we want to do a bit of custom work with ``dask.delayed`` (for example, 5 | for complex data ingest), then leverage the algorithms in ``dask.array`` or 6 | ``dask.dataframe``, and then switch back to custom work. To this end, all 7 | collections support ``from_delayed`` functions and ``to_delayed`` 8 | methods. 9 | 10 | As an example, consider the case where we store tabular data in a custom format 11 | not known by Dask DataFrame. This format is naturally broken apart into 12 | pieces and we have a function that reads one piece into a Pandas DataFrame. 13 | We use ``dask.delayed`` to lazily read these files into Pandas DataFrames, 14 | use ``dd.from_delayed`` to wrap these pieces up into a single 15 | Dask DataFrame, use the complex algorithms within the DataFrame 16 | (groupby, join, etc.), and then switch back to ``dask.delayed`` to save our results 17 | back to the custom format: 18 | 19 | .. code-block:: python 20 | 21 | import dask.dataframe as dd 22 | from dask.delayed import delayed 23 | 24 | from my_custom_library import load, save 25 | 26 | filenames = ... 27 | dfs = [delayed(load)(fn) for fn in filenames] 28 | 29 | df = dd.from_delayed(dfs) 30 | df = ... # do work with dask.dataframe 31 | 32 | dfs = df.to_delayed() 33 | writes = [delayed(save)(df, fn) for df, fn in zip(dfs, filenames)] 34 | 35 | dd.compute(*writes) 36 | 37 | Data science is often complex, and ``dask.delayed`` provides a release valve for 38 | users to manage this complexity on their own, and solve the last mile problem 39 | for custom formats and complex situations. 
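The same ``from_delayed``/``to_delayed`` round trip is available for arrays. Below is a minimal sketch, assuming a hypothetical ``load_chunk`` reader that returns NumPy arrays of a known, uniform shape and dtype (the helper name and the shapes are illustrative, not a real API):

.. code-block:: python

    import dask.array as da
    from dask.delayed import delayed

    from my_custom_library import load_chunk  # hypothetical reader returning a NumPy array

    filenames = ...

    # Declare shape and dtype up front so Dask can build the array lazily
    arrays = [
        da.from_delayed(delayed(load_chunk)(fn), shape=(1000, 1000), dtype="f8")
        for fn in filenames
    ]
    x = da.concatenate(arrays, axis=0)

    x = (x - x.mean()) / x.std()  # do work with dask.array

    # Switch back to one delayed object per chunk, e.g. to save with a custom writer
    chunks = x.to_delayed().flatten().tolist()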
40 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_black.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from decimal import Decimal 4 | 5 | import pytest 6 | 7 | import dask.dataframe as dd 8 | from dask.dataframe.utils import assert_eq 9 | 10 | pd = pytest.importorskip("pandas") 11 | 12 | from pandas.tests.extension.decimal.array import DecimalArray, DecimalDtype 13 | 14 | from dask.dataframe.extensions import make_array_nonempty, make_scalar 15 | 16 | 17 | @make_array_nonempty.register(DecimalDtype) 18 | def _(dtype): 19 | return DecimalArray._from_sequence([Decimal("0"), Decimal("NaN")], dtype=dtype) 20 | 21 | 22 | @make_scalar.register(Decimal) 23 | def _(x): 24 | return Decimal("1") 25 | 26 | 27 | def test_register_extension_type(): 28 | arr = DecimalArray._from_sequence([Decimal("1.0")] * 10) 29 | ser = pd.Series(arr) 30 | dser = dd.from_pandas(ser, 2) 31 | assert_eq(ser, dser) 32 | 33 | df = pd.DataFrame({"A": ser}) 34 | ddf = dd.from_pandas(df, 2) 35 | assert_eq(df, ddf) 36 | 37 | 38 | def test_reduction(): 39 | ser = pd.Series(DecimalArray._from_sequence([Decimal("0"), Decimal("1")])) 40 | dser = dd.from_pandas(ser, 2) 41 | assert_eq(ser.mean(skipna=False), dser.mean(skipna=False)) 42 | 43 | # It's unclear whether this can be reliably provided, at least with the current 44 | # implementation, which uses pandas.DataFrame.sum(), returning a (homogenous) 45 | # series which has potentially cast values. 
46 | 47 | # assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False)) 48 | 49 | 50 | def test_scalar(): 51 | result = dd.utils.make_meta(Decimal("1.0"), parent_meta=pd.DataFrame()) 52 | assert result == Decimal("1.0") 53 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_on_pink.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /dask/array/tests/test_image.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from contextlib import contextmanager 5 | 6 | import pytest 7 | 8 | pytest.importorskip("skimage") 9 | import numpy as np 10 | from skimage.io import imsave 11 | 12 | from dask.array.image import imread as da_imread 13 | from dask.utils import tmpdir 14 | 15 | 16 | @contextmanager 17 | def random_images(n, shape): 18 | with tmpdir() as dirname: 19 | for i in range(n): 20 | fn = os.path.join(dirname, "image.%d.png" % i) 21 | x = np.random.randint(0, 255, size=shape).astype("u1") 22 | imsave(fn, x, check_contrast=False) 23 | 24 | yield os.path.join(dirname, "*.png") 25 | 26 | 27 | def test_imread(): 28 | with random_images(4, (5, 6, 3)) as globstring: 29 | im = da_imread(globstring) 30 | assert im.shape == (4, 5, 6, 3) 31 | assert im.chunks == ((1, 1, 1, 1), (5,), (6,), (3,)) 32 | assert im.dtype == "uint8" 33 | 34 | assert im.compute().shape == (4, 5, 6, 3) 35 | assert im.compute().dtype == "uint8" 36 | 37 | 38 | def test_imread_with_custom_function(): 39 | def imread2(fn): 40 | return np.ones((2, 3, 4), dtype="i1") 41 | 42 | with random_images(4, (5, 6, 3)) as globstring: 43 | im = da_imread(globstring, imread=imread2) 44 | assert (im.compute() == np.ones((4, 2, 3, 4), dtype="u1")).all() 45 | 46 | 47 | def test_preprocess(): 48 | def preprocess(x): 49 | x[:] = 1 50 | return x[:, :, 0] 51 | 52 | with random_images(4, (2, 3, 4)) as globstring: 53 | im = da_imread(globstring, preprocess=preprocess) 54 | assert (im.compute() == np.ones((4, 2, 3), dtype="u1")).all() 55 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.9.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.9 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | # Optional dependencies 26 | - mimesis 27 | - numpy=1.22 28 | - pandas=1.4 29 | - flask 30 | - fastparquet 31 | - h5py 32 | - 
pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI. 35 | # We should unpin when possible. 36 | # https://github.com/dask/dask/pull/9569 37 | - tiledb-py<0.17.4 38 | - pyspark 39 | - tiledb>=2.5.0 40 | - xarray 41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.4` doesn't support `sqlalchemy=2` 42 | - pyarrow=9 43 | - coverage 44 | - jsonschema 45 | # other -- IO 46 | - boto3 47 | - botocore 48 | - bokeh 49 | - httpretty 50 | - aiohttp 51 | - s3fs 52 | - crick 53 | - cytoolz 54 | - distributed 55 | - ipython 56 | - ipycytoscape 57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 58 | - ipywidgets<8.0.5 59 | - ipykernel<6.22.0 60 | - lz4 61 | - numba 62 | - psutil 63 | - requests 64 | - scikit-image<0.20 65 | - scikit-learn 66 | - scipy 67 | - python-snappy 68 | - sparse 69 | - cachey 70 | - python-graphviz 71 | - python-xxhash 72 | - python-cityhash 73 | - mmh3 74 | - jinja2 75 | - pip 76 | - pip: 77 | - git+https://github.com/dask/distributed 78 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.10.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.10 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | # Optional dependencies 26 | - mimesis 27 | - numpy=1.23 28 | - pandas=1.5 29 | - flask 30 | - fastparquet>=0.8.0 31 | - h5py 32 | - pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI. 35 | # We should unpin when possible. 36 | # https://github.com/dask/dask/pull/9569 37 | - tiledb-py<0.17.4 38 | - pyspark 39 | - tiledb>=2.5.0 40 | - xarray 41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.5` doesn't support `sqlalchemy=2` 42 | - pyarrow=10 43 | - coverage 44 | - jsonschema 45 | # other -- IO 46 | - boto3 47 | - botocore 48 | - bokeh 49 | - httpretty 50 | - aiohttp 51 | - s3fs 52 | - crick 53 | - cytoolz 54 | - distributed 55 | - ipython 56 | - ipycytoscape 57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 58 | - ipywidgets<8.0.5 59 | - ipykernel<6.22.0 60 | - lz4 61 | - numba 62 | - psutil 63 | - requests 64 | - scikit-image 65 | - scikit-learn 66 | - scipy 67 | - python-snappy 68 | - sparse 69 | - cachey 70 | - python-graphviz 71 | - python-xxhash 72 | - python-cityhash 73 | - mmh3 74 | - jinja2 75 | - pip 76 | - pip: 77 | - git+https://github.com/dask/distributed 78 | -------------------------------------------------------------------------------- /dask/system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import os 5 | import sys 6 | 7 | try: 8 | import psutil 9 | except ImportError: 10 | psutil = None # type: ignore 11 | 12 | __all__ = ("cpu_count", "CPU_COUNT") 13 | 14 | 15 | def cpu_count(): 16 | """Get the available CPU count for this system. 17 | 18 | Takes the minimum value from the following locations: 19 | 20 | - Total system cpus available on the host. 
21 | - CPU Affinity (if set) 22 | - Cgroups limit (if set) 23 | """ 24 | count = os.cpu_count() 25 | 26 | # Check CPU affinity if available 27 | if psutil is not None: 28 | try: 29 | affinity_count = len(psutil.Process().cpu_affinity()) 30 | if affinity_count > 0: 31 | count = min(count, affinity_count) 32 | except Exception: 33 | pass 34 | 35 | # Check cgroups if available 36 | if sys.platform == "linux": 37 | # The directory name isn't standardized across linux distros, check both 38 | for dirname in ["cpuacct,cpu", "cpu,cpuacct"]: 39 | try: 40 | with open("/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname) as f: 41 | quota = int(f.read()) 42 | with open("/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname) as f: 43 | period = int(f.read()) 44 | # We round up on fractional CPUs 45 | cgroups_count = math.ceil(quota / period) 46 | if cgroups_count > 0: 47 | count = min(count, cgroups_count) 48 | break 49 | except Exception: 50 | pass 51 | 52 | return count 53 | 54 | 55 | CPU_COUNT = cpu_count() 56 | -------------------------------------------------------------------------------- /dask/tests/test_utils_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | import pytest 6 | 7 | from dask import utils_test 8 | from dask.highlevelgraph import HighLevelGraph 9 | from dask.utils_test import _check_warning 10 | 11 | 12 | def test_hlg_layer(): 13 | a = {"x": 1} 14 | b = {"y": (utils_test.inc, "x")} 15 | layers = {"a-layer": a, "bee-layer": b} 16 | dependencies = {"a-layer": set(), "bee-layer": {"a-layer"}} 17 | hg = HighLevelGraph(layers, dependencies) 18 | 19 | assert utils_test.hlg_layer(hg, "a") is hg.layers["a-layer"] 20 | assert utils_test.hlg_layer(hg, "b") is hg.layers["bee-layer"] 21 | with pytest.raises(KeyError, match="No layer starts with"): 22 | utils_test.hlg_layer(hg, "foo") 23 | 24 | 25 | def test_hlg_layer_topological(): 26 | a = {"x": 1} 27 | b = {"y": (utils_test.inc, "x")} 28 | c = {"z": (utils_test.inc, "x")} 29 | d = {"r": (sum, ["y", "z"])} 30 | layers = {"a": a, "b": b, "c": c, "d": d} 31 | dependencies = {"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}} 32 | hg = HighLevelGraph(layers, dependencies) 33 | 34 | assert utils_test.hlg_layer_topological(hg, -1) is hg.layers["d"] 35 | assert utils_test.hlg_layer_topological(hg, 0) is hg.layers["a"] 36 | assert utils_test.hlg_layer_topological(hg, 1) in (hg.layers["b"], hg.layers["c"]) 37 | 38 | 39 | def test__check_warning(): 40 | class MyWarning(Warning): 41 | pass 42 | 43 | with warnings.catch_warnings(): 44 | warnings.simplefilter("error") 45 | with _check_warning(True, MyWarning, "foo"): 46 | warnings.warn("foo", MyWarning) 47 | 48 | with pytest.warns(MyWarning, match="foo"): 49 | with _check_warning(False, MyWarning, "foo"): 50 | warnings.warn("foo", MyWarning) 51 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.11.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.11 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | 
# Optional dependencies 26 | - mimesis 27 | - numpy 28 | - pandas 29 | - flask 30 | - fastparquet>=0.8.0 31 | - h5py 32 | - pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI, However 0.18 is needed for 3.11 35 | # https://github.com/dask/dask/pull/9569 36 | # - tiledb-py # crashes on Python 3.11 37 | # - pyspark 38 | # - tiledb>=2.5.0 # crashes on Python 3.11 39 | - xarray 40 | - sqlalchemy>=1.4.16 41 | - pyarrow>=11 42 | - coverage 43 | - jsonschema 44 | # # other -- IO 45 | - boto3 46 | - botocore 47 | - bokeh 48 | - httpretty 49 | - aiohttp 50 | - s3fs 51 | # Need a new `crick` release with support for `numpy=1.24+` 52 | # https://github.com/dask/crick/issues/25 53 | # - crick 54 | - cytoolz 55 | - distributed 56 | - ipython 57 | - ipycytoscape 58 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 59 | - ipywidgets<8.0.5 60 | - ipykernel<6.22.0 61 | - lz4 62 | - numba 63 | - psutil 64 | - requests 65 | - scikit-image 66 | - scikit-learn 67 | - scipy 68 | - python-snappy 69 | - sparse 70 | - cachey 71 | - python-graphviz 72 | - python-cityhash 73 | - python-xxhash 74 | - mmh3 75 | - jinja2 76 | - pip 77 | - pip: 78 | - git+https://github.com/dask/distributed 79 | -------------------------------------------------------------------------------- /dask/dataframe/numeric.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | from pandas.api.types import is_scalar as pd_is_scalar 5 | 6 | from dask.array import Array 7 | from dask.dataframe.core import Series 8 | from dask.delayed import delayed 9 | from dask.utils import derived_from 10 | 11 | __all__ = ("to_numeric",) 12 | 13 | 14 | @derived_from(pd, ua_args=["downcast"]) 15 | def to_numeric(arg, errors="raise", meta=None): 16 | """ 17 | Return type depends on input. Delayed if scalar, otherwise same as input. 18 | For errors, only "raise" and "coerce" are allowed. 19 | """ 20 | if errors not in ("raise", "coerce"): 21 | raise ValueError("invalid error value specified") 22 | 23 | is_series = isinstance(arg, Series) 24 | is_array = isinstance(arg, Array) 25 | is_scalar = pd_is_scalar(arg) 26 | 27 | if not any([is_series, is_array, is_scalar]): 28 | raise TypeError( 29 | "arg must be a list, tuple, dask.array.Array, or dask.dataframe.Series" 30 | ) 31 | 32 | if meta is not None: 33 | if is_scalar: 34 | raise KeyError("``meta`` is not allowed when input is a scalar.") 35 | else: 36 | if is_series or is_array: 37 | meta = pd.to_numeric(arg._meta) 38 | 39 | if is_series: 40 | return arg.map_partitions( 41 | pd.to_numeric, 42 | token=arg._name + "-to_numeric", 43 | meta=meta, 44 | enforce_metadata=False, 45 | errors=errors, 46 | ) 47 | if is_array: 48 | return arg.map_blocks( 49 | pd.to_numeric, 50 | name=arg._name + "-to_numeric", 51 | meta=meta, 52 | errors=errors, 53 | ) 54 | if is_scalar: 55 | return delayed(pd.to_numeric, pure=True)(arg, errors=errors) 56 | -------------------------------------------------------------------------------- /docs/source/deploying-ssh.rst: -------------------------------------------------------------------------------- 1 | SSH 2 | === 3 | 4 | It is easy to set up Dask on informally managed networks of machines using SSH. 5 | This can be done manually using SSH and the 6 | Dask :doc:`command line interface `, 7 | or automatically using either the :class:`dask.distributed.SSHCluster` Python *cluster manager* or the 8 | ``dask-ssh`` command line tool. 
This document describes both of these options. 9 | 10 | .. note:: 11 | Before instantiating an ``SSHCluster``, it is recommended to configure keyless SSH 12 | for your local machine and other machines. For example, on a Mac, to SSH into 13 | localhost (your local machine) you need to ensure that the Remote Login option is enabled in 14 | System Preferences -> Sharing. In addition, ``id_rsa.pub`` should be in 15 | ``authorized_keys`` for keyless login. 16 | 17 | Python Interface 18 | ---------------- 19 | 20 | .. currentmodule:: dask.distributed 21 | 22 | .. autofunction:: SSHCluster 23 | 24 | Command Line 25 | ------------ 26 | 27 | The convenience script ``dask-ssh`` opens several SSH connections to your 28 | target computers and initializes the network accordingly. You can 29 | give it a list of hostnames or IP addresses:: 30 | 31 | $ dask-ssh 192.168.0.1 192.168.0.2 192.168.0.3 192.168.0.4 32 | 33 | Or you can use normal UNIX grouping:: 34 | 35 | $ dask-ssh 192.168.0.{1,2,3,4} 36 | 37 | Or you can specify a hostfile that includes a list of hosts:: 38 | 39 | $ cat hostfile.txt 40 | 192.168.0.1 41 | 192.168.0.2 42 | 192.168.0.3 43 | 192.168.0.4 44 | 45 | $ dask-ssh --hostfile hostfile.txt 46 | 47 | .. note:: 48 | 49 | The command line documentation here may differ depending on your installed 50 | version. We recommend referring to the output of ``dask-ssh --help``. 51 | 52 | .. click:: distributed.cli.dask_ssh:main 53 | :prog: dask-ssh 54 | :show-nested: 55 | -------------------------------------------------------------------------------- /docs/source/how-to/extend-sizeof.rst: -------------------------------------------------------------------------------- 1 | Extend `sizeof` 2 | =============== 3 | 4 | When Dask needs to compute the size of an object in bytes, e.g. to determine which objects to spill to disk, it uses the ``dask.sizeof.sizeof`` registration mechanism. Users who need to define a ``sizeof`` implementation for their own objects can use ``sizeof.register``: 5 | 6 | .. code-block:: python 7 | 8 | >>> import numpy as np 9 | >>> from dask.sizeof import sizeof 10 | >>> @sizeof.register(np.ndarray) 11 | ... def sizeof_numpy_like(array): 12 | ... return array.nbytes 13 | 14 | This code can be executed in order to register the implementation with Dask by placing it in one of the library's modules, e.g. ``__init__.py``. However, this introduces a maintenance burden on the developers of these libraries, and it must be manually imported on all workers in the event that these libraries do not accept the patch. 15 | 16 | Therefore, Dask also exposes an `entrypoint `_ under the group ``dask.sizeof`` to enable third-party libraries to develop and maintain these ``sizeof`` implementations. 17 | 18 | For a fictitious library ``numpy_sizeof_dask.py``, the necessary ``setup.cfg`` configuration would be as follows: 19 | 20 | .. code-block:: ini 21 | 22 | [options.entry_points] 23 | dask.sizeof = 24 | numpy = numpy_sizeof_dask:sizeof_plugin 25 | 26 | whilst ``numpy_sizeof_dask.py`` would contain 27 | 28 | .. code-block:: python 29 | 30 | >>> import numpy as np 31 | >>> def sizeof_plugin(sizeof): 32 | ... @sizeof.register(np.ndarray) 33 | ... def sizeof_numpy_like(array): 34 | ... return array.nbytes 35 | 36 | Upon the first import of ``dask.sizeof``, Dask calls the entrypoint (``sizeof_plugin``) with the ``dask.sizeof.sizeof`` object, which can then be used to register a ``sizeof`` implementation. 
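Whichever route is used, the end result is the same dispatch registration. As a quick sanity check (using a hypothetical ``PackedBuffer`` container rather than a real library type), the registered handler is what ``sizeof`` reports once it is in place:

.. code-block:: python

    >>> from dask.sizeof import sizeof

    >>> class PackedBuffer:             # hypothetical user-defined container
    ...     def __init__(self, payload: bytes):
    ...         self.payload = payload

    >>> @sizeof.register(PackedBuffer)
    ... def sizeof_packed_buffer(buf):
    ...     return len(buf.payload)     # count only the payload bytes

    >>> sizeof(PackedBuffer(b"x" * 1024))
    1024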
37 | -------------------------------------------------------------------------------- /continuous_integration/gpuci/build.sh: -------------------------------------------------------------------------------- 1 | ############################################## 2 | # Dask GPU build and test script for CI # 3 | ############################################## 4 | set -e 5 | NUMARGS=$# 6 | ARGS=$* 7 | 8 | # Arg parsing function 9 | function hasArg { 10 | (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} 16 | 17 | # Set home to the job's workspace 18 | export HOME="$WORKSPACE" 19 | 20 | # Switch to project root; also root of repo checkout 21 | cd "$WORKSPACE" 22 | 23 | # Determine CUDA release version 24 | export CUDA_REL=${CUDA_VERSION%.*} 25 | 26 | ################################################################################ 27 | # SETUP - Check environment 28 | ################################################################################ 29 | 30 | gpuci_logger "Check environment variables" 31 | env 32 | 33 | gpuci_logger "Check GPU usage" 34 | nvidia-smi 35 | 36 | gpuci_logger "Activate conda env" 37 | . /opt/conda/etc/profile.d/conda.sh 38 | conda activate dask 39 | 40 | gpuci_logger "Install distributed" 41 | python -m pip install git+https://github.com/dask/distributed 42 | 43 | gpuci_logger "Install dask" 44 | python -m pip install --no-deps -e . 45 | 46 | gpuci_logger "Install pytest-timeout" 47 | python -m pip install pytest-timeout 48 | 49 | gpuci_logger "Check Python version" 50 | python --version 51 | 52 | gpuci_logger "Check conda environment" 53 | conda info 54 | conda config --show-sources 55 | conda list --show-channel-urls 56 | 57 | gpuci_logger "Python py.test for dask" 58 | py.test $WORKSPACE -n 3 -v -m gpu --junitxml="$WORKSPACE/junit-dask.xml" --cov-config="$WORKSPACE/pyproject.toml" --cov=dask --cov-report=xml:"$WORKSPACE/dask-coverage.xml" --cov-report term 59 | -------------------------------------------------------------------------------- /docs/source/delayed-api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | The ``dask.delayed`` interface consists of one function, ``delayed``: 5 | 6 | - ``delayed`` wraps functions 7 | 8 | Wraps functions. Can be used as a decorator, or around function calls 9 | directly (i.e. ``delayed(foo)(a, b, c)``). Outputs from functions wrapped in 10 | ``delayed`` are proxy objects of type ``Delayed`` that contain a graph of 11 | all operations done to get to this result. 12 | 13 | - ``delayed`` wraps objects 14 | 15 | Wraps objects. Used to create ``Delayed`` proxies directly. 16 | 17 | ``Delayed`` objects can be thought of as representing a key in the dask task 18 | graph. A ``Delayed`` supports *most* python operations, each of which creates 19 | another ``Delayed`` representing the result: 20 | 21 | - Most operators (``*``, ``-``, and so on) 22 | - Item access and slicing (``a[0]``) 23 | - Attribute access (``a.size``) 24 | - Method calls (``a.index(0)``) 25 | 26 | Operations that aren't supported include: 27 | 28 | - Mutating operators (``a += 1``) 29 | - Mutating magics such as ``__setitem__``/``__setattr__`` (``a[0] = 1``, ``a.foo = 1``) 30 | - Iteration. 
(``for i in a: ...``) 31 | - Use as a predicate (``if a: ...``) 32 | 33 | The last two points in particular mean that ``Delayed`` objects cannot be used for 34 | control flow, meaning that no ``Delayed`` can appear in a loop or if statement. 35 | In other words you can't iterate over a ``Delayed`` object, or use it as part of 36 | a condition in an if statement, but ``Delayed`` object can be used in a body of a loop 37 | or if statement (i.e. the example above is fine, but if ``data`` was a ``Delayed`` 38 | object it wouldn't be). 39 | Even with this limitation, many workflows can easily be parallelized. 40 | 41 | .. currentmodule:: dask.delayed 42 | 43 | .. autosummary:: 44 | delayed 45 | Delayed 46 | 47 | .. autofunction:: delayed 48 | .. autoclass:: Delayed 49 | -------------------------------------------------------------------------------- /.github/workflows/additional.yml: -------------------------------------------------------------------------------- 1 | name: Additional 2 | 3 | on: [push, pull_request] 4 | 5 | # Required shell entrypoint to have properly activated conda environments 6 | defaults: 7 | run: 8 | shell: bash -l {0} 9 | 10 | jobs: 11 | doctest: 12 | runs-on: "ubuntu-latest" 13 | timeout-minutes: 90 14 | steps: 15 | - name: Checkout source 16 | uses: actions/checkout@v3.5.3 17 | 18 | - name: Setup Conda Environment 19 | uses: conda-incubator/setup-miniconda@v2.2.0 20 | with: 21 | miniforge-variant: Mambaforge 22 | miniforge-version: latest 23 | use-mamba: true 24 | channel-priority: strict 25 | python-version: "3.10" 26 | environment-file: continuous_integration/environment-3.10.yaml 27 | activate-environment: test-environment 28 | auto-activate-base: false 29 | 30 | - name: Install 31 | run: source continuous_integration/scripts/install.sh 32 | 33 | - name: Run tests 34 | run: pytest -v --doctest-modules --ignore-glob='*/test_*.py' dask 35 | 36 | imports: 37 | runs-on: "ubuntu-latest" 38 | timeout-minutes: 90 39 | strategy: 40 | fail-fast: false 41 | matrix: 42 | python-version: ["3.9", "3.10", "3.11"] 43 | steps: 44 | - name: Checkout source 45 | uses: actions/checkout@v3.5.3 46 | 47 | - name: Setup Conda 48 | uses: conda-incubator/setup-miniconda@v2.2.0 49 | with: 50 | miniforge-variant: Mambaforge 51 | miniforge-version: latest 52 | use-mamba: true 53 | channel-priority: strict 54 | python-version: "3.9" 55 | activate-environment: test-environment 56 | auto-activate-base: false 57 | 58 | - name: Run import tests 59 | env: 60 | PYTHON_VERSION: ${{ matrix.python-version }} 61 | run: source continuous_integration/scripts/test_imports.sh 62 | -------------------------------------------------------------------------------- /dask/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from operator import add 4 | from time import sleep 5 | 6 | import pytest 7 | 8 | from dask.cache import Cache 9 | from dask.callbacks import Callback 10 | from dask.local import get_sync 11 | from dask.threaded import get 12 | 13 | cachey = pytest.importorskip("cachey") 14 | 15 | 16 | flag = [] 17 | 18 | 19 | def inc(x): 20 | flag.append(x) 21 | return x + 1 22 | 23 | 24 | def test_cache(): 25 | c = cachey.Cache(10000) 26 | cc = Cache(c) 27 | 28 | with cc: 29 | assert get({"x": (inc, 1)}, "x") == 2 30 | 31 | assert flag == [1] 32 | assert c.data["x"] == 2 33 | 34 | assert not cc.starttimes 35 | assert not cc.durations 36 | 37 | while flag: 38 | flag.pop() 39 | dsk = {"x": (inc, 1), "y": (inc, 2), 
"z": (add, "x", "y")} 40 | with cc: 41 | assert get(dsk, "z") == 5 42 | 43 | assert flag == [2] # no x present 44 | 45 | assert not Callback.active 46 | 47 | 48 | def test_cache_with_number(): 49 | c = Cache(10000, limit=1) 50 | assert isinstance(c.cache, cachey.Cache) 51 | assert c.cache.available_bytes == 10000 52 | assert c.cache.limit == 1 53 | 54 | 55 | def test_cache_correctness(): 56 | # https://github.com/dask/dask/issues/3631 57 | c = Cache(10000) 58 | da = pytest.importorskip("dask.array") 59 | from numpy import ones, zeros 60 | 61 | z = da.from_array(zeros(1), chunks=10) 62 | o = da.from_array(ones(1), chunks=10) 63 | with c: 64 | assert (z.compute() == 0).all() 65 | assert (o.compute() == 1).all() 66 | 67 | 68 | def f(duration, size, *args): 69 | sleep(duration) 70 | return [0] * size 71 | 72 | 73 | def test_prefer_cheap_dependent(): 74 | dsk = {"x": (f, 0.01, 10), "y": (f, 0.000001, 1, "x")} 75 | c = Cache(10000) 76 | with c: 77 | get_sync(dsk, "y") 78 | 79 | assert c.cache.scorer.cost["x"] < c.cache.scorer.cost["y"] 80 | -------------------------------------------------------------------------------- /docs/source/understanding-performance.rst: -------------------------------------------------------------------------------- 1 | Understanding Performance 2 | ========================= 3 | 4 | The first step in making computations run quickly is to understand the costs involved. 5 | In Python we often rely on tools like 6 | the `CProfile module `_, 7 | `%%prun IPython magic `_, 8 | `VMProf `_, or 9 | `snakeviz `_ 10 | to understand the costs associated with our code. 11 | However, few of these tools work well on multi-threaded or multi-process code, 12 | and fewer still on computations distributed among many machines. 13 | We also have new costs like data transfer, serialization, task scheduling overhead, and more 14 | that we may not be accustomed to tracking. 15 | 16 | Fortunately, the Dask schedulers come with diagnostics 17 | to help you understand the performance characteristics of your computations. 18 | By using these diagnostics and with some thought, 19 | we can often identify the slow parts of troublesome computations. 20 | 21 | The :doc:`single-machine and distributed schedulers ` come with *different* diagnostic tools. 22 | These tools are deeply integrated into each scheduler, 23 | so a tool designed for one will not transfer over to the other. 24 | 25 | These pages provide four options for profiling parallel code: 26 | 27 | 1. :doc:`Visualize task graphs ` 28 | 2. :ref:`Single threaded scheduler and a normal Python profiler ` 29 | 3. :doc:`Diagnostics for the single-machine scheduler ` 30 | 4. 
:doc:`Diagnostics for the distributed scheduler and dashboard ` 31 | 32 | Additionally, if you are interested in understanding the various phases where 33 | slowdown can occur, you may wish to read the following: 34 | 35 | - :doc:`Phases of computation ` 36 | -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Conda build 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | paths: 8 | - setup.py 9 | - continuous_integration/recipe/** 10 | - .github/workflows/conda.yml 11 | 12 | # When this workflow is queued, automatically cancel any previous running 13 | # or pending jobs from the same branch 14 | concurrency: 15 | group: conda-${{ github.head_ref }} 16 | cancel-in-progress: true 17 | 18 | # Required shell entrypoint to have properly activated conda environments 19 | defaults: 20 | run: 21 | shell: bash -l {0} 22 | 23 | jobs: 24 | conda: 25 | name: Build (and upload) 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v3.5.3 29 | with: 30 | fetch-depth: 0 31 | - name: Set up Python 32 | uses: conda-incubator/setup-miniconda@v2.2.0 33 | with: 34 | miniforge-variant: Mambaforge 35 | use-mamba: true 36 | python-version: 3.9 37 | channel-priority: strict 38 | - name: Install dependencies 39 | run: | 40 | mamba install -c conda-forge boa conda-verify 41 | 42 | which python 43 | pip list 44 | mamba list 45 | - name: Build conda package 46 | run: | 47 | # suffix for nightly package versions 48 | export VERSION_SUFFIX=a`date +%y%m%d` 49 | 50 | conda mambabuild continuous_integration/recipe \ 51 | --no-anaconda-upload \ 52 | --output-folder . 53 | - name: Upload conda package 54 | if: | 55 | github.event_name == 'push' 56 | && github.ref == 'refs/heads/main' 57 | && github.repository == 'dask/dask' 58 | env: 59 | ANACONDA_API_TOKEN: ${{ secrets.DASK_CONDA_TOKEN }} 60 | run: | 61 | # install anaconda for upload 62 | mamba install -c conda-forge anaconda-client 63 | 64 | anaconda upload --label dev noarch/*.tar.bz2 65 | -------------------------------------------------------------------------------- /dask/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Control global computation context 3 | """ 4 | from __future__ import annotations 5 | 6 | import threading 7 | from functools import partial 8 | 9 | from dask import config 10 | 11 | _globals = config.config 12 | 13 | 14 | thread_state = threading.local() 15 | 16 | 17 | def globalmethod(default=None, key=None, falsey=None): 18 | """Allow function to be taken over by globals 19 | 20 | This modifies a method so that occurrences of it may be taken over by 21 | functions registered in the global options. Can be used as a decorator or a 22 | function. 23 | 24 | Parameters 25 | ---------- 26 | default : callable 27 | The default callable to use. 28 | key : str 29 | Key under which we register this function in the global parameters 30 | falsey : callable, None, optional 31 | A function to use if the option is falsey. If not provided, the default 32 | is used instead. 33 | 34 | Examples 35 | -------- 36 | >>> import dask 37 | >>> class Foo: 38 | ... @globalmethod(key='bar', falsey=lambda: 3) 39 | ... def bar(): 40 | ... return 1 41 | >>> f = Foo() 42 | >>> f.bar() 43 | 1 44 | >>> with dask.config.set(bar=lambda: 2): 45 | ... print(f.bar()) 46 | 2 47 | >>> with dask.config.set(bar=False): 48 | ... 
print(f.bar()) 49 | 3 50 | """ 51 | if default is None: 52 | return partial(globalmethod, key=key, falsey=falsey) 53 | return GlobalMethod(default=default, key=key, falsey=falsey) 54 | 55 | 56 | class GlobalMethod: 57 | def __init__(self, default, key, falsey=None): 58 | self._default = default 59 | self._key = key 60 | self._falsey = falsey 61 | 62 | def __get__(self, instance, owner=None): 63 | if self._key in _globals: 64 | if _globals[self._key]: 65 | return _globals[self._key] 66 | elif self._falsey is not None: 67 | return self._falsey 68 | return self._default 69 | -------------------------------------------------------------------------------- /dask/dataframe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | import dask.dataframe._pyarrow_compat 5 | from dask.base import compute 6 | from dask.dataframe import backends, dispatch, rolling 7 | from dask.dataframe.core import ( 8 | DataFrame, 9 | Index, 10 | Series, 11 | _Frame, 12 | map_partitions, 13 | repartition, 14 | to_datetime, 15 | to_timedelta, 16 | ) 17 | from dask.dataframe.groupby import Aggregation 18 | from dask.dataframe.io import ( 19 | demo, 20 | from_array, 21 | from_dask_array, 22 | from_delayed, 23 | from_dict, 24 | from_map, 25 | from_pandas, 26 | read_csv, 27 | read_fwf, 28 | read_hdf, 29 | read_json, 30 | read_sql, 31 | read_sql_query, 32 | read_sql_table, 33 | read_table, 34 | to_bag, 35 | to_csv, 36 | to_hdf, 37 | to_json, 38 | to_records, 39 | to_sql, 40 | ) 41 | from dask.dataframe.multi import concat, merge, merge_asof 42 | from dask.dataframe.numeric import to_numeric 43 | from dask.dataframe.optimize import optimize 44 | from dask.dataframe.reshape import get_dummies, melt, pivot_table 45 | from dask.dataframe.utils import assert_eq 46 | 47 | try: 48 | from dask.dataframe.io import read_parquet, to_parquet 49 | except ImportError: 50 | pass 51 | try: 52 | from dask.dataframe.io import read_orc, to_orc 53 | except ImportError: 54 | pass 55 | try: 56 | from dask.dataframe.core import isna 57 | except ImportError: 58 | pass 59 | except ImportError as e: 60 | msg = ( 61 | "Dask dataframe requirements are not installed.\n\n" 62 | "Please either conda or pip install as follows:\n\n" 63 | " conda install dask # either conda install\n" 64 | ' python -m pip install "dask[dataframe]" --upgrade # or python -m pip install' 65 | ) 66 | raise ImportError(msg) from e 67 | -------------------------------------------------------------------------------- /dask/dataframe/_pyarrow_compat.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copyreg 4 | 5 | import pandas as pd 6 | 7 | try: 8 | import pyarrow as pa 9 | except ImportError: 10 | pa = None 11 | 12 | from dask.dataframe._compat import PANDAS_GE_150, PANDAS_GE_200 13 | 14 | # Pickling of pyarrow arrays is effectively broken - pickling a slice of an 15 | # array ends up pickling the entire backing array. 16 | # 17 | # See https://issues.apache.org/jira/browse/ARROW-10739 18 | # 19 | # This comes up when using pandas `string[pyarrow]` dtypes, which are backed by 20 | # a `pyarrow.StringArray`. To fix this, we register a *global* override for 21 | # pickling `ArrowStringArray` or `ArrowExtensionArray` types (where available). 
22 | # We do this at the pandas level rather than the pyarrow level for efficiency reasons 23 | # (a pandas ArrowStringArray may contain many small pyarrow StringArray objects). 24 | # 25 | # The implementation here is based on https://github.com/pandas-dev/pandas/pull/49078 26 | # which is included in pandas=2+. We can remove all this once Dask's minimum 27 | # supported pandas version is at least 2.0.0. 28 | 29 | 30 | def rebuild_arrowextensionarray(type_, chunks): 31 | array = pa.chunked_array(chunks) 32 | return type_(array) 33 | 34 | 35 | def reduce_arrowextensionarray(x): 36 | return (rebuild_arrowextensionarray, (type(x), x._data.combine_chunks())) 37 | 38 | 39 | # `pandas=2` includes efficient serialization of `pyarrow`-backed extension arrays. 40 | # See https://github.com/pandas-dev/pandas/pull/49078 for details. 41 | # We only need to backport efficient serialization for `pandas<2`. 42 | if pa is not None and not PANDAS_GE_200: 43 | if PANDAS_GE_150: 44 | # Applies to all `pyarrow`-backed extension arrays (e.g. `string[pyarrow]`, `int64[pyarrow]`) 45 | for type_ in [pd.arrays.ArrowExtensionArray, pd.arrays.ArrowStringArray]: 46 | copyreg.dispatch_table[type_] = reduce_arrowextensionarray 47 | else: 48 | # Only `string[pyarrow]` is implemented, so just patch that 49 | copyreg.dispatch_table[pd.arrays.ArrowStringArray] = reduce_arrowextensionarray 50 | -------------------------------------------------------------------------------- /dask/widgets/templates/highlevelgraph_layer.html.j2: -------------------------------------------------------------------------------- 1 |
2 | 3 | {% if materialized %} 4 | 5 | {% else %} 6 | 7 | {% endif %} 8 | 9 | 10 |
11 | 12 |

Layer{{ layer_index }}: {{ shortname }}

13 |
14 |

15 | {{ highlevelgraph_key }} 16 |

17 | 18 | 19 | 20 | 43 | 46 | 47 |
21 | 22 | {% for key, val in info.items() %} 23 | 24 | 25 | 26 | 27 | {% endfor %} 28 | {% for dep in dependencies %} 29 | {% if loop.index > 1 %} 30 | 31 | 32 | 33 | 34 | {% else %} 35 | 36 | 37 | 38 | 39 | {% endif %} 40 | {% endfor %} 41 |
{{ key }}{{ val }}
{{ dep }}
depends on {{ dep }}
42 |
44 | {{ svg_repr }} 45 |
48 | 49 |
50 |
51 | -------------------------------------------------------------------------------- /docs/source/bag-api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: dask.bag 5 | 6 | Create Bags 7 | ----------- 8 | 9 | .. autosummary:: 10 | :toctree: generated/ 11 | 12 | from_sequence 13 | from_delayed 14 | from_url 15 | range 16 | read_text 17 | read_avro 18 | 19 | From dataframe 20 | ~~~~~~~~~~~~~~ 21 | 22 | .. currentmodule:: dask.dataframe 23 | 24 | .. autosummary:: 25 | :toctree: generated/ 26 | 27 | DataFrame.to_bag 28 | Series.to_bag 29 | 30 | Top-level functions 31 | ------------------- 32 | 33 | .. currentmodule:: dask.bag 34 | 35 | .. autosummary:: 36 | :toctree: generated/ 37 | 38 | concat 39 | map 40 | map_partitions 41 | to_textfiles 42 | zip 43 | 44 | Random Sampling 45 | --------------- 46 | 47 | .. autosummary:: 48 | :toctree: generated/ 49 | 50 | random.choices 51 | random.sample 52 | 53 | 54 | Turn Bags into other things 55 | --------------------------- 56 | 57 | .. autosummary:: 58 | :toctree: generated/ 59 | 60 | Bag.to_textfiles 61 | Bag.to_dataframe 62 | Bag.to_delayed 63 | Bag.to_avro 64 | 65 | 66 | Bag Methods 67 | ----------- 68 | 69 | .. autosummary:: 70 | :toctree: generated/ 71 | 72 | Bag 73 | Bag.accumulate 74 | Bag.all 75 | Bag.any 76 | Bag.compute 77 | Bag.count 78 | Bag.distinct 79 | Bag.filter 80 | Bag.flatten 81 | Bag.fold 82 | Bag.foldby 83 | Bag.frequencies 84 | Bag.groupby 85 | Bag.join 86 | Bag.map 87 | Bag.map_partitions 88 | Bag.max 89 | Bag.mean 90 | Bag.min 91 | Bag.persist 92 | Bag.pluck 93 | Bag.product 94 | Bag.reduction 95 | Bag.random_sample 96 | Bag.remove 97 | Bag.repartition 98 | Bag.starmap 99 | Bag.std 100 | Bag.sum 101 | Bag.take 102 | Bag.to_avro 103 | Bag.to_dataframe 104 | Bag.to_delayed 105 | Bag.to_textfiles 106 | Bag.topk 107 | Bag.var 108 | Bag.visualize 109 | 110 | 111 | Item Methods 112 | ------------ 113 | 114 | .. autosummary:: 115 | :toctree: generated/ 116 | 117 | Item 118 | Item.apply 119 | Item.compute 120 | Item.from_delayed 121 | Item.persist 122 | Item.to_delayed 123 | Item.visualize 124 | -------------------------------------------------------------------------------- /dask/cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from numbers import Number 5 | from timeit import default_timer 6 | 7 | from dask.callbacks import Callback 8 | 9 | overhead = sys.getsizeof(1.23) * 4 + sys.getsizeof(()) * 4 10 | 11 | 12 | class Cache(Callback): 13 | """Use cache for computation 14 | 15 | Examples 16 | -------- 17 | 18 | >>> cache = Cache(1e9) # doctest: +SKIP 19 | 20 | The cache can be used locally as a context manager around ``compute`` or 21 | ``get`` calls: 22 | 23 | >>> with cache: # doctest: +SKIP 24 | ... 
result = x.compute() 25 | 26 | You can also register a cache globally, so that it works for all 27 | computations: 28 | 29 | >>> cache.register() # doctest: +SKIP 30 | >>> cache.unregister() # doctest: +SKIP 31 | """ 32 | 33 | def __init__(self, cache, *args, **kwargs): 34 | try: 35 | import cachey 36 | except ImportError as ex: 37 | raise ImportError( 38 | 'Cache requires cachey, "{ex}" problem ' "importing".format(ex=str(ex)) 39 | ) from ex 40 | self._nbytes = cachey.nbytes 41 | if isinstance(cache, Number): 42 | cache = cachey.Cache(cache, *args, **kwargs) 43 | else: 44 | assert not args and not kwargs 45 | self.cache = cache 46 | self.starttimes = dict() 47 | 48 | def _start(self, dsk): 49 | self.durations = dict() 50 | overlap = set(dsk) & set(self.cache.data) 51 | for key in overlap: 52 | dsk[key] = self.cache.data[key] 53 | 54 | def _pretask(self, key, dsk, state): 55 | self.starttimes[key] = default_timer() 56 | 57 | def _posttask(self, key, value, dsk, state, id): 58 | duration = default_timer() - self.starttimes[key] 59 | deps = state["dependencies"][key] 60 | if deps: 61 | duration += max(self.durations.get(k, 0) for k in deps) 62 | self.durations[key] = duration 63 | nb = self._nbytes(value) + overhead + sys.getsizeof(key) * 4 64 | self.cache.put(key, value, cost=duration / nb / 1e9, nbytes=nb) 65 | 66 | def _finish(self, dsk, state, errored): 67 | self.starttimes.clear() 68 | self.durations.clear() 69 | -------------------------------------------------------------------------------- /dask/array/image.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from glob import glob 5 | 6 | try: 7 | from skimage.io import imread as sk_imread 8 | except (AttributeError, ImportError): 9 | pass 10 | 11 | from dask.array.core import Array 12 | from dask.base import tokenize 13 | 14 | 15 | def add_leading_dimension(x): 16 | return x[None, ...] 17 | 18 | 19 | def imread(filename, imread=None, preprocess=None): 20 | """Read a stack of images into a dask array 21 | 22 | Parameters 23 | ---------- 24 | 25 | filename: string 26 | A globstring like 'myfile.*.png' 27 | imread: function (optional) 28 | Optionally provide custom imread function. 29 | Function should expect a filename and produce a numpy array. 30 | Defaults to ``skimage.io.imread``. 31 | preprocess: function (optional) 32 | Optionally provide custom function to preprocess the image. 33 | Function should expect a numpy array for a single image. 34 | 35 | Examples 36 | -------- 37 | 38 | >>> from dask.array.image import imread 39 | >>> im = imread('2015-*-*.png') # doctest: +SKIP 40 | >>> im.shape # doctest: +SKIP 41 | (365, 1000, 1000, 3) 42 | 43 | Returns 44 | ------- 45 | 46 | Dask array of all images stacked along the first dimension. 47 | Each separate image file will be treated as an individual chunk. 
48 | """ 49 | imread = imread or sk_imread 50 | filenames = sorted(glob(filename)) 51 | if not filenames: 52 | raise ValueError("No files found under name %s" % filename) 53 | 54 | name = "imread-%s" % tokenize(filenames, map(os.path.getmtime, filenames)) 55 | 56 | sample = imread(filenames[0]) 57 | if preprocess: 58 | sample = preprocess(sample) 59 | 60 | keys = [(name, i) + (0,) * len(sample.shape) for i in range(len(filenames))] 61 | if preprocess: 62 | values = [ 63 | (add_leading_dimension, (preprocess, (imread, fn))) for fn in filenames 64 | ] 65 | else: 66 | values = [(add_leading_dimension, (imread, fn)) for fn in filenames] 67 | dsk = dict(zip(keys, values)) 68 | 69 | chunks = ((1,) * len(filenames),) + tuple((d,) for d in sample.shape) 70 | 71 | return Array(dsk, name, chunks, sample.dtype) 72 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: debug-statements 7 | - repo: https://github.com/MarcoGorelli/absolufy-imports 8 | rev: v0.3.1 9 | hooks: 10 | - id: absolufy-imports 11 | name: absolufy-imports 12 | - repo: https://github.com/pycqa/isort 13 | rev: 5.12.0 14 | hooks: 15 | - id: isort 16 | language_version: python3 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: v3.4.0 19 | hooks: 20 | - id: pyupgrade 21 | args: 22 | - --py39-plus 23 | - repo: https://github.com/psf/black 24 | rev: 23.3.0 25 | hooks: 26 | - id: black 27 | language_version: python3 28 | args: 29 | - --target-version=py39 30 | - repo: https://github.com/pycqa/flake8 31 | rev: 6.0.0 32 | hooks: 33 | - id: flake8 34 | language_version: python3 35 | additional_dependencies: 36 | # NOTE: autoupdate does not pick up flake8-bugbear since it is a transitive 37 | # dependency. Make sure to update flake8-bugbear manually on a regular basis. 
38 | - flake8-bugbear==23.2.13 39 | - repo: https://github.com/codespell-project/codespell 40 | rev: v2.2.4 41 | hooks: 42 | - id: codespell 43 | types_or: [rst, markdown] 44 | files: docs 45 | additional_dependencies: 46 | - tomli 47 | - repo: https://github.com/pre-commit/mirrors-mypy 48 | # pinned due to 49 | # https://github.com/python/typeshed/pull/9771 and 50 | # https://github.com/python/mypy/issues/15257 for DaskCollection.__dask_scheduler__ 51 | rev: v1.1.1 52 | hooks: 53 | - id: mypy 54 | # Override default --ignore-missing-imports 55 | # Use pyproject.toml if possible instead of adding command line parameters here 56 | args: [--warn-unused-configs] 57 | additional_dependencies: 58 | # Type stubs 59 | # - pandas-stubs # TODO 60 | - types-docutils 61 | - types-PyYAML 62 | - types-psutil 63 | - types-requests 64 | - types-setuptools 65 | # Typed libraries 66 | - numpy 67 | - pytest 68 | -------------------------------------------------------------------------------- /dask/dask.yaml: -------------------------------------------------------------------------------- 1 | temporary-directory: null # Directory for local disk like /tmp, /scratch, or /local 2 | 3 | visualization: 4 | engine: null # Default visualization engine to use when calling `.visualize()` on a collection 5 | 6 | tokenize: 7 | ensure-deterministic: false # If true, tokenize will error instead of falling back to uuids 8 | 9 | dataframe: 10 | backend: "pandas" # Backend dataframe library for input IO and data creation 11 | shuffle: 12 | method: null 13 | compression: null # compression for on disk-shuffling. Partd supports ZLib, BZ2, SNAPPY 14 | parquet: 15 | metadata-task-size-local: 512 # Number of files per local metadata-processing task 16 | metadata-task-size-remote: 1 # Number of files per remote metadata-processing task 17 | convert-string: null # Whether to convert string-like data to pyarrow strings 18 | 19 | array: 20 | backend: "numpy" # Backend array library for input IO and data creation 21 | chunk-size: "128MiB" 22 | rechunk: 23 | method: "tasks" # Rechunking method to use 24 | threshold: 4 25 | svg: 26 | size: 120 # pixels 27 | slicing: 28 | split-large-chunks: null # How to handle large output chunks in slicing. Warns by default. 
29 | 30 | optimization: 31 | annotations: 32 | fuse: true # Automatically fuse compatible annotations on layers 33 | fuse: 34 | active: null # Treat as false for dask.dataframe, true for everything else 35 | ave-width: 1 36 | max-width: null # 1.5 + ave_width * log(ave_width + 1) 37 | max-height: .inf 38 | max-depth-new-edges: null # ave_width * 1.5 39 | subgraphs: null # true for dask.dataframe, false for everything else 40 | rename-keys: true 41 | 42 | admin: 43 | traceback: 44 | shorten: 45 | when: 46 | - dask[\\\/]base.py 47 | - distributed[\\\/]client.py 48 | what: 49 | - dask[\\\/]base.py 50 | - dask[\\\/]core.py 51 | - dask[\\\/]array[\\\/]core.py 52 | - dask[\\\/]optimization.py 53 | - dask[\\\/]dataframe[\\\/]core.py 54 | - dask[\\\/]dataframe[\\\/]methods.py 55 | - dask[\\\/]utils.py 56 | - distributed[\\\/]worker.py 57 | - distributed[\\\/]scheduler.py 58 | - distributed[\\\/]client.py 59 | - distributed[\\\/]utils.py 60 | - tornado[\\\/]gen.py 61 | - pandas[\\\/]core[\\\/] 62 | -------------------------------------------------------------------------------- /docs/source/array-stack.rst: -------------------------------------------------------------------------------- 1 | Stack, Concatenate, and Block 2 | ============================= 3 | 4 | Often we have many arrays stored on disk that we want to stack together and 5 | think of as one large array. This is common with geospatial data in which we 6 | might have many HDF5/NetCDF files on disk, one for every day, but we want to do 7 | operations that span multiple days. 8 | 9 | To solve this problem, we use the functions ``da.stack``, ``da.concatenate``, 10 | and ``da.block``. 11 | 12 | Stack 13 | ----- 14 | 15 | We stack many existing Dask arrays into a new array, creating a new dimension 16 | as we go. 17 | 18 | .. code-block:: python 19 | 20 | >>> import dask.array as da 21 | 22 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 23 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 24 | 25 | >>> data = [arr0, arr1] 26 | 27 | >>> x = da.stack(data, axis=0) 28 | >>> x.shape 29 | (2, 3, 4) 30 | 31 | >>> da.stack(data, axis=1).shape 32 | (3, 2, 4) 33 | 34 | >>> da.stack(data, axis=-1).shape 35 | (3, 4, 2) 36 | 37 | This creates a new dimension with length equal to the number of slices 38 | 39 | Concatenate 40 | ----------- 41 | 42 | We concatenate existing arrays into a new array, extending them along an 43 | existing dimension 44 | 45 | .. code-block:: python 46 | 47 | >>> import dask.array as da 48 | >>> import numpy as np 49 | 50 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 51 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 52 | 53 | >>> data = [arr0, arr1] 54 | 55 | >>> x = da.concatenate(data, axis=0) 56 | >>> x.shape 57 | (6, 4) 58 | 59 | >>> da.concatenate(data, axis=1).shape 60 | (3, 8) 61 | 62 | Block 63 | ----- 64 | 65 | We can handle a larger variety of cases with ``da.block`` as it allows 66 | concatenation to be applied over multiple dimensions at once. This is useful if 67 | your chunks tile a space, for example if small squares tile a larger 2-D plane. 68 | 69 | .. code-block:: python 70 | 71 | >>> import dask.array as da 72 | >>> import numpy as np 73 | 74 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 75 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 76 | 77 | >>> data = [ 78 | ... [arr0, arr1], 79 | ... [arr1, arr0] 80 | ... 
] 81 | 82 | >>> x = da.block(data) 83 | >>> x.shape 84 | (6, 8) 85 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_numeric.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from dask.array import Array, from_array 8 | from dask.dataframe import Series, from_pandas, to_numeric 9 | from dask.dataframe.utils import pyarrow_strings_enabled 10 | from dask.delayed import Delayed 11 | 12 | 13 | @pytest.mark.parametrize("arg", ["5", 5, "5 "]) 14 | def test_to_numeric_on_scalars(arg): 15 | output = to_numeric(arg) 16 | assert isinstance(output, Delayed) 17 | assert output.compute() == 5 18 | 19 | 20 | def test_to_numeric_on_dask_array(): 21 | arg = from_array(["1.0", "2", "-3", "5.1"]) 22 | expected = np.array([1.0, 2.0, -3.0, 5.1]) 23 | output = to_numeric(arg) 24 | assert isinstance(output, Array) 25 | assert list(output.compute()) == list(expected) 26 | 27 | 28 | def test_to_numeric_on_dask_dataframe_series(): 29 | s = pd.Series(["1.0", "2", -3, -5.1]) 30 | arg = from_pandas(s, npartitions=2) 31 | expected = pd.to_numeric(s) 32 | output = to_numeric(arg) 33 | expected_dtype = "int64" 34 | if pyarrow_strings_enabled(): 35 | # `to_numeric` output depends on input dtype 36 | expected_dtype = "Int64" 37 | assert output.dtype == expected_dtype 38 | assert isinstance(output, Series) 39 | assert list(output.compute()) == list(expected) 40 | 41 | 42 | def test_to_numeric_on_dask_dataframe_series_with_meta(): 43 | s = pd.Series(["1.0", "2", -3, -5.1]) 44 | arg = from_pandas(s, npartitions=2) 45 | expected = pd.to_numeric(s) 46 | output = to_numeric(arg, meta=pd.Series([], dtype="float64")) 47 | assert output.dtype == "float64" 48 | assert isinstance(output, Series) 49 | assert list(output.compute()) == list(expected) 50 | 51 | 52 | def test_to_numeric_on_dask_dataframe_dataframe_raises_error(): 53 | s = pd.Series(["1.0", "2", -3, -5.1]) 54 | df = pd.DataFrame({"a": s, "b": s}) 55 | arg = from_pandas(df, npartitions=2) 56 | with pytest.raises(TypeError, match="arg must be a list, tuple, dask."): 57 | to_numeric(arg) 58 | 59 | 60 | def test_to_numeric_raises(): 61 | with pytest.raises(ValueError, match="invalid error value"): 62 | to_numeric("10", errors="invalid") 63 | with pytest.raises(KeyError, match="``meta`` is not allowed"): 64 | to_numeric("10", meta=pd.Series([], dtype="float64")) 65 | -------------------------------------------------------------------------------- /dask/array/cupy_entry_point.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask.array as da 4 | from dask import config 5 | from dask.array.backends import ArrayBackendEntrypoint, register_cupy 6 | from dask.array.core import Array 7 | from dask.array.dispatch import to_cupy_dispatch 8 | 9 | 10 | def _cupy(strict=True): 11 | try: 12 | import cupy 13 | except ImportError: 14 | if strict: 15 | raise ImportError("Please install `cupy` to use `CupyBackendEntrypoint`") 16 | return None 17 | return cupy 18 | 19 | 20 | def _da_with_cupy_meta(attr, *args, meta=None, **kwargs): 21 | # Call the dask.array api with cupy-based meta 22 | meta = _cupy().empty(()) if meta is None else meta 23 | with config.set({"array.backend": "numpy"}): 24 | return getattr(da, attr)(*args, meta=meta, **kwargs) 25 | 26 | 27 | class 
CupyBackendEntrypoint(ArrayBackendEntrypoint): 28 | def __init__(self): 29 | """Register data-directed dispatch functions""" 30 | if _cupy(strict=False): 31 | register_cupy() 32 | 33 | @classmethod 34 | def to_backend_dispatch(cls): 35 | return to_cupy_dispatch 36 | 37 | @classmethod 38 | def to_backend(cls, data: Array, **kwargs): 39 | if isinstance(data._meta, _cupy().ndarray): 40 | # Already a cupy-backed collection 41 | return data 42 | return data.map_blocks(cls.to_backend_dispatch(), **kwargs) 43 | 44 | @property 45 | def RandomState(self): 46 | return _cupy().random.RandomState 47 | 48 | @property 49 | def default_bit_generator(self): 50 | return _cupy().random.XORWOW 51 | 52 | @staticmethod 53 | def ones(*args, **kwargs): 54 | return _da_with_cupy_meta("ones", *args, **kwargs) 55 | 56 | @staticmethod 57 | def zeros(*args, **kwargs): 58 | return _da_with_cupy_meta("zeros", *args, **kwargs) 59 | 60 | @staticmethod 61 | def empty(*args, **kwargs): 62 | return _da_with_cupy_meta("empty", *args, **kwargs) 63 | 64 | @staticmethod 65 | def full(*args, **kwargs): 66 | return _da_with_cupy_meta("full", *args, **kwargs) 67 | 68 | @staticmethod 69 | def arange(*args, like=None, **kwargs): 70 | like = _cupy().empty(()) if like is None else like 71 | with config.set({"array.backend": "numpy"}): 72 | return da.arange(*args, like=like, **kwargs) 73 | -------------------------------------------------------------------------------- /dask/dataframe/_dtypes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import date, time 4 | from decimal import Decimal 5 | 6 | import pandas as pd 7 | 8 | from dask.dataframe._compat import PANDAS_GE_150 9 | from dask.dataframe.extensions import make_array_nonempty, make_scalar 10 | 11 | 12 | @make_array_nonempty.register(pd.DatetimeTZDtype) 13 | def _(dtype): 14 | return pd.array([pd.Timestamp(1), pd.NaT], dtype=dtype) 15 | 16 | 17 | @make_scalar.register(pd.DatetimeTZDtype) 18 | def _(x): 19 | return pd.Timestamp(1, tz=x.tz, unit=x.unit) 20 | 21 | 22 | @make_array_nonempty.register(pd.StringDtype) 23 | def _(dtype): 24 | return pd.array(["a", pd.NA], dtype=dtype) 25 | 26 | 27 | if PANDAS_GE_150: 28 | 29 | @make_array_nonempty.register(pd.ArrowDtype) 30 | def _make_array_nonempty_pyarrow_dtype(dtype): 31 | import pyarrow as pa 32 | 33 | if pa.types.is_integer(dtype.pyarrow_dtype): 34 | data = [1, 2] 35 | elif pa.types.is_floating(dtype.pyarrow_dtype): 36 | data = [1.5, 2.5] 37 | elif pa.types.is_boolean(dtype.pyarrow_dtype): 38 | data = [True, False] 39 | elif pa.types.is_string(dtype.pyarrow_dtype) or pa.types.is_large_string( 40 | dtype.pyarrow_dtype 41 | ): 42 | data = ["a", "b"] 43 | elif pa.types.is_timestamp(dtype.pyarrow_dtype): 44 | data = [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-02")] 45 | elif pa.types.is_date(dtype.pyarrow_dtype): 46 | data = [date(1970, 1, 1), date(1970, 1, 2)] 47 | elif pa.types.is_binary(dtype.pyarrow_dtype) or pa.types.is_large_binary( 48 | dtype.pyarrow_dtype 49 | ): 50 | data = [b"a", b"b"] 51 | elif pa.types.is_decimal(dtype.pyarrow_dtype): 52 | data = [Decimal("1"), Decimal("0.0")] 53 | elif pa.types.is_duration(dtype.pyarrow_dtype): 54 | data = [pd.Timedelta("1 day"), pd.Timedelta("2 days")] 55 | elif pa.types.is_time(dtype.pyarrow_dtype): 56 | data = [time(12, 0), time(0, 12)] 57 | else: 58 | data = dtype.empty(2) 59 | return pd.array(data, dtype=dtype) 60 | 61 | 62 | @make_scalar.register(str) 63 | def _(x): 64 | return 
"s" 65 | 66 | 67 | @make_array_nonempty.register(pd.BooleanDtype) 68 | def _(dtype): 69 | return pd.array([True, pd.NA], dtype=dtype) 70 | 71 | 72 | @make_scalar.register(bool) 73 | def _(x): 74 | return True 75 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_reductions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | pytestmark = pytest.mark.gpu 9 | 10 | import dask 11 | import dask.array as da 12 | from dask.array.utils import assert_eq 13 | 14 | cupy = pytest.importorskip("cupy") 15 | 16 | 17 | @pytest.mark.parametrize( 18 | ["dfunc", "func"], 19 | [ 20 | (da.argmin, np.argmin), 21 | (da.argmax, np.argmax), 22 | (da.nanargmin, np.nanargmin), 23 | (da.nanargmax, np.nanargmax), 24 | ], 25 | ) 26 | def test_arg_reductions(dfunc, func): 27 | x = cupy.random.default_rng().random((10, 10, 10)) 28 | a = da.from_array(x, chunks=(3, 4, 5)) 29 | 30 | assert_eq(dfunc(a), func(x)) 31 | assert_eq(dfunc(a, 0), func(x, 0)) 32 | assert_eq(dfunc(a, 1), func(x, 1)) 33 | assert_eq(dfunc(a, 2), func(x, 2)) 34 | with dask.config.set(split_every=2): 35 | assert_eq(dfunc(a), func(x)) 36 | assert_eq(dfunc(a, 0), func(x, 0)) 37 | assert_eq(dfunc(a, 1), func(x, 1)) 38 | assert_eq(dfunc(a, 2), func(x, 2)) 39 | 40 | pytest.raises(ValueError, lambda: dfunc(a, 3)) 41 | pytest.raises(TypeError, lambda: dfunc(a, (0, 1))) 42 | 43 | x2 = cupy.arange(10) 44 | a2 = da.from_array(x2, chunks=3) 45 | assert_eq(dfunc(a2), func(x2)) 46 | assert_eq(dfunc(a2, 0), func(x2, 0)) 47 | assert_eq(dfunc(a2, 0, split_every=2), func(x2, 0)) 48 | 49 | 50 | @pytest.mark.parametrize( 51 | ["dfunc", "func"], [(da.nanargmin, np.nanargmin), (da.nanargmax, np.nanargmax)] 52 | ) 53 | def test_nanarg_reductions(dfunc, func): 54 | x = cupy.random.default_rng().random((10, 10, 10)) 55 | x[5] = cupy.nan 56 | a = da.from_array(x, chunks=(3, 4, 5)) 57 | assert_eq(dfunc(a), func(x)) 58 | assert_eq(dfunc(a, 0), func(x, 0)) 59 | 60 | with warnings.catch_warnings(): 61 | warnings.simplefilter("ignore", RuntimeWarning) # All-NaN slice encountered 62 | with pytest.raises(ValueError): 63 | dfunc(a, 1).compute() 64 | 65 | with pytest.raises(ValueError): 66 | dfunc(a, 2).compute() 67 | 68 | x[:] = cupy.nan 69 | a = da.from_array(x, chunks=(3, 4, 5)) 70 | with pytest.raises(ValueError): 71 | dfunc(a).compute() 72 | 73 | 74 | @pytest.mark.parametrize("func", [np.cumsum, np.cumprod]) 75 | def test_cumreduction_with_cupy(func): 76 | a = cupy.ones((10, 10)) 77 | b = da.from_array(a, chunks=(4, 4)) 78 | result = func(b, axis=0) 79 | assert_eq(result, func(a, axis=0)) 80 | -------------------------------------------------------------------------------- /dask/widgets/templates/highlevelgraph.html.j2: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 |

HighLevelGraph

17 |

18 | {{ type }} with {{ layers | length }} layers and {{ n_outputs }} keys from all layers. 19 |

20 | {% for layer in toposort %} 21 | {{ layers[layer]._repr_html_(layer_index=loop.index, highlevelgraph_key=layer, dependencies=layer_dependencies[layer])}} 22 | {% endfor %} 23 |
24 |
25 |
26 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | schedule: 5 | - cron: "0 1 * * *" 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | # Required shell entrypoint to have properly activated conda environments 11 | defaults: 12 | run: 13 | shell: bash -l {0} 14 | 15 | jobs: 16 | 17 | check: 18 | runs-on: ubuntu-latest 19 | if: github.event_name == 'push' || github.event_name == 'pull_request' 20 | outputs: 21 | test-upstream: ${{ steps.detect-trigger.outputs.trigger-found }} 22 | steps: 23 | - uses: actions/checkout@v3.5.3 24 | with: 25 | fetch-depth: 2 26 | - uses: xarray-contrib/ci-trigger@v1 27 | id: detect-trigger 28 | with: 29 | keyword: "test-upstream" 30 | 31 | build: 32 | needs: check 33 | runs-on: ubuntu-latest 34 | if: | 35 | always() 36 | && ( 37 | needs.check.outputs.test-upstream == 'true' 38 | || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'upstream')) 39 | || (github.repository == 'dask/dask' && github.event_name != 'pull_request') 40 | ) 41 | timeout-minutes: 90 42 | 43 | env: 44 | COVERAGE: "true" 45 | PARALLEL: "true" 46 | UPSTREAM_DEV: 1 47 | 48 | steps: 49 | - name: Checkout source 50 | uses: actions/checkout@v3.5.3 51 | 52 | - name: Setup Conda Environment 53 | uses: conda-incubator/setup-miniconda@v2.2.0 54 | with: 55 | miniforge-variant: Mambaforge 56 | miniforge-version: latest 57 | use-mamba: true 58 | channel-priority: strict 59 | python-version: "3.10" 60 | environment-file: continuous_integration/environment-3.10.yaml 61 | activate-environment: test-environment 62 | auto-activate-base: false 63 | 64 | - name: Install 65 | run: source continuous_integration/scripts/install.sh 66 | 67 | - name: Run tests 68 | id: run_tests 69 | env: 70 | XTRATESTARGS: "--report-log output-log.jsonl" 71 | run: source continuous_integration/scripts/run_tests.sh 72 | 73 | - name: Open or update issue on failure 74 | if: | 75 | failure() 76 | && github.event_name != 'pull_request' 77 | && github.repository == 'dask/dask' 78 | && steps.run_tests.outcome == 'failure' 79 | uses: xarray-contrib/issue-from-pytest-log@v1.2.6 80 | with: 81 | log-path: output-log.jsonl 82 | issue-title: ⚠️ Upstream CI failed ⚠️ 83 | issue-label: upstream 84 | 85 | - name: Coverage 86 | uses: codecov/codecov-action@v3 87 | -------------------------------------------------------------------------------- /continuous_integration/scripts/install.sh: -------------------------------------------------------------------------------- 1 | set -xe 2 | 3 | if [[ ${UPSTREAM_DEV} ]]; then 4 | 5 | # NOTE: `dask/tests/test_ci.py::test_upstream_packages_installed` should up be 6 | # updated when pacakges here are updated. 7 | 8 | # FIXME https://github.com/mamba-org/mamba/issues/412 9 | # mamba uninstall --force ... 10 | conda uninstall --force bokeh 11 | mamba install -y -c bokeh/label/dev bokeh 12 | 13 | # FIXME https://github.com/mamba-org/mamba/issues/412 14 | # mamba uninstall --force ... 15 | conda uninstall --force pyarrow 16 | python -m pip install --no-deps \ 17 | --extra-index-url https://pypi.fury.io/arrow-nightlies/ \ 18 | --prefer-binary --pre pyarrow 19 | 20 | # FIXME https://github.com/mamba-org/mamba/issues/412 21 | # mamba uninstall --force ... 
22 | conda uninstall --force fastparquet 23 | python -m pip install \ 24 | --upgrade \ 25 | locket \ 26 | git+https://github.com/pydata/sparse \ 27 | git+https://github.com/dask/s3fs \ 28 | git+https://github.com/intake/filesystem_spec \ 29 | git+https://github.com/dask/partd \ 30 | git+https://github.com/dask/zict \ 31 | git+https://github.com/dask/distributed \ 32 | git+https://github.com/dask/fastparquet \ 33 | git+https://github.com/zarr-developers/zarr-python 34 | 35 | # FIXME https://github.com/mamba-org/mamba/issues/412 36 | # mamba uninstall --force ... 37 | conda uninstall --force numpy pandas scipy 38 | python -m pip install --no-deps --pre --retries 10 \ 39 | -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ 40 | numpy \ 41 | pandas \ 42 | scipy 43 | 44 | # Used when automatically opening an issue when the `upstream` CI build fails 45 | mamba install pytest-reportlog 46 | 47 | # Crick doesn't work with latest nightly `numpy`. Temporarily remove 48 | # `crick` from the upstream CI environment as a workaround. 49 | # Can restore `crick` once https://github.com/dask/crick/issues/25 is closed. 50 | 51 | # Tiledb is causing segfaults. Temporarily remove `tiledb` and `tiledb-py` 52 | # as a workaround. 53 | 54 | # FIXME https://github.com/mamba-org/mamba/issues/412 55 | # mamba uninstall --force ... 56 | conda uninstall --force crick tiledb tiledb-py 57 | 58 | 59 | fi 60 | 61 | # Install dask 62 | python -m pip install --quiet --no-deps -e .[complete] 63 | echo mamba list 64 | mamba list 65 | 66 | # For debugging 67 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 68 | mamba env export | grep -E -v '^prefix:.*$' > env.yaml 69 | cat env.yaml 70 | 71 | set +xe 72 | -------------------------------------------------------------------------------- /docs/source/images/unoverlapping-neighbors.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 20 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | 67 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /.github/workflows/update-gpuci.yml: -------------------------------------------------------------------------------- 1 | name: Check for gpuCI updates 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" # Daily “At 00:00” UTC 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update-gpuci: 10 | runs-on: ubuntu-latest 11 | if: github.repository == 'dask/dask' 12 | 13 | steps: 14 | - uses: actions/checkout@v3.5.3 15 | 16 | - name: Parse current axis YAML 17 | id: rapids_current 18 | uses: the-coding-turtle/ga-yaml-parser@v0.1.2 19 | with: 20 | file: continuous_integration/gpuci/axis.yaml 21 | 22 | - name: Get latest cuDF nightly version 23 | id: cudf_latest 24 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3 25 | with: 26 | org: "rapidsai-nightly" 27 | package: "cudf" 28 | version_system: "CalVer" 29 | 30 | - name: Get latest UCX-Py nightly version 31 | id: ucx_py_latest 32 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3 33 | with: 34 | org: "rapidsai-nightly" 35 | package: "ucx-py" 36 | version_system: "CalVer" 37 | 38 | - name: Get old RAPIDS / UCX-Py versions 39 | env: 40 | FULL_RAPIDS_VER: ${{ steps.cudf_latest.outputs.version }} 41 | FULL_UCX_PY_VER: ${{ steps.ucx_py_latest.outputs.version }} 42 | run: | 43 | echo RAPIDS_VER=${{ steps.rapids_current.outputs.RAPIDS_VER_0 }} >> $GITHUB_ENV 44 | echo UCX_PY_VER=$(curl -sL 
https://version.gpuci.io/rapids/${{ steps.rapids_current.outputs.RAPIDS_VER_0 }}) >> $GITHUB_ENV 45 | echo NEW_RAPIDS_VER=${FULL_RAPIDS_VER::-4} >> $GITHUB_ENV 46 | echo NEW_UCX_PY_VER=${FULL_UCX_PY_VER::-4} >> $GITHUB_ENV 47 | 48 | - name: Update RAPIDS version 49 | uses: jacobtomlinson/gha-find-replace@v3 50 | with: 51 | include: 'continuous_integration\/gpuci\/axis\.yaml' 52 | find: "${{ env.RAPIDS_VER }}" 53 | replace: "${{ env.NEW_RAPIDS_VER }}" 54 | regex: false 55 | 56 | - name: Create Pull Request 57 | uses: peter-evans/create-pull-request@v5 58 | if: ${{ env.UCX_PY_VER != env.NEW_UCX_PY_VER }} # make sure new ucx-py nightlies are available 59 | with: 60 | token: ${{ secrets.GITHUB_TOKEN }} 61 | draft: true 62 | commit-message: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`" 63 | title: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`" 64 | author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 65 | branch: "upgrade-gpuci-rapids" 66 | body: | 67 | New cuDF and ucx-py nightly versions have been detected. 68 | 69 | Updated `axis.yaml` to use `${{ env.NEW_RAPIDS_VER }}`. 70 | -------------------------------------------------------------------------------- /dask/dataframe/hyperloglog.py: -------------------------------------------------------------------------------- 1 | """Implementation of HyperLogLog 2 | 3 | This implements the HyperLogLog algorithm for cardinality estimation, found 4 | in 5 | 6 | Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier. 7 | "HyperLogLog: the analysis of a near-optimal cardinality estimation 8 | algorithm". 2007 Conference on Analysis of Algorithms. Nice, France 9 | (2007) 10 | 11 | """ 12 | from __future__ import annotations 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from pandas.util import hash_pandas_object 17 | 18 | 19 | def compute_first_bit(a): 20 | "Compute the position of the first nonzero bit for each int in an array." 21 | # TODO: consider making this less memory-hungry 22 | bits = np.bitwise_and.outer(a, 1 << np.arange(32)) 23 | bits = bits.cumsum(axis=1).astype(bool) 24 | return 33 - bits.sum(axis=1) 25 | 26 | 27 | def compute_hll_array(obj, b): 28 | # b is the number of bits 29 | 30 | if not 8 <= b <= 16: 31 | raise ValueError("b should be between 8 and 16") 32 | num_bits_discarded = 32 - b 33 | m = 1 << b 34 | 35 | # Get an array of the hashes 36 | hashes = hash_pandas_object(obj, index=False) 37 | if isinstance(hashes, pd.Series): 38 | hashes = hashes._values 39 | hashes = hashes.astype(np.uint32) 40 | 41 | # Of the first b bits, which is the first nonzero? 
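    # (In detail: shifting right by num_bits_discarded keeps the top b bits of each
    # 32-bit hash, giving the register index j; compute_first_bit then records the
    # position of the lowest set bit of each hash, and the groupby below keeps the
    # per-register maximum of that position, as HyperLogLog requires.)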
42 | j = hashes >> num_bits_discarded 43 | first_bit = compute_first_bit(hashes) 44 | 45 | # Pandas can do the max aggregation 46 | df = pd.DataFrame({"j": j, "first_bit": first_bit}) 47 | series = df.groupby("j").max()["first_bit"] 48 | 49 | # Return a dense array so we can concat them and get a result 50 | # that is easy to deal with 51 | return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8) 52 | 53 | 54 | def reduce_state(Ms, b): 55 | m = 1 << b 56 | 57 | # We concatenated all of the states, now we need to get the max 58 | # value for each j in both 59 | Ms = Ms.reshape((len(Ms) // m), m) 60 | return Ms.max(axis=0) 61 | 62 | 63 | def estimate_count(Ms, b): 64 | m = 1 << b 65 | 66 | # Combine one last time 67 | M = reduce_state(Ms, b) 68 | 69 | # Estimate cardinality, no adjustments 70 | alpha = 0.7213 / (1 + 1.079 / m) 71 | E = alpha * m / (2.0 ** -(M.astype("f8"))).sum() * m 72 | # ^^^^ starts as unsigned, need a signed type for 73 | # negation operator to do something useful 74 | 75 | # Apply adjustments for small / big cardinalities, if applicable 76 | if E < 2.5 * m: 77 | V = (M == 0).sum() 78 | if V: 79 | return m * np.log(m / V) 80 | if E > 2**32 / 30.0: 81 | return -(2**32) * np.log1p(-E / 2**32) 82 | return E 83 | -------------------------------------------------------------------------------- /docs/source/graph_manipulation.rst: -------------------------------------------------------------------------------- 1 | .. _graph_manipulation: 2 | 3 | Advanced graph manipulation 4 | =========================== 5 | There are some situations where computations with Dask collections will result in 6 | suboptimal memory usage (e.g. an entire Dask DataFrame is loaded into memory). 7 | This may happen when Dask’s scheduler doesn’t automatically delay the computation of 8 | nodes in a task graph to avoid occupying memory with their output for prolonged periods 9 | of time, or in scenarios where recalculating nodes is much cheaper than holding their 10 | output in memory. 11 | 12 | This page highlights a set of graph manipulation utilities which can be used to help 13 | avoid these scenarios. In particular, the utilities described below rewrite the 14 | underlying Dask graph for Dask collections, producing equivalent collections with 15 | different sets of keys. 16 | 17 | Consider the following example: 18 | 19 | .. code-block:: python 20 | 21 | >>> import dask.array as da 22 | >>> x = da.random.default_rng().normal(size=500_000_000, chunks=100_000) 23 | >>> x_mean = x.mean() 24 | >>> y = (x - x_mean).max().compute() 25 | 26 | The above example computes the largest value of a distribution after removing its bias. 27 | This involves loading the chunks of ``x`` into memory in order to compute ``x_mean``. 28 | However, since the ``x`` array is needed later in the computation to compute ``y``, the 29 | entire ``x`` array is kept in memory. For large Dask Arrays this can be very 30 | problematic. 31 | 32 | To alleviate the need for the entire ``x`` array to be kept in memory, one could rewrite 33 | the last line as follows: 34 | 35 | .. code-block:: python 36 | 37 | >>> from dask.graph_manipulation import bind 38 | >>> xb = bind(x, x_mean) 39 | >>> y = (xb - x_mean).max().compute() 40 | 41 | Here we use :func:`~dask.graph_manipulation.bind` to create a new Dask Array, ``xb``, 42 | which produces exactly the same output as ``x``, but whose underlying Dask graph has 43 | different keys than ``x``, and will only be computed after ``x_mean`` has been 44 | calculated. 
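One way to see that ``bind`` handed back a re-keyed copy rather than the original collection is to compare the collection names (a minimal check; the generated token in each name will differ from run to run):

.. code-block:: python

    >>> x.name == xb.name   # the bound copy has its own set of keys
    False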
45 | 46 | This results in the chunks of ``x`` being computed and immediately individually reduced 47 | by ``mean``; then recomputed and again immediately pipelined into the subtraction 48 | followed by reduction with ``max``. This results in a much smaller peak memory usage as 49 | the full ``x`` array is no longer loaded into memory. However, the tradeoff is that the 50 | compute time increases as ``x`` is computed twice. 51 | 52 | 53 | API 54 | --- 55 | 56 | .. currentmodule:: dask.graph_manipulation 57 | 58 | .. autosummary:: 59 | 60 | checkpoint 61 | wait_on 62 | bind 63 | clone 64 | 65 | 66 | Definitions 67 | ~~~~~~~~~~~ 68 | 69 | .. autofunction:: checkpoint 70 | .. autofunction:: wait_on 71 | .. autofunction:: bind 72 | .. autofunction:: clone 73 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | # The doctests in these files fail due to either: 8 | # - Non-required dependencies not being installed 9 | # - Imported doctests due to pulling the docstrings from other packages 10 | # (e.g. `numpy`). No need to run these doctests. 11 | collect_ignore = [ 12 | "dask/bytes/hdfs3.py", 13 | "dask/bytes/pyarrow.py", 14 | "dask/bytes/s3.py", 15 | "dask/array/ghost.py", 16 | "dask/array/fft.py", 17 | "dask/dataframe/io/io.py", 18 | "dask/dataframe/io/parquet/arrow.py", 19 | "dask/dot.py", 20 | "dask/ml.py", 21 | ] 22 | 23 | collect_ignore_glob = [] 24 | try: 25 | import numpy # noqa: F401 26 | except ImportError: 27 | collect_ignore_glob.append("dask/array/*") 28 | 29 | try: 30 | import pandas # noqa: F401 31 | except ImportError: 32 | collect_ignore_glob.append("dask/dataframe/*") 33 | 34 | try: 35 | import scipy # noqa: F401 36 | except ImportError: 37 | collect_ignore.append("dask/array/stats.py") 38 | 39 | try: 40 | import pyarrow # noqa: F401 41 | except ImportError: 42 | collect_ignore.append("dask/dataframe/io/orc/arrow.py") 43 | 44 | try: 45 | import tiledb # noqa: F401 46 | except ImportError: 47 | collect_ignore.append("dask/array/tiledb_io.py") 48 | 49 | try: 50 | import sqlalchemy # noqa: F401 51 | except ImportError: 52 | collect_ignore.append("dask/dataframe/io/sql.py") 53 | 54 | 55 | def pytest_addoption(parser): 56 | parser.addoption("--runslow", action="store_true", help="run slow tests") 57 | 58 | 59 | def pytest_runtest_setup(item): 60 | if "slow" in item.keywords and not item.config.getoption("--runslow"): 61 | pytest.skip("need --runslow option to run") 62 | 63 | 64 | try: 65 | from dask.dataframe.utils import pyarrow_strings_enabled 66 | 67 | convert_string = pyarrow_strings_enabled() 68 | except (ImportError, RuntimeError): 69 | convert_string = False 70 | 71 | skip_with_pyarrow_strings = pytest.mark.skipif( 72 | convert_string, 73 | reason="No need to run with pyarrow strings", 74 | ) 75 | 76 | xfail_with_pyarrow_strings = pytest.mark.xfail( 77 | convert_string, 78 | reason="Known failure with pyarrow strings", 79 | ) 80 | 81 | 82 | def pytest_collection_modifyitems(config, items): 83 | for item in items: 84 | if "skip_with_pyarrow_strings" in item.keywords: 85 | item.add_marker(skip_with_pyarrow_strings) 86 | if "xfail_with_pyarrow_strings" in item.keywords: 87 | item.add_marker(xfail_with_pyarrow_strings) 88 | 89 | 90 | pytest.register_assert_rewrite( 91 | "dask.array.utils", "dask.dataframe.utils", "dask.bag.utils" 92 | ) 93 | 94 | 95 | 
@pytest.fixture(params=["disk", "tasks"]) 96 | def shuffle_method(request): 97 | with dask.config.set({"dataframe.shuffle.method": request.param}): 98 | yield request.param 99 | -------------------------------------------------------------------------------- /docs/source/images/optimize_dask5.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | %3 11 | 12 | 13 | 14 | 1242807449933300231 15 | 16 | count1 17 | 18 | 19 | 20 | 9194842205208052348 21 | 22 | print1 23 | 24 | 25 | 26 | 1242807449933300231->9194842205208052348 27 | 28 | 29 | 30 | 31 | 32 | 6590722590589999451 33 | 34 | count2 35 | 36 | 37 | 38 | 5022637276554243765 39 | 40 | print2 41 | 42 | 43 | 44 | 6590722590589999451->5022637276554243765 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /dask/hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import binascii 4 | import hashlib 5 | 6 | hashers = [] # In decreasing performance order 7 | 8 | 9 | # Timings on a largish array: 10 | # - CityHash is 2x faster than MurmurHash 11 | # - xxHash is slightly slower than CityHash 12 | # - MurmurHash is 8x faster than SHA1 13 | # - SHA1 is significantly faster than all other hashlib algorithms 14 | 15 | try: 16 | import cityhash # `python -m pip install cityhash` 17 | except ImportError: 18 | pass 19 | else: 20 | # CityHash disabled unless the reference leak in 21 | # https://github.com/escherba/python-cityhash/pull/16 22 | # is fixed. 23 | if cityhash.__version__ >= "0.2.2": 24 | 25 | def _hash_cityhash(buf): 26 | """ 27 | Produce a 16-bytes hash of *buf* using CityHash. 28 | """ 29 | h = cityhash.CityHash128(buf) 30 | return h.to_bytes(16, "little") 31 | 32 | hashers.append(_hash_cityhash) 33 | 34 | try: 35 | import xxhash # `python -m pip install xxhash` 36 | except ImportError: 37 | pass 38 | else: 39 | 40 | def _hash_xxhash(buf): 41 | """ 42 | Produce a 8-bytes hash of *buf* using xxHash. 43 | """ 44 | return xxhash.xxh64(buf).digest() 45 | 46 | hashers.append(_hash_xxhash) 47 | 48 | try: 49 | import mmh3 # `python -m pip install mmh3` 50 | except ImportError: 51 | pass 52 | else: 53 | 54 | def _hash_murmurhash(buf): 55 | """ 56 | Produce a 16-bytes hash of *buf* using MurmurHash. 57 | """ 58 | return mmh3.hash_bytes(buf) 59 | 60 | hashers.append(_hash_murmurhash) 61 | 62 | 63 | def _hash_sha1(buf): 64 | """ 65 | Produce a 20-bytes hash of *buf* using SHA1. 66 | """ 67 | return hashlib.sha1(buf).digest() 68 | 69 | 70 | hashers.append(_hash_sha1) 71 | 72 | 73 | def hash_buffer(buf, hasher=None): 74 | """ 75 | Hash a bytes-like (buffer-compatible) object. This function returns 76 | a good quality hash but is not cryptographically secure. The fastest 77 | available algorithm is selected. A fixed-length bytes object is returned. 78 | """ 79 | if hasher is not None: 80 | try: 81 | return hasher(buf) 82 | except (TypeError, OverflowError): 83 | # Some hash libraries may have overly-strict type checking, 84 | # not accepting all buffers 85 | pass 86 | for hasher in hashers: 87 | try: 88 | return hasher(buf) 89 | except (TypeError, OverflowError): 90 | pass 91 | raise TypeError(f"unsupported type for hashing: {type(buf)}") 92 | 93 | 94 | def hash_buffer_hex(buf, hasher=None): 95 | """ 96 | Same as hash_buffer, but returns its result in hex-encoded form. 
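
    Examples
    --------
    The exact digest depends on which hash library is available; with the
    SHA1 fallback, for example:

    >>> hash_buffer_hex(b"hello")  # doctest: +SKIP
    'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'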
97 | """ 98 | h = hash_buffer(buf, hasher) 99 | s = binascii.b2a_hex(h) 100 | return s.decode() 101 | -------------------------------------------------------------------------------- /docs/source/images/dask_horizontal.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas.util import hash_pandas_object 7 | 8 | import dask.dataframe as dd 9 | from dask.dataframe import _compat 10 | from dask.dataframe._compat import tm 11 | from dask.dataframe.utils import assert_eq 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "obj", 16 | [ 17 | pd.Series([1, 2, 3]), 18 | pd.Series([1.0, 1.5, 3.2]), 19 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), 20 | pd.Series(["a", "b", "c"]), 21 | pd.Series([True, False, True]), 22 | pd.Index([1, 2, 3]), 23 | pd.Index([True, False, True]), 24 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), 25 | _compat.makeMissingDataframe(), 26 | _compat.makeMixedDataFrame(), 27 | _compat.makeTimeDataFrame(), 28 | _compat.makeTimeSeries(), 29 | _compat.makeTimedeltaIndex(), 30 | ], 31 | ) 32 | def test_hash_pandas_object(obj): 33 | a = hash_pandas_object(obj) 34 | b = hash_pandas_object(obj) 35 | if isinstance(a, np.ndarray): 36 | np.testing.assert_equal(a, b) 37 | else: 38 | assert_eq(a, b) 39 | 40 | 41 | def test_categorical_consistency(): 42 | # Check that categoricals hash consistent with their values, not codes 43 | # This should work for categoricals of any dtype 44 | for s1 in [ 45 | pd.Series(["a", "b", "c", "d"]), 46 | pd.Series([1000, 2000, 3000, 4000]), 47 | pd.Series(pd.date_range(0, periods=4)), 48 | ]: 49 | s2 = s1.astype("category").cat.set_categories(s1) 50 | s3 = s2.cat.set_categories(list(reversed(s1))) 51 | for categorize in [True, False]: 52 | # These should all hash identically 53 | h1 = hash_pandas_object(s1, categorize=categorize) 54 | h2 = hash_pandas_object(s2, categorize=categorize) 55 | h3 = hash_pandas_object(s3, categorize=categorize) 56 | tm.assert_series_equal(h1, h2) 57 | tm.assert_series_equal(h1, h3) 58 | 59 | 60 | def test_object_missing_values(): 61 | # Check that the presence of missing values doesn't change how object dtype 62 | # is hashed. 
63 | s = pd.Series(["a", "b", "c", None]) 64 | h1 = hash_pandas_object(s).iloc[:3] 65 | h2 = hash_pandas_object(s.iloc[:3]) 66 | tm.assert_series_equal(h1, h2) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | "obj", 71 | [ 72 | pd.Index([1, 2, 3]), 73 | pd.Index([True, False, True]), 74 | pd.Series([1, 2, 3]), 75 | pd.Series([1.0, 1.5, 3.2]), 76 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), 77 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), 78 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}, index=["a", "z", "x"]), 79 | ], 80 | ) 81 | def test_hash_object_dispatch(obj): 82 | result = dd.dispatch.hash_object_dispatch(obj) 83 | expected = pd.util.hash_pandas_object(obj) 84 | assert_eq(result, expected) 85 | -------------------------------------------------------------------------------- /dask/array/tests/test_wrap.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | pytest.importorskip("numpy") 6 | 7 | import numpy as np 8 | 9 | import dask.array as da 10 | from dask.array.utils import assert_eq 11 | from dask.array.wrap import ones 12 | 13 | 14 | def test_ones(): 15 | a = ones((10, 10), dtype="i4", chunks=(4, 4)) 16 | x = np.array(a) 17 | assert (x == np.ones((10, 10), "i4")).all() 18 | 19 | assert a.name.startswith("ones_like-") 20 | 21 | 22 | def test_size_as_list(): 23 | a = ones([10, 10], dtype="i4", chunks=(4, 4)) 24 | x = np.array(a) 25 | assert (x == np.ones((10, 10), dtype="i4")).all() 26 | 27 | 28 | def test_singleton_size(): 29 | a = ones(10, dtype="i4", chunks=(4,)) 30 | x = np.array(a) 31 | assert (x == np.ones(10, dtype="i4")).all() 32 | 33 | 34 | def test_kwargs(): 35 | a = ones(10, dtype="i4", chunks=(4,)) 36 | x = np.array(a) 37 | assert (x == np.ones(10, dtype="i4")).all() 38 | 39 | 40 | def test_full(): 41 | a = da.full((3, 3), 100, chunks=(2, 2), dtype="i8") 42 | 43 | assert (a.compute() == 100).all() 44 | assert a.dtype == a.compute(scheduler="sync").dtype == "i8" 45 | 46 | assert a.name.startswith("full_like-") 47 | 48 | 49 | def test_full_error_nonscalar_fill_value(): 50 | with pytest.raises(ValueError, match="fill_value must be scalar"): 51 | da.full((3, 3), [100, 100], chunks=(2, 2), dtype="i8") 52 | 53 | 54 | def test_full_detects_da_dtype(): 55 | x = da.from_array(100) 56 | with pytest.warns(FutureWarning, match="not implemented by Dask array") as record: 57 | # This shall not raise an NotImplementedError due to dtype detected as object. 
58 | a = da.full(shape=(3, 3), fill_value=x) 59 | assert a.dtype == x.dtype 60 | assert_eq(a, np.full(shape=(3, 3), fill_value=100)) 61 | assert len(record) == 1 62 | 63 | 64 | def test_full_none_dtype(): 65 | a = da.full(shape=(3, 3), fill_value=100, dtype=None) 66 | assert_eq(a, np.full(shape=(3, 3), fill_value=100, dtype=None)) 67 | 68 | 69 | def test_full_like_error_nonscalar_fill_value(): 70 | x = np.full((3, 3), 1, dtype="i8") 71 | with pytest.raises(ValueError, match="fill_value must be scalar"): 72 | da.full_like(x, [100, 100], chunks=(2, 2), dtype="i8") 73 | 74 | 75 | def test_can_make_really_big_array_of_ones(): 76 | ones((1000000, 1000000), chunks=(100000, 100000)) 77 | ones(shape=(1000000, 1000000), chunks=(100000, 100000)) 78 | 79 | 80 | def test_wrap_consistent_names(): 81 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) == sorted( 82 | ones(10, dtype="i4", chunks=(4,)).dask 83 | ) 84 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) != sorted( 85 | ones(10, chunks=(4,)).dask 86 | ) 87 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask) == sorted( 88 | da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask 89 | ) 90 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="i2").dask) != sorted( 91 | da.full((3, 3), 100, chunks=(2, 2)).dask 92 | ) 93 | -------------------------------------------------------------------------------- /dask/array/tests/test_svg.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import xml.etree.ElementTree 4 | 5 | import pytest 6 | 7 | import dask.array as da 8 | from dask.array.svg import draw_sizes 9 | 10 | 11 | def parses(text): 12 | cleaned = text.replace("→", "") # xml doesn't like righarrow character 13 | assert xml.etree.ElementTree.fromstring(cleaned) is not None # parses cleanly 14 | 15 | 16 | def test_basic(): 17 | parses(da.ones(10).to_svg()) 18 | parses(da.ones((10, 10)).to_svg()) 19 | parses(da.ones((10, 10, 10)).to_svg()) 20 | parses(da.ones((10, 10, 10, 10)).to_svg()) 21 | parses(da.ones((10, 10, 10, 10, 10)).to_svg()) 22 | parses(da.ones((10, 10, 10, 10, 10, 10)).to_svg()) 23 | parses(da.ones((10, 10, 10, 10, 10, 10, 10)).to_svg()) 24 | 25 | 26 | def test_repr_html(): 27 | pytest.importorskip("jinja2") 28 | assert da.ones([])._repr_html_() 29 | assert da.ones(10)[:0]._repr_html_() 30 | assert da.ones(10)._repr_html_() 31 | assert da.ones((10, 10))._repr_html_() 32 | assert da.ones((10, 10, 10))._repr_html_() 33 | assert da.ones((10, 10, 10, 10))._repr_html_() 34 | 35 | 36 | def test_errors(): 37 | # empty arrays 38 | with pytest.raises(NotImplementedError) as excpt: 39 | da.ones([]).to_svg() 40 | assert "0 dimensions" in str(excpt.value) 41 | 42 | # Scalars 43 | with pytest.raises(NotImplementedError) as excpt: 44 | da.asarray(1).to_svg() 45 | assert "0 dimensions" in str(excpt.value) 46 | 47 | # 0-length dims arrays 48 | with pytest.raises(NotImplementedError) as excpt: 49 | da.ones(10)[:0].to_svg() 50 | assert "0-length dimensions" in str(excpt.value) 51 | 52 | # unknown chunk sizes 53 | with pytest.raises(NotImplementedError) as excpt: 54 | x = da.ones(10) 55 | x = x[x > 5] 56 | x.to_svg() 57 | assert "unknown chunk sizes" in str(excpt.value) 58 | 59 | 60 | def test_repr_html_size_units(): 61 | pytest.importorskip("jinja2") 62 | x = da.ones((10000, 5000)) 63 | x = da.ones((3000, 10000), chunks=(1000, 1000)) 64 | text = x._repr_html_() 65 | 66 | assert "MB" in text or "MiB" in text 67 | assert str(x.shape) in text 68 | 
assert str(x.dtype) in text 69 | 70 | parses(text) 71 | 72 | x = da.ones((3000, 10000, 50), chunks=(1000, 1000, 10)) 73 | parses(x._repr_html_()) 74 | 75 | 76 | def test_draw_sizes(): 77 | assert draw_sizes((10, 10), size=100) == (100, 100) # respect symmetry 78 | assert draw_sizes((10, 10), size=200) == (200, 200) # respect size keyword 79 | assert draw_sizes((10, 5), size=100) == (100, 50) # respect small ratios 80 | 81 | a, b, c = draw_sizes((1000, 100, 10)) 82 | assert a > b 83 | assert b > c 84 | assert a < b * 5 85 | assert b < c * 5 86 | 87 | 88 | def test_too_many_lines_fills_sides_darker(): 89 | data = da.ones((16000, 2400, 3600), chunks=(1, 2400, 3600)) 90 | text = data.to_svg() 91 | assert "8B4903" in text 92 | assert text.count("\n") < 300 93 | 94 | 95 | def test_3d(): 96 | text = da.ones((10, 10, 10, 10, 10)).to_svg() 97 | assert text.count("`_ 18 | and so it is suitable for use both normally as a Jupyter server, and also as 19 | part of a JupyterHub deployment. It also includes a matching Dask software 20 | environment described above. This image is about 2GB in size. 21 | 22 | Example 23 | ------- 24 | 25 | Here is a simple example on a dedicated virtual network 26 | 27 | .. code-block:: bash 28 | 29 | docker network create dask 30 | 31 | docker run --network dask -p 8787:8787 --name scheduler ghcr.io/dask/dask dask-scheduler # start scheduler 32 | 33 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 34 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 35 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 36 | 37 | docker run --network dask -p 8888:8888 ghcr.io/dask/dask-notebook # start Jupyter server 38 | 39 | Then from within the notebook environment you can connect to the Dask cluster like this: 40 | 41 | .. code-block:: python 42 | 43 | from dask.distributed import Client 44 | client = Client("scheduler:8786") 45 | client 46 | 47 | Extensibility 48 | ------------- 49 | 50 | Users can mildly customize the software environment by populating the 51 | environment variables ``EXTRA_APT_PACKAGES``, ``EXTRA_CONDA_PACKAGES``, and 52 | ``EXTRA_PIP_PACKAGES``. If these environment variables are set in the container, 53 | they will trigger calls to the following respectively:: 54 | 55 | apt-get install $EXTRA_APT_PACKAGES 56 | conda install $EXTRA_CONDA_PACKAGES 57 | python -m pip install $EXTRA_PIP_PACKAGES 58 | 59 | For example, the following ``conda`` installs the ``joblib`` package into 60 | the Dask worker software environment: 61 | 62 | .. code-block:: bash 63 | 64 | docker run --network dask -e EXTRA_CONDA_PACKAGES="joblib" ghcr.io/dask/dask dask-worker scheduler:8786 65 | 66 | Note that using these can significantly delay the container from starting, 67 | especially when using ``apt``, or ``conda`` (``pip`` is relatively fast). 68 | 69 | Remember that it is important for software versions to match between Dask 70 | workers and Dask clients. As a result, it is often useful to include the same 71 | extra packages in both Jupyter and Worker images. 72 | 73 | Source 74 | ------ 75 | 76 | Docker files are maintained at https://github.com/dask/dask-docker. 77 | This repository also includes a docker-compose configuration. 
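As a rough sketch only (the service names, image tags, ports, and environment variables below are illustrative assumptions, not the contents of that repository's actual file), a Compose file equivalent to the ``docker network`` example above might look like this:

.. code-block:: yaml

    # Illustrative sketch, not the actual docker-compose.yml from the dask-docker repository
    services:
      scheduler:
        image: ghcr.io/dask/dask
        command: dask-scheduler
        ports:
          - "8787:8787"          # diagnostic dashboard
      worker:
        image: ghcr.io/dask/dask
        command: dask-worker scheduler:8786
        environment:
          - EXTRA_PIP_PACKAGES=joblib   # optional extras, as described above
      notebook:
        image: ghcr.io/dask/dask-notebook
        ports:
          - "8888:8888"
        environment:
          - DASK_SCHEDULER_ADDRESS=tcp://scheduler:8786

Running ``docker compose up`` next to such a file would start the scheduler, a worker, and the Jupyter server on a shared network, publishing the same ports as the manual ``docker run`` example above.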
78 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_percentile.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | pytestmark = pytest.mark.gpu 7 | 8 | import dask.array as da 9 | from dask.array.utils import assert_eq, same_keys 10 | 11 | cupy = pytest.importorskip("cupy") 12 | 13 | 14 | def test_percentile(): 15 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 16 | qs = np.array([0, 50, 100]) 17 | 18 | result = da.percentile(d, qs, method="midpoint") 19 | assert_eq(result, np.array([1, 1, 1], dtype=d.dtype), check_type=False) 20 | 21 | x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20]) 22 | d = da.from_array(x, chunks=(3,)) 23 | 24 | result = da.percentile(d, qs, method="midpoint") 25 | assert_eq(result, np.array([0, 5, 20], dtype=result.dtype), check_type=False) 26 | 27 | assert not same_keys( 28 | da.percentile(d, qs, "midpoint"), 29 | da.percentile(d, [0, 50], "midpoint"), 30 | ) 31 | 32 | 33 | @pytest.mark.xfail( 34 | reason="Non-deterministic tokenize(cupy.array(...)), " 35 | "see https://github.com/dask/dask/issues/6718" 36 | ) 37 | def test_percentile_tokenize(): 38 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 39 | qs = np.array([0, 50, 100]) 40 | assert same_keys(da.percentile(d, qs), da.percentile(d, qs)) 41 | 42 | 43 | def test_percentiles_with_empty_arrays(): 44 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),)) 45 | result = da.percentile(x, [10, 50, 90], method="midpoint") 46 | assert type(result._meta) == cupy.ndarray 47 | assert_eq(result, result) # Check that _meta and computed arrays match types 48 | assert_eq(result, np.array([1, 1, 1], dtype=x.dtype), check_type=False) 49 | 50 | 51 | def test_percentiles_with_empty_q(): 52 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),)) 53 | result = da.percentile(x, [], method="midpoint") 54 | assert type(result._meta) == cupy.ndarray 55 | assert_eq(result, result) # Check that _meta and computed arrays match types 56 | assert_eq(result, np.array([], dtype=x.dtype), check_type=False) 57 | 58 | 59 | @pytest.mark.parametrize("q", [5, 5.0, np.int64(5), np.float64(5)]) 60 | def test_percentiles_with_scaler_percentile(q): 61 | # Regression test to ensure da.percentile works with scalar percentiles 62 | # See #3020 63 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 64 | result = da.percentile(d, q, method="midpoint") 65 | assert type(result._meta) == cupy.ndarray 66 | assert_eq(result, result) # Check that _meta and computed arrays match types 67 | assert_eq(result, np.array([1], dtype=d.dtype), check_type=False) 68 | 69 | 70 | def test_percentiles_with_unknown_chunk_sizes(): 71 | rng = da.random.default_rng(cupy.random.default_rng()) 72 | x = rng.random(1000, chunks=(100,)) 73 | x._chunks = ((np.nan,) * 10,) 74 | 75 | result = da.percentile(x, 50, method="midpoint").compute() 76 | assert type(result) == cupy.ndarray 77 | assert 0.1 < result < 0.9 78 | 79 | a, b = da.percentile(x, [40, 60], method="midpoint").compute() 80 | assert type(a) == cupy.ndarray 81 | assert type(b) == cupy.ndarray 82 | assert 0.1 < a < 0.9 83 | assert 0.1 < b < 0.9 84 | assert a < b 85 | -------------------------------------------------------------------------------- /docs/source/_static/main-page.css: -------------------------------------------------------------------------------- 1 | /* GLOBAL STYLES 2 | -------------------------------------------------- */ 3 | /* 
Padding below the footer and lighter body text */ 4 | 5 | body { 6 | padding-bottom: 3rem; 7 | color: #5a5a5a; 8 | } 9 | 10 | /* navbar 11 | * ----------------------------------------*/ 12 | 13 | .navbar { 14 | background-color: #000000; 15 | } 16 | .navbar li { 17 | transition: .3s background-color; 18 | text-align: center; 19 | background-color: transparent; 20 | padding: 0rem 1rem; 21 | text-decoration: none; 22 | border-radius: 0.3rem; 23 | } 24 | .navbar li:hover { 25 | background-color: #FDA061; 26 | } 27 | .navbar li .nav-link{ 28 | color: #FDA061; 29 | } 30 | .navbar li:hover .nav-link{ 31 | color: #212529; 32 | } 33 | 34 | .dropdown-menu { 35 | background-color: #000000d0; 36 | } 37 | 38 | .dropdown-item { 39 | color: #FDA061; 40 | } 41 | 42 | .dropdown-item:hover { 43 | background-color: #FDA061D0; 44 | } 45 | 46 | .hero { 47 | background-color: rgba(0,0,0,0.92); 48 | text-color: white; 49 | } 50 | 51 | 52 | .top-image { 53 | height: 10rem; 54 | max-width: 20rem; 55 | } 56 | 57 | 58 | .outline-dask { 59 | color: #FDA061; 60 | background-color: transparent; 61 | border-color: #FDA061; 62 | } 63 | 64 | 65 | .outline-dask:hover { 66 | color: #212529; 67 | background-color: #FDA061; 68 | border-color: #FDA061; 69 | } 70 | 71 | .solid-dask { 72 | color: #212529; 73 | background-color: #FDA061; 74 | } 75 | 76 | .solid-dask:hover { 77 | color: #212529; 78 | background-color: #EC9050; 79 | } 80 | 81 | 82 | /* MARKETING CONTENT 83 | -------------------------------------------------- */ 84 | 85 | /* Center align the text within the three columns below the carousel */ 86 | .marketing .col-lg-4 { 87 | margin-bottom: 1.5rem; 88 | text-align: center; 89 | } 90 | .marketing .col-lg-4 p { 91 | margin-right: .75rem; 92 | margin-left: .75rem; 93 | } 94 | 95 | 96 | /* Featurettes 97 | ------------------------- */ 98 | 99 | .featurette-divider { 100 | margin: 3rem 0; /* Space out the Bootstrap
more */ 101 | } 102 | 103 | /* Thin out the marketing headings */ 104 | .featurette-heading { 105 | font-weight: 300; 106 | line-height: 1; 107 | letter-spacing: -.05rem; 108 | } 109 | 110 | .featurette-subheading { 111 | text-transform: uppercase; 112 | font-size: 1.2rem; 113 | display: block; 114 | font-weight: 600; 115 | margin: 1.2rem 0; 116 | } 117 | 118 | /* Supporters 119 | * ----------------------------*/ 120 | 121 | .supporters { 122 | text-align: center; 123 | } 124 | 125 | .supporter { 126 | margin: 0.5rem 0; 127 | width: 100%; 128 | } 129 | 130 | .supporter img{ 131 | max-height: 100%; 132 | max-width: 85%; 133 | position: relative; 134 | top: 50%; 135 | transform: translateY(-50%); 136 | 137 | } 138 | 139 | 140 | /* RESPONSIVE CSS 141 | -------------------------------------------------- */ 142 | 143 | @media (min-width: 40em) { 144 | .featurette-heading { 145 | font-size: 50px; 146 | } 147 | } 148 | 149 | @media (min-width: 62em) { 150 | .featurette-heading { 151 | margin-top: 3rem; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /docs/source/array-gufunc.rst: -------------------------------------------------------------------------------- 1 | Generalized Ufuncs 2 | ================== 3 | 4 | `NumPy `_ provides the concept of `generalized ufuncs `_. Generalized ufuncs are functions 5 | that distinguish the various dimensions of passed arrays in the two classes loop dimensions 6 | and core dimensions. To accomplish this, a `signature `_ is specified for NumPy generalized ufuncs. 7 | 8 | `Dask `_ integrates interoperability with NumPy's generalized ufuncs 9 | by adhering to respective `ufunc protocol `_, and provides a wrapper to make a Python function a generalized ufunc. 10 | 11 | 12 | Usage 13 | ----- 14 | 15 | NumPy Generalized UFuncs 16 | ~~~~~~~~~~~~~~~~~~~~~~~~ 17 | .. note:: 18 | 19 | `NumPy `_ generalized ufuncs are currently (v1.14.3 and below) stored in 20 | inside ``np.linalg._umath_linalg`` and might change in the future. 21 | 22 | 23 | .. code-block:: python 24 | 25 | import dask.array as da 26 | import numpy as np 27 | 28 | x = da.random.default_rng().normal(size=(3, 10, 10), chunks=(2, 10, 10)) 29 | 30 | w, v = np.linalg._umath_linalg.eig(x, output_dtypes=(float, float)) 31 | 32 | 33 | Create Generalized UFuncs 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | It can be difficult to create your own GUFuncs without going into the CPython API. 37 | However, the `Numba `_ project does provide a 38 | nice implementation with their ``numba.guvectorize`` decorator. See `Numba's 39 | documentation 40 | `_ 41 | for more information. 42 | 43 | Wrap your own Python function 44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 45 | ``gufunc`` can be used to make a Python function behave like a generalized ufunc: 46 | 47 | 48 | .. code-block:: python 49 | 50 | x = da.random.default_rng().normal(size=(10, 5), chunks=(2, 5)) 51 | 52 | def foo(x): 53 | return np.mean(x, axis=-1) 54 | 55 | gufoo = da.gufunc(foo, signature="(i)->()", output_dtypes=float, vectorize=True) 56 | 57 | y = gufoo(x) 58 | 59 | 60 | Instead of ``gufunc``, also the ``as_gufunc`` decorator can be used for convenience: 61 | 62 | 63 | .. 
code-block:: python 64 | 65 | x = da.random.normal(size=(10, 5), chunks=(2, 5)) 66 | 67 | @da.as_gufunc(signature="(i)->()", output_dtypes=float, vectorize=True) 68 | def gufoo(x): 69 | return np.mean(x, axis=-1) 70 | 71 | y = gufoo(x) 72 | 73 | 74 | Disclaimer 75 | ---------- 76 | This experimental generalized ufunc integration is not complete: 77 | 78 | * ``gufunc`` does not create a true generalized ufunc to be used with other input arrays besides Dask. 79 | I.e., at the moment, ``gufunc`` casts all input arguments to ``dask.array.Array`` 80 | 81 | * Inferring ``output_dtypes`` automatically is not implemented yet 82 | 83 | 84 | API 85 | --- 86 | 87 | .. currentmodule:: dask.array.gufunc 88 | 89 | .. autosummary:: 90 | apply_gufunc 91 | as_gufunc 92 | gufunc 93 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_hyperloglog.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | import dask.dataframe as dd 8 | 9 | rs = np.random.RandomState(96) 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "df", 14 | [ 15 | pd.DataFrame( 16 | { 17 | "x": [1, 2, 3] * 3, 18 | "y": [1.2, 3.4, 5.6] * 3, 19 | "z": -(np.arange(9, dtype=np.int8)), 20 | } 21 | ), 22 | pd.DataFrame( 23 | { 24 | "x": rs.randint(0, 1000000, (10000,)), 25 | "y": rs.randn(10000), 26 | "z": rs.uniform(0, 9999999, (10000,)), 27 | } 28 | ), 29 | pd.DataFrame( 30 | { 31 | "x": np.repeat(rs.randint(0, 1000000, (1000,)), 3), 32 | "y": np.repeat(rs.randn(1000), 3), 33 | "z": np.repeat(rs.uniform(0, 9999999, (1000,)), 3), 34 | } 35 | ), 36 | pd.DataFrame({"x": rs.randint(0, 1000000, (10000,))}), 37 | pd.DataFrame( 38 | { 39 | "x": rs.randint(0, 1000000, (7,)), 40 | "y": ["a", "bet", "is", "a", "tax", "on", "bs"], 41 | } 42 | ), 43 | pd.DataFrame( 44 | { 45 | "w": np.zeros((20000,)), 46 | "x": np.zeros((20000,)), 47 | "y": np.zeros((20000,)) + 4803592, 48 | "z": np.zeros((20000,)), 49 | } 50 | ), 51 | pd.DataFrame({"x": [1, 2, 3] * 1000}), 52 | pd.DataFrame({"x": np.random.random(1000)}), 53 | pd.DataFrame( 54 | { 55 | "a": [1, 2, 3] * 3, 56 | "b": [1.2, 3.4, 5.6] * 3, 57 | "c": [1 + 2j, 3 + 4j, 5 + 6j] * 3, 58 | "d": -(np.arange(9, dtype=np.int8)), 59 | } 60 | ), 61 | pd.Series([1, 2, 3] * 1000), 62 | pd.Series(np.random.random(1000)), 63 | pd.Series(np.random.random(1000), index=np.ones(1000)), 64 | pd.Series(np.random.random(1000), index=np.random.random(1000)), 65 | ], 66 | ) 67 | @pytest.mark.parametrize("npartitions", [2, 20]) 68 | def test_basic(df, npartitions): 69 | ddf = dd.from_pandas(df, npartitions=npartitions) 70 | 71 | approx = ddf.nunique_approx().compute(scheduler="sync") 72 | exact = len(df.drop_duplicates()) 73 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05 74 | 75 | 76 | @pytest.mark.parametrize("split_every", [None, 2, 10]) 77 | @pytest.mark.parametrize("npartitions", [2, 20]) 78 | def test_split_every(split_every, npartitions): 79 | df = pd.Series([1, 2, 3] * 1000) 80 | ddf = dd.from_pandas(df, npartitions=npartitions) 81 | 82 | approx = ddf.nunique_approx(split_every=split_every).compute(scheduler="sync") 83 | exact = len(df.drop_duplicates()) 84 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05 85 | 86 | 87 | def test_larger_data(): 88 | df = dd.demo.make_timeseries( 89 | "2000-01-01", 90 | "2000-04-01", 91 | {"value": float, "id": int}, 92 | freq="10s", 93 | 
partition_freq="1D", 94 | seed=1, 95 | ) 96 | assert df.nunique_approx().compute() > 1000 97 | -------------------------------------------------------------------------------- /docs/source/deploying-cloud.rst: -------------------------------------------------------------------------------- 1 | Cloud 2 | ===== 3 | 4 | There are a variety of ways to deploy Dask on cloud providers. 5 | Cloud providers provide managed services, 6 | like VMs, Kubernetes, Yarn, or custom APIs with which Dask can connect easily. 7 | You may want to consider the following options: 8 | 9 | 1. A managed Kubernetes service and Dask's 10 | :doc:`Kubernetes integration `. 11 | 2. A managed Yarn service, 12 | like `Amazon EMR `_ 13 | or `Google Cloud DataProc `_ 14 | and `Dask-Yarn `_. 15 | 16 | Specific documentation for the popular Amazon EMR service can be found 17 | `here `_. 18 | 3. Directly launching cloud resources such as VMs or containers via a cluster manager with 19 | `Dask Cloud Provider `_. 20 | 4. A commercial Dask deployment option like `Coiled `_ to handle the creation and management of Dask clusters on a cloud computing environment (AWS and GCP). 21 | 22 | Cloud Deployment Example 23 | ------------------------ 24 | 25 | Using `Dask Cloud Provider `_ to launch a cluster of 26 | VMs on a platform like `DigitalOcean `_ can be as convenient as 27 | launching a local cluster. 28 | 29 | .. code-block:: python 30 | 31 | >>> import dask.config 32 | 33 | >>> dask.config.set({"cloudprovider.digitalocean.token": "yourAPItoken"}) 34 | 35 | >>> from dask_cloudprovider.digitalocean import DropletCluster 36 | 37 | >>> cluster = DropletCluster(n_workers=1) 38 | Creating scheduler instance 39 | Created droplet dask-38b817c1-scheduler 40 | Waiting for scheduler to run 41 | Scheduler is running 42 | Creating worker instance 43 | Created droplet dask-38b817c1-worker-dc95260d 44 | 45 | Many of the cluster managers in Dask Cloud Provider work by launching VMs with a startup script 46 | that pulls down the :doc:`Dask Docker image ` and runs Dask components within that container. 47 | As with all cluster managers the VM resources, Docker image, etc are all configurable. 48 | 49 | You can then connect a client and work with the cluster as if it were on your local machine. 50 | 51 | .. code-block:: python 52 | 53 | >>> from dask.distributed import Client 54 | 55 | >>> client = Client(cluster) 56 | 57 | Data Access 58 | ----------- 59 | 60 | You may want to install additional libraries in your Jupyter and worker images 61 | to access the object stores of each cloud (see :doc:`how-to/connect-to-remote-data`): 62 | 63 | - `s3fs `_ for Amazon's S3 64 | - `gcsfs `_ for Google's GCS 65 | - `adlfs `_ for Microsoft's ADL 66 | 67 | Historical Libraries 68 | -------------------- 69 | 70 | Dask previously maintained libraries for deploying Dask on 71 | Amazon's EC2 and Google GKE. 72 | Due to sporadic interest, 73 | and churn both within the Dask library and EC2 itself, 74 | these were not well maintained. 75 | They have since been deprecated in favor of the 76 | :doc:`Kubernetes ` solutions. 77 | -------------------------------------------------------------------------------- /docs/source/deploying-python.rst: -------------------------------------------------------------------------------- 1 | Python API 2 | ========== 3 | 4 | You can create a ``dask.distributed`` scheduler by importing and creating a 5 | ``Client`` with no arguments. This overrides whatever default was previously 6 | set. 7 | 8 | .. 
code-block:: python 9 | 10 | from dask.distributed import Client 11 | client = Client() 12 | 13 | You can navigate to ``http://localhost:8787/status`` to see the diagnostic 14 | dashboard if you have Bokeh installed. 15 | 16 | Client 17 | ------ 18 | 19 | You can trivially set up a local cluster on your machine by instantiating a Dask 20 | Client with no arguments 21 | 22 | .. code-block:: python 23 | 24 | from dask.distributed import Client 25 | client = Client() 26 | 27 | This sets up a scheduler in your local process along with a number of workers and 28 | threads per worker related to the number of cores in your machine. 29 | 30 | If you want to run workers in your same process, you can pass the 31 | ``processes=False`` keyword argument. 32 | 33 | .. code-block:: python 34 | 35 | client = Client(processes=False) 36 | 37 | This is sometimes preferable if you want to avoid inter-worker communication 38 | and your computations release the GIL. This is common when primarily using 39 | NumPy or Dask Array. 40 | 41 | 42 | LocalCluster 43 | ------------ 44 | 45 | The ``Client()`` call described above is shorthand for creating a LocalCluster 46 | and then passing that to your client. 47 | 48 | .. code-block:: python 49 | 50 | from dask.distributed import Client, LocalCluster 51 | cluster = LocalCluster() 52 | client = Client(cluster) 53 | 54 | This is equivalent, but somewhat more explicit. 55 | 56 | You may want to look at the 57 | keyword arguments available on ``LocalCluster`` to understand the options available 58 | to you on handling the mixture of threads and processes, like specifying explicit 59 | ports, and so on. 60 | 61 | To create a local cluster with all workers running in dedicated subprocesses, 62 | ``dask.distributed`` also offers the experimental ``SubprocessCluster``. 63 | 64 | Cluster manager features 65 | ------------------------ 66 | 67 | Instantiating a cluster manager class like ``LocalCluster`` and then passing it to the 68 | ``Client`` is a common pattern. Cluster managers also provide useful utilities to help 69 | you understand what is going on. 70 | 71 | For example you can retrieve the Dashboard URL. 72 | 73 | .. code-block:: python 74 | 75 | >>> cluster.dashboard_link 76 | 'http://127.0.0.1:8787/status' 77 | 78 | You can retrieve logs from cluster components. 79 | 80 | .. code-block:: python 81 | 82 | >>> cluster.get_logs() 83 | {'Cluster': '', 84 | 'Scheduler': "distributed.scheduler - INFO - Clear task state\ndistributed.scheduler - INFO - S... 85 | 86 | If you are using a cluster manager that supports scaling you can modify the number of workers manually 87 | or automatically based on workload. 88 | 89 | .. code-block:: python 90 | 91 | >>> cluster.scale(10) # Sets the number of workers to 10 92 | 93 | >>> cluster.adapt(minimum=1, maximum=10) # Allows the cluster to auto scale to 10 when tasks are computed 94 | 95 | Reference 96 | --------- 97 | 98 | .. currentmodule:: distributed.deploy.local 99 | 100 | .. autoclass:: LocalCluster 101 | :members: 102 | -------------------------------------------------------------------------------- /docs/source/array-assignment.rst: -------------------------------------------------------------------------------- 1 | .. _array.assignment: 2 | 3 | Assignment 4 | ========== 5 | 6 | Dask Array supports most of the NumPy assignment indexing syntax. 
In 7 | particular, it supports combinations of the following: 8 | 9 | * Indexing by integers: ``x[1] = y`` 10 | * Indexing by slices: ``x[2::-1] = y`` 11 | * Indexing by a list of integers: ``x[[0, -1, 1]] = y`` 12 | * Indexing by a 1-d :class:`numpy` array of integers: ``x[np.arange(3)] = y`` 13 | * Indexing by a 1-d :class:`~dask.array.Array` of integers: ``x[da.arange(3)] = y``, ``x[da.from_array([0, -1, 1])] = y``, ``x[da.where(np.array([1, 2, 3]) < 3)[0]] = y`` 14 | * Indexing by a list of booleans: ``x[[False, True, True]] = y`` 15 | * Indexing by a 1-d :class:`numpy` array of booleans: ``x[np.arange(3) > 0] = y`` 16 | 17 | It also supports: 18 | 19 | * Indexing by one broadcastable :class:`~dask.array.Array` of 20 | booleans: ``x[x > 0] = y``. 21 | 22 | However, it does not currently support the following: 23 | 24 | * Indexing with lists in multiple axes: ``x[[1, 2, 3], [3, 1, 2]] = y`` 25 | 26 | 27 | .. _array.assignment.broadcasting: 28 | 29 | Broadcasting 30 | ------------ 31 | 32 | The normal NumPy broadcasting rules apply: 33 | 34 | .. code-block:: python 35 | 36 | >>> x = da.zeros((2, 6)) 37 | >>> x[0] = 1 38 | >>> x[..., 1] = 2.0 39 | >>> x[:, 2] = [3, 4] 40 | >>> x[:, 5:2:-2] = [[6, 5]] 41 | >>> x.compute() 42 | array([[1., 2., 3., 5., 1., 6.], 43 | [0., 2., 4., 5., 0., 6.]]) 44 | >>> x[1] = -x[0] 45 | >>> x.compute() 46 | array([[ 1., 2., 3., 5., 1., 6.], 47 | [-1., -2., -3., -5., -1., -6.]]) 48 | 49 | .. _array.assignment.masking: 50 | 51 | Masking 52 | ------- 53 | 54 | Elements may be masked by assigning to the NumPy masked value, or to an 55 | array with masked values: 56 | 57 | .. code-block:: python 58 | 59 | >>> x = da.ones((2, 6)) 60 | >>> x[0, [1, -2]] = np.ma.masked 61 | >>> x[1] = np.ma.array([0, 1, 2, 3, 4, 5], mask=[0, 1, 1, 0, 0, 0]) 62 | >>> print(x.compute()) 63 | [[1.0 -- 1.0 1.0 -- 1.0] 64 | [0.0 -- -- 3.0 4.0 5.0]] 65 | >>> x[:, 0] = x[:, 1] 66 | >>> print(x.compute()) 67 | [[-- -- 1.0 1.0 -- 1.0] 68 | [-- -- -- 3.0 4.0 5.0]] 69 | 70 | If, and only if, a single broadcastable :class:`~dask.array.Array` of 71 | booleans is provided then masked array assignment does not yet work as 72 | expected. In this case the data underlying the mask are assigned: 73 | 74 | .. code-block:: python 75 | 76 | >>> x = da.arange(12).reshape(2, 6) 77 | >>> x[x > 7] = np.ma.array(-99, mask=True) 78 | >>> print(x.compute()) 79 | [[ 0 1 2 3 4 5] 80 | [ 6 7 -99 -99 -99 -99]] 81 | 82 | Note that masked assignments do work when a boolean 83 | :class:`~dask.array.Array` index is used in a tuple, or implicit tuple, 84 | of indices: 85 | 86 | .. code-block:: python 87 | 88 | >>> x = da.arange(12).reshape(2, 6) 89 | >>> x[1, x[0] > 3] = np.ma.masked 90 | >>> print(x.compute()) 91 | [[0 1 2 3 4 5] 92 | [6 7 8 9 -- --]] 93 | >>> x = da.arange(12).reshape(2, 6) 94 | >>> print(x.compute()) 95 | [[ 0 1 2 3 4 5] 96 | [ 6 7 8 9 10 11]] 97 | >>> x[(x[:, 2] < 4,)] = np.ma.masked 98 | >>> print(x.compute()) 99 | [[-- -- -- -- -- --] 100 | [6 7 8 9 10 11]] 101 | --------------------------------------------------------------------------------