├── dask
├── py.typed
├── tests
│ ├── __init__.py
│ ├── warning_aliases.py
│ ├── test_compatibility.py
│ ├── test_ml.py
│ ├── test_backends.py
│ ├── test_docs.py
│ ├── test_hashing.py
│ ├── test_datasets.py
│ ├── test_ci.py
│ ├── test_context.py
│ ├── test_system.py
│ ├── test_utils_test.py
│ ├── test_cache.py
│ └── test_callbacks.py
├── array
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_testing.py
│ │ ├── test_cupy_gufunc.py
│ │ ├── test_numpy_compat.py
│ │ ├── test_xarray.py
│ │ ├── test_image.py
│ │ ├── test_cupy_reductions.py
│ │ ├── test_wrap.py
│ │ ├── test_svg.py
│ │ └── test_cupy_percentile.py
│ ├── lib
│ │ ├── __init__.py
│ │ └── stride_tricks.py
│ ├── dispatch.py
│ ├── NUMPY_LICENSE.txt
│ ├── image.py
│ └── cupy_entry_point.py
├── bag
│ ├── tests
│ │ └── __init__.py
│ ├── utils.py
│ ├── chunk.py
│ └── __init__.py
├── bytes
│ ├── tests
│ │ ├── __init__.py
│ │ └── test_compression.py
│ ├── __init__.py
│ └── utils.py
├── dataframe
│ ├── tests
│ │ ├── __init__.py
│ │ ├── test_methods.py
│ │ ├── test_boolean.py
│ │ ├── test_optimize_dataframe.py
│ │ ├── test_extensions.py
│ │ ├── test_numeric.py
│ │ ├── test_hashing.py
│ │ └── test_hyperloglog.py
│ ├── io
│ │ ├── tests
│ │ │ └── __init__.py
│ │ ├── orc
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ ├── parquet
│ │ │ └── __init__.py
│ │ └── __init__.py
│ ├── tseries
│ │ ├── __init__.py
│ │ └── tests
│ │   └── __init__.py
│ ├── extensions.py
│ ├── numeric.py
│ ├── __init__.py
│ ├── _pyarrow_compat.py
│ ├── _dtypes.py
│ └── hyperloglog.py
├── diagnostics
│ ├── tests
│ │ └── __init__.py
│ └── __init__.py
├── widgets
│ ├── tests
│ │ ├── templates
│ │ │ ├── example.html.j2
│ │ │ ├── bytes.html.j2
│ │ │ └── custom_filter.html.j2
│ │ └── test_widgets.py
│ ├── templates
│ │ ├── dataframe.html.j2
│ │ ├── array.html.j2
│ │ ├── highlevelgraph_layer.html.j2
│ │ └── highlevelgraph.html.j2
│ ├── __init__.py
│ └── widgets.py
├── __main__.py
├── ml.py
├── __init__.py
├── compatibility.py
├── _compatibility.py
├── distributed.py
├── system.py
├── context.py
├── cache.py
├── dask.yaml
└── hashing.py
├── docs
├── source
│ ├── daskcheatsheet.pdf
│ ├── images
│ │ ├── reshape.png
│ │ ├── gputester-msg.png
│ │ ├── merge_chunks.png
│ │ ├── order-failure.png
│ │ ├── order-success.png
│ │ ├── scaling-edges.png
│ │ ├── scaling-nodes.png
│ │ ├── simple-dask.png
│ │ ├── dashboard_link.png
│ │ ├── reshape_problem.png
│ │ ├── HHMI_Janelia_Color.png
│ │ ├── async-embarrassing.gif
│ │ ├── dashboard_memory.png
│ │ ├── dashboard_progress.png
│ │ ├── dashboard_status.png
│ │ ├── merge_chunks_false.png
│ │ ├── reshape_rechunked.png
│ │ ├── 10_minutes_bag_graph.png
│ │ ├── dashboard_jupyterlab.png
│ │ ├── dashboard_memory_new.gif
│ │ ├── growth_of_languages.png
│ │ ├── growth_of_libraries.png
│ │ ├── map_blocks_drop_axis.png
│ │ ├── 10_minutes_array_graph.png
│ │ ├── transpose-hlg-html-repr.png
│ │ ├── dashboard_task_processing.png
│ │ ├── 10_minutes_dataframe_graph.png
│ │ ├── concurrent-futures-threaded.webp
│ │ ├── dashboard_taskstream_healthy.png
│ │ ├── transpose-hlg-hovertooltip.png
│ │ ├── dashboard_task_stream_unhealthy.png
│ │ ├── dask_icon_black.svg
│ │ ├── dask_icon.svg
│ │ ├── dask_icon_on_pink.svg
│ │ ├── dask_icon_white.svg
│ │ ├── unoverlapping-neighbors.svg
│ │ ├── optimize_dask5.svg
│ │ └── dask_horizontal.svg
│ ├── _static
│ │ ├── dask-simple.png
│ │ ├── theme_overrides.css
│ │ ├── style.css
│ │ └── main-page.css
│ ├── _templates
│ │ └── layout.html
│ ├── cheatsheet.rst
│ ├── internals.rst
│ ├── debugging-performance.rst
│ ├── how-to
│ │ ├── index.rst
│ │ ├── setup-prometheus.rst
│ │ └── extend-sizeof.rst
│ ├── logos.rst
│ ├── dashboard-progress-script.py
│ ├── array-stats.rst
│ ├── delayed-collections.rst
│ ├── deploying-ssh.rst
│ ├── delayed-api.rst
│ ├── understanding-performance.rst
│ ├── bag-api.rst
│ ├── array-stack.rst
│ ├── graph_manipulation.rst
│ ├── deploying-docker.rst
│ ├── array-gufunc.rst
│ ├── deploying-cloud.rst
│ ├── deploying-python.rst
│ └── array-assignment.rst
├── requirements-docs.txt
└── README.rst
├── .github
├── PULL_REQUEST_TEMPLATE.md
├── CONTRIBUTING.md
├── dependabot.yml
├── workflows
│ ├── label-prs.yml
│ ├── label-all.yml
│ ├── pre-commit.yml
│ ├── stale-bot.yaml
│ ├── additional.yml
│ ├── conda.yml
│ ├── upstream.yml
│ └── update-gpuci.yml
├── labeler.yml
└── release.yml
├── continuous_integration
├── gpuci
│ ├── axis.yaml
│ └── build.sh
├── scripts
│ ├── run_tests.sh
│ ├── test_imports.sh
│ └── install.sh
├── environment-mindeps-non-optional.yaml
├── environment-mindeps-array.yaml
├── environment-mindeps-dataframe.yaml
├── environment-mindeps-distributed.yaml
├── recipe
│ └── meta.yaml
├── environment-mindeps-optional.yaml
├── environment-3.9.yaml
├── environment-3.10.yaml
└── environment-3.11.yaml
├── setup.py
├── CONTRIBUTING.md
├── .readthedocs.yaml
├── MANIFEST.in
├── .gitignore
├── .git-blame-ignore-revs
├── codecov.yml
├── .flake8
├── README.rst
├── LICENSE.txt
├── .pre-commit-config.yaml
└── conftest.py
/dask/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/array/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/bag/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/bytes/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/dataframe/io/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/dataframe/tseries/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/diagnostics/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/dataframe/tseries/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dask/widgets/tests/templates/example.html.j2:
--------------------------------------------------------------------------------
1 |
2 | Hello {{ foo }}!
3 |
4 |
--------------------------------------------------------------------------------
/dask/widgets/tests/templates/bytes.html.j2:
--------------------------------------------------------------------------------
1 |
2 | {{ foo | format_bytes }}
3 |
4 |
--------------------------------------------------------------------------------
/dask/widgets/tests/templates/custom_filter.html.j2:
--------------------------------------------------------------------------------
1 |
2 | {{ foo | custom_filter }}
3 |
4 |
--------------------------------------------------------------------------------
/dask/bytes/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.bytes.core import read_bytes
4 |
--------------------------------------------------------------------------------
/docs/source/daskcheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/daskcheatsheet.pdf
--------------------------------------------------------------------------------
/docs/source/images/reshape.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape.png
--------------------------------------------------------------------------------
/dask/array/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.array.lib import stride_tricks
4 |
--------------------------------------------------------------------------------
/docs/source/_static/dask-simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/_static/dask-simple.png
--------------------------------------------------------------------------------
/docs/source/images/gputester-msg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/gputester-msg.png
--------------------------------------------------------------------------------
/docs/source/images/merge_chunks.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks.png
--------------------------------------------------------------------------------
/docs/source/images/order-failure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-failure.png
--------------------------------------------------------------------------------
/docs/source/images/order-success.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-success.png
--------------------------------------------------------------------------------
/docs/source/images/scaling-edges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-edges.png
--------------------------------------------------------------------------------
/docs/source/images/scaling-nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-nodes.png
--------------------------------------------------------------------------------
/docs/source/images/simple-dask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/simple-dask.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_link.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_link.png
--------------------------------------------------------------------------------
/docs/source/images/reshape_problem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_problem.png
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | - [ ] Closes #xxxx
2 | - [ ] Tests added / passed
3 | - [ ] Passes `pre-commit run --all-files`
4 |
--------------------------------------------------------------------------------
/docs/source/images/HHMI_Janelia_Color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/HHMI_Janelia_Color.png
--------------------------------------------------------------------------------
/docs/source/images/async-embarrassing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/async-embarrassing.gif
--------------------------------------------------------------------------------
/docs/source/images/dashboard_memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_progress.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_progress.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_status.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_status.png
--------------------------------------------------------------------------------
/docs/source/images/merge_chunks_false.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks_false.png
--------------------------------------------------------------------------------
/docs/source/images/reshape_rechunked.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_rechunked.png
--------------------------------------------------------------------------------
/dask/dataframe/io/orc/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.dataframe.io.orc.core import read_orc, to_orc
4 |
--------------------------------------------------------------------------------
/docs/source/images/10_minutes_bag_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_bag_graph.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_jupyterlab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_jupyterlab.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_memory_new.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory_new.gif
--------------------------------------------------------------------------------
/docs/source/images/growth_of_languages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_languages.png
--------------------------------------------------------------------------------
/docs/source/images/growth_of_libraries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_libraries.png
--------------------------------------------------------------------------------
/docs/source/images/map_blocks_drop_axis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/map_blocks_drop_axis.png
--------------------------------------------------------------------------------
/.github/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | See [developer documentation](https://docs.dask.org/en/latest/develop.html)
2 | for tips on how to get started.
3 |
--------------------------------------------------------------------------------
/docs/source/images/10_minutes_array_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_array_graph.png
--------------------------------------------------------------------------------
/docs/source/images/transpose-hlg-html-repr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-html-repr.png
--------------------------------------------------------------------------------
/dask/array/lib/stride_tricks.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.array.overlap import sliding_window_view # noqa: F401
4 |
--------------------------------------------------------------------------------
/docs/source/images/dashboard_task_processing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_processing.png
--------------------------------------------------------------------------------
/docs/source/images/10_minutes_dataframe_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_dataframe_graph.png
--------------------------------------------------------------------------------
/docs/source/images/concurrent-futures-threaded.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/concurrent-futures-threaded.webp
--------------------------------------------------------------------------------
/docs/source/images/dashboard_taskstream_healthy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_taskstream_healthy.png
--------------------------------------------------------------------------------
/docs/source/images/transpose-hlg-hovertooltip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-hovertooltip.png
--------------------------------------------------------------------------------
/docs/source/images/dashboard_task_stream_unhealthy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_stream_unhealthy.png
--------------------------------------------------------------------------------
/dask/widgets/templates/dataframe.html.j2:
--------------------------------------------------------------------------------
1 | Dask DataFrame Structure:
2 | {{ data }}
3 | Dask Name: {{ name | key_split }}, {{ layers }}
4 |
--------------------------------------------------------------------------------
/dask/__main__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.cli import run_cli
4 |
5 |
6 | def main():
7 | run_cli()
8 |
9 |
10 | if __name__ == "__main__":
11 | main()
12 |
--------------------------------------------------------------------------------
/continuous_integration/gpuci/axis.yaml:
--------------------------------------------------------------------------------
1 | PYTHON_VER:
2 | - "3.9"
3 | - "3.10"
4 |
5 | CUDA_VER:
6 | - "11.5"
7 |
8 | LINUX_VER:
9 | - ubuntu18.04
10 |
11 | RAPIDS_VER:
12 | - "23.10"
13 |
14 | excludes:
15 |
--------------------------------------------------------------------------------
/docs/source/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 | {% set css_files = css_files + ["_static/style.css"] %}
3 | {% set script_files = script_files + ["_static/yaml.min.js", "_static/config_converter.js"] %}
4 |
--------------------------------------------------------------------------------
/dask/dataframe/io/parquet/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.dataframe.io.parquet.core import (
4 | create_metadata_file,
5 | read_parquet,
6 | read_parquet_part,
7 | to_parquet,
8 | )
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from __future__ import annotations
4 |
5 | import versioneer
6 | from setuptools import setup
7 |
8 | setup(
9 | version=versioneer.get_version(),
10 | cmdclass=versioneer.get_cmdclass(),
11 | )
12 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more.
2 |
3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html.
4 |
--------------------------------------------------------------------------------
/docs/source/cheatsheet.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Dask Cheat Sheet
4 | ================
5 |
6 | The 300KB pdf :download:`Dask cheat sheet <daskcheatsheet.pdf>`
7 | is a single page summary about using Dask.
8 | It is commonly distributed at conferences and trade shows.
9 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # Set update schedule for GitHub Actions
2 |
3 | version: 2
4 | updates:
5 | - package-ecosystem: "github-actions"
6 | directory: "/"
7 | schedule:
8 | # Check for updates to GitHub Actions every weekday
9 | interval: "weekly"
10 |
--------------------------------------------------------------------------------
/dask/tests/warning_aliases.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | try:
4 | from sqlalchemy.exc import RemovedIn20Warning
5 | except ImportError:
6 |
7 | class _RemovedIn20Warning(Warning):
8 | pass
9 |
10 | RemovedIn20Warning = _RemovedIn20Warning
11 |
--------------------------------------------------------------------------------
/.github/workflows/label-prs.yml:
--------------------------------------------------------------------------------
1 | name: "PR Labeler"
2 | on:
3 | - pull_request_target
4 |
5 | jobs:
6 | label:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/labeler@main
10 | with:
11 | repo-token: "${{ secrets.GITHUB_TOKEN }}"
12 | sync-labels: false
13 |
--------------------------------------------------------------------------------
/dask/bag/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | def assert_eq(a, b, scheduler="sync"):
5 | if hasattr(a, "compute"):
6 | a = a.compute(scheduler=scheduler)
7 | if hasattr(b, "compute"):
8 | b = b.compute(scheduler=scheduler)
9 |
10 | assert a == b
11 |
--------------------------------------------------------------------------------
/dask/diagnostics/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.callbacks import Callback
4 | from dask.diagnostics.profile import CacheProfiler, Profiler, ResourceProfiler
5 | from dask.diagnostics.profile_visualize import visualize
6 | from dask.diagnostics.progress import ProgressBar
7 |
--------------------------------------------------------------------------------
/.github/workflows/label-all.yml:
--------------------------------------------------------------------------------
1 | name: "Issue and PR Labeler"
2 | on:
3 | pull_request:
4 | types: [opened]
5 | issues:
6 | types: [opened, reopened]
7 | jobs:
8 | label-all-on-open:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: andymckay/labeler@1.0.4
12 | with:
13 | add-labels: "needs triage"
14 | ignore-if-labeled: false
15 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
2 | version: 2
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.11"
7 |
8 | sphinx:
9 | configuration: docs/source/conf.py
10 | fail_on_warning: true
11 |
12 | python:
13 | install:
14 | - requirements: docs/requirements-docs.txt
15 | - method: pip
16 | path: .
17 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include dask *.py
2 | recursive-include dask *.j2
3 | recursive-include docs/source *
4 | include docs/Makefile docs/make.bat
5 |
6 | include setup.py
7 | include README.rst
8 | include MANIFEST.in
9 | include dask/dask.yaml
10 | include dask/dask-schema.yaml
11 | include dask/py.typed
12 |
13 | include versioneer.py
14 | include dask/_version.py
15 |
16 | include conftest.py
17 |
--------------------------------------------------------------------------------
/continuous_integration/scripts/run_tests.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -e
4 |
5 | if [[ $PARALLEL == 'true' ]]; then
6 | export XTRATESTARGS="-n4 $XTRATESTARGS"
7 | fi
8 |
9 | if [[ $COVERAGE == 'true' ]]; then
10 | export XTRATESTARGS="--cov=dask --cov-report=xml $XTRATESTARGS"
11 | fi
12 |
13 | echo "py.test dask --runslow $XTRATESTARGS"
14 | py.test dask --runslow $XTRATESTARGS
15 |
16 | set +e
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .hypothesis
2 | *.py[cod]
3 | __pycache__/
4 | *.egg-info
5 | .mypy_cache
6 | dask-worker-space/
7 | docs/build
8 | docs/source/generated
9 | build/
10 | dist/
11 | .idea/
12 | log.*
13 | log
14 | .pytest_cache/
15 | .coverage
16 | .coverage.*
17 | coverage.xml
18 | .DS_Store
19 | *.sqlite
20 | *.swp
21 | *.swo
22 | .cache/
23 | hdfs-initialized-indicator
24 | .ipynb_checkpoints
25 | .vscode/
26 | .history
27 |
--------------------------------------------------------------------------------
/.github/workflows/pre-commit.yml:
--------------------------------------------------------------------------------
1 | name: Linting
2 |
3 | on:
4 | push:
5 | branches: main
6 | pull_request:
7 | branches: main
8 |
9 | jobs:
10 | checks:
11 | name: pre-commit hooks
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v3.5.3
15 | - uses: actions/setup-python@v4
16 | with:
17 | python-version: '3.9'
18 | - uses: pre-commit/action@v3.0.0
19 |
--------------------------------------------------------------------------------
/docs/source/internals.rst:
--------------------------------------------------------------------------------
1 | Dask Internals
2 | ==============
3 |
4 | This section is intended for contributors and power users who are interested in
5 | learning more about how Dask works internally.
6 |
7 | .. toctree::
8 | :maxdepth: 1
9 |
10 | user-interfaces.rst
11 | understanding-performance.rst
12 | phases-of-computation.rst
13 | order.rst
14 | caching.rst
15 | shared.rst
16 | scheduling-policy.rst
17 |
--------------------------------------------------------------------------------
/docs/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | numpydoc
2 | sphinx>=4.0.0
3 | dask-sphinx-theme>=3.0.0
4 | sphinx-click
5 | sphinx-copybutton
6 | sphinx-remove-toctrees
7 | sphinx_autosummary_accessors
8 | sphinx-tabs
9 | sphinx-design
10 | jupyter_sphinx
11 | toolz
12 | cloudpickle>=1.5.0
13 | pandas>=1.4.0
14 | git+https://github.com/dask/distributed
15 | fsspec
16 | scipy
17 | pytest
18 | pytest-check-links
19 | requests-cache
20 | ipython
21 | ipykernel<6.22.0
22 |
--------------------------------------------------------------------------------
/docs/source/_static/theme_overrides.css:
--------------------------------------------------------------------------------
1 | /* override table width restrictions */
2 | @media screen and (min-width: 767px) {
3 |
4 | .wy-table-responsive table td {
5 | /* !important prevents the common CSS stylesheets from overriding
6 | this as on RTD they are loaded after this stylesheet */
7 | white-space: normal !important;
8 | }
9 |
10 | .wy-table-responsive {
11 | overflow: visible !important;
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/docs/source/debugging-performance.rst:
--------------------------------------------------------------------------------
1 | Debugging and Performance
2 | ==========================
3 |
4 | This section contains resources to help you debug and understand performance.
5 |
6 |
7 | .. toctree::
8 | :maxdepth: 1
9 |
10 | how-to/debug.rst
11 | Visualize task graphs
12 | Dashboard
13 | diagnostics-local.rst
14 | diagnostics-distributed.rst
15 | Phases of computation
16 |
--------------------------------------------------------------------------------
/.github/labeler.yml:
--------------------------------------------------------------------------------
1 | dataframe:
2 | - dask/dataframe/*
3 | - dask/dataframe/**/*
4 |
5 | array:
6 | - dask/array/*
7 | - dask/array/**/*
8 |
9 | io:
10 | - dask/dataframe/io/*
11 | - dask/dataframe/io/**/*
12 |
13 | documentation:
14 | - docs/*
15 | - docs/**/*
16 |
17 | dispatch:
18 | - dask/array/backends.py
19 | - dask/array/dispatch.py
20 | - dask/dataframe/backends.py
21 | - dask/dataframe/dispatch.py
22 | - dask/dataframe/extensions.py
23 |
--------------------------------------------------------------------------------
/dask/tests/test_compatibility.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from dask._compatibility import entry_points
6 |
7 |
8 | def test_deprecation():
9 | with pytest.warns(DeprecationWarning):
10 | from dask.compatibility import _EMSCRIPTEN # noqa
11 |
12 |
13 | def test_entry_points():
14 | with pytest.warns(DeprecationWarning):
15 | assert "pytest" in [ep.name for ep in entry_points(group="console_scripts")]
16 |
--------------------------------------------------------------------------------
/dask/tests/test_ml.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | def test_basic():
5 | try:
6 | import dask_ml # noqa: F401
7 | except ImportError:
8 | try:
9 | from dask.ml.model_selection import GridSearchCV # noqa: F401
10 | except ImportError as e:
11 | assert "conda install dask-ml" in str(e)
12 | else:
13 | assert False
14 | else:
15 | from dask.ml.model_selection import GridSearchCV # noqa: F401
16 |
--------------------------------------------------------------------------------
/continuous_integration/environment-mindeps-non-optional.yaml:
--------------------------------------------------------------------------------
1 | name: test-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | # required dependencies
6 | - packaging=20.0
7 | - python=3.9
8 | - pyyaml=5.3.1
9 | - click=8.0
10 | - cloudpickle=1.5.0
11 | - partd=1.2.0
12 | - fsspec=2021.09.0
13 | - importlib-metadata=4.13.0
14 | - toolz=0.10.0
15 | # test dependencies
16 | - pre-commit
17 | - pytest
18 | - pytest-cov
19 | - pytest-rerunfailures
20 | - pytest-xdist
21 |
--------------------------------------------------------------------------------
/docs/source/how-to/index.rst:
--------------------------------------------------------------------------------
1 | How To...
2 | =========
3 |
4 | This section contains snippets and suggestions about how to perform different actions
5 | using Dask. If you have an idea of a how-to that we should add, please
6 | `make a suggestion `_!
7 |
8 | .. Articles in this section should be short and not contain much explanation.
9 |
10 | .. toctree::
11 | :caption: How To...
12 | :maxdepth: 1
13 | :glob:
14 |
15 | *
16 | Use GPUs <../gpu.rst>
17 |
--------------------------------------------------------------------------------
/dask/ml.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | def __getattr__(value):
5 | try:
6 | import dask_ml
7 | except ImportError as e:
8 | msg = (
9 | "Dask-ML is not installed.\n\n"
10 | "Please either conda or pip install dask-ml:\n\n"
11 | " conda install dask-ml # either conda install\n"
12 | " python -m pip install dask-ml --upgrade # or pip install"
13 | )
14 | raise ImportError(msg) from e
15 | return getattr(dask_ml, value)
16 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_methods.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 | import dask.dataframe.methods as methods
7 | from dask.dataframe._compat import PANDAS_GE_140
8 |
9 |
10 | def test_assign_not_modifying_array_inplace():
11 | df = pd.DataFrame({"a": [1, 2, 3], "b": 1.5})
12 | result = methods.assign(df, "a", 5)
13 | assert not np.shares_memory(df["a"].values, result["a"].values)
14 | if PANDAS_GE_140:
15 | assert np.shares_memory(df["b"].values, result["b"].values)
16 |
--------------------------------------------------------------------------------
/continuous_integration/environment-mindeps-array.yaml:
--------------------------------------------------------------------------------
1 | name: test-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | # required dependencies
6 | - packaging=20.0
7 | - python=3.9
8 | - pyyaml=5.3.1
9 | - click=8.0
10 | - cloudpickle=1.5.0
11 | - partd=1.2.0
12 | - fsspec=2021.09.0
13 | - importlib-metadata=4.13.0
14 | - toolz=0.10.0
15 | # optional dependencies pulled in by pip install dask[array]
16 | - numpy=1.21
17 | # test dependencies
18 | - pre-commit
19 | - pytest
20 | - pytest-cov
21 | - pytest-rerunfailures
22 | - pytest-xdist
23 |
--------------------------------------------------------------------------------
/dask/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask import config, datasets
4 | from dask._version import get_versions
5 | from dask.base import (
6 | annotate,
7 | compute,
8 | get_annotations,
9 | is_dask_collection,
10 | optimize,
11 | persist,
12 | visualize,
13 | )
14 | from dask.core import istask
15 | from dask.delayed import delayed
16 | from dask.local import get_sync as get
17 |
18 | versions = get_versions()
19 | __version__ = versions["version"]
20 | __git_revision__ = versions["full-revisionid"]
21 | del get_versions, versions
22 |
--------------------------------------------------------------------------------
/.github/release.yml:
--------------------------------------------------------------------------------
1 | # .github/release.yml
2 |
3 | changelog:
4 | categories:
5 | - title: New Features
6 | labels:
7 | - feature
8 | - title: Enhancements
9 | labels:
10 | - enhancement
11 | - title: Bug Fixes
12 | labels:
13 | - bug
14 | - title: Deprecations
15 | labels:
16 | - deprecation
17 | - title: Documentation
18 | labels:
19 | - documentation
20 | - title: Maintenance
21 | labels:
22 | - tests
23 | - hygiene
24 | - title: Misc
25 | labels:
26 | - "*"
27 |
--------------------------------------------------------------------------------
/dask/compatibility.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | from dask._compatibility import EMSCRIPTEN as _EMSCRIPTEN # noqa
6 | from dask._compatibility import PY_VERSION as _PY_VERSION # noqa
7 | from dask._compatibility import entry_points, parse_version # noqa
8 |
9 | warnings.warn(
10 | "`dask.compatibility` is not intended for external use and has been renamed to `dask._compatibility`. "
11 | "This backward-compatible shim will be removed in a future release. Please find an alternative.",
12 | DeprecationWarning,
13 | stacklevel=2,
14 | )
15 |
--------------------------------------------------------------------------------
/docs/README.rst:
--------------------------------------------------------------------------------
1 | To build a local copy of the Dask documentation, install the packages in
2 | ``requirements-docs.txt`` and run ``make html``.
3 |
4 | Optionally create and activate a ``conda`` environment first::
5 |
6 | conda create -n daskdocs -c conda-forge python=3.11
7 | conda activate daskdocs
8 |
9 | Install the dependencies with ``pip``::
10 |
11 | python -m pip install -r requirements-docs.txt
12 |
13 | After running ``make html`` the generated HTML documentation can be found in
14 | the ``build/html`` directory. Open ``build/html/index.html`` to view the home
15 | page for the documentation.
16 |
--------------------------------------------------------------------------------
/continuous_integration/environment-mindeps-dataframe.yaml:
--------------------------------------------------------------------------------
1 | name: test-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | # required dependencies
6 | - packaging=20.0
7 | - python=3.9
8 | - pyyaml=5.3.1
9 | - click=8.0
10 | - cloudpickle=1.5.0
11 | - partd=1.2.0
12 | - fsspec=2021.09.0
13 | - importlib-metadata=4.13.0
14 | - toolz=0.10.0
15 | # optional dependencies pulled in by pip install dask[dataframe]
16 | - numpy=1.21
17 | - pandas=1.3
18 | # test dependencies
19 | - pre-commit
20 | - pytest
21 | - pytest-cov
22 | - pytest-rerunfailures
23 | - pytest-xdist
24 |
--------------------------------------------------------------------------------
/dask/dataframe/io/orc/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | class ORCEngine:
5 | """The API necessary to provide a new ORC reader/writer"""
6 |
7 | @classmethod
8 | def read_metadata(
9 | cls, fs, paths, columns, index, split_stripes, aggregate_files, **kwargs
10 | ):
11 | raise NotImplementedError()
12 |
13 | @classmethod
14 | def read_partition(cls, fs, part, columns, **kwargs):
15 | raise NotImplementedError()
16 |
17 | @classmethod
18 | def write_partition(cls, df, path, fs, filename, **kwargs):
19 | raise NotImplementedError
20 |
--------------------------------------------------------------------------------
/dask/array/dispatch.py:
--------------------------------------------------------------------------------
1 | """
2 | Dispatch in dask.array.
3 |
4 | Also see backends.py
5 | """
6 |
7 | from __future__ import annotations
8 |
9 | from dask.utils import Dispatch
10 |
11 | concatenate_lookup = Dispatch("concatenate")
12 | tensordot_lookup = Dispatch("tensordot")
13 | einsum_lookup = Dispatch("einsum")
14 | empty_lookup = Dispatch("empty")
15 | divide_lookup = Dispatch("divide")
16 | percentile_lookup = Dispatch("percentile")
17 | numel_lookup = Dispatch("numel")
18 | nannumel_lookup = Dispatch("nannumel")
19 | to_numpy_dispatch = Dispatch("to_numpy_dispatch")
20 | to_cupy_dispatch = Dispatch("to_cupy_dispatch")
21 |
--------------------------------------------------------------------------------
/dask/bytes/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import bz2
4 | import gzip
5 | import io
6 | import lzma
7 | import zipfile
8 |
9 |
10 | def zip_compress(data):
11 | """Write data into zipfile and return the bytes"""
12 | out = io.BytesIO()
13 | with zipfile.ZipFile(file=out, mode="w") as z:
14 | with z.open("myfile", "w") as zf:
15 | zf.write(data)
16 | out.seek(0)
17 | return out.read()
18 |
19 |
20 | compress = {
21 | "gzip": gzip.compress,
22 | "bz2": bz2.compress,
23 | None: lambda x: x,
24 | "xz": lzma.compress,
25 | "zip": zip_compress,
26 | }
27 |
--------------------------------------------------------------------------------
/dask/array/tests/test_testing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 |
5 | import numpy as np
6 | import pytest
7 |
8 | import dask.array as da
9 | from dask.array.utils import assert_eq
10 |
11 |
12 | @pytest.mark.skipif(bool(sys.flags.optimize), reason="Assertions disabled.")
13 | def test_assert_eq_checks_scalars():
14 | # https://github.com/dask/dask/issues/2680
15 | with pytest.raises(AssertionError):
16 | assert_eq(np.array(0), np.array(1))
17 |
18 | a = da.from_array(np.array([0]), 1)[0]
19 | b = np.array([1])[0]
20 | with pytest.raises(AssertionError):
21 | assert_eq(a, b)
22 |
--------------------------------------------------------------------------------
/docs/source/_static/style.css:
--------------------------------------------------------------------------------
1 | .configTextArea {
2 | font-family: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace;
3 | margin-bottom: 24px;
4 | }
5 |
6 | .classifier::before {
7 | content: ": ";
8 | }
9 |
10 | /* options for jupyter-sphinx extension */
11 | div.jupyter_container {
12 | box-shadow: None;
13 | font-family: var(--pst-font-family-monospace);
14 | border-radius: 0.4em;
15 | }
16 |
17 | .jupyter_container div.code_cell {
18 | padding: 10px;
19 | max-width: None !important;
20 | }
21 |
22 | .jupyter_container .output {
23 | font-size: 16px;
24 | padding: 10px
25 | }
26 |
--------------------------------------------------------------------------------
/dask/bytes/tests/test_compression.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from io import BytesIO
4 |
5 | import pytest
6 | from fsspec.compression import compr
7 |
8 | from dask.bytes.utils import compress
9 |
10 |
11 | @pytest.mark.parametrize("fmt,File", compr.items())
12 | def test_files(fmt, File):
13 | if fmt not in compress:
14 | pytest.skip("compression function not provided")
15 | if fmt is None:
16 | return
17 | data = b"1234" * 1000
18 | compressed = compress[fmt](data)
19 |
20 | b = BytesIO(compressed)
21 | g = File(b, mode="rb")
22 | data2 = g.read()
23 | g.close()
24 | assert data == data2
25 |
--------------------------------------------------------------------------------
/continuous_integration/environment-mindeps-distributed.yaml:
--------------------------------------------------------------------------------
1 | name: test-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | # required dependencies
6 | - packaging=20.0
7 | - python=3.9
8 | - pyyaml=5.3.1
9 | - click=8.0
10 | - cloudpickle=1.5.0
11 | - partd=1.2.0
12 | - fsspec=2021.09.0
13 | - importlib-metadata=4.13.0
14 | - toolz=0.10.0
15 | # optional dependencies pulled in by pip install dask[distributed]
16 | - pip
17 | - pip:
18 | - git+https://github.com/dask/distributed
19 | # test dependencies
20 | - pre-commit
21 | - pytest
22 | - pytest-cov
23 | - pytest-rerunfailures
24 | - pytest-timeout
25 | - pytest-xdist
26 |
--------------------------------------------------------------------------------
/dask/dataframe/extensions.py:
--------------------------------------------------------------------------------
1 | """
2 | Support for pandas ExtensionArray in dask.dataframe.
3 |
4 | See :ref:`extensionarrays` for more.
5 | """
6 | from __future__ import annotations
7 |
8 | from dask.dataframe.accessor import (
9 | register_dataframe_accessor,
10 | register_index_accessor,
11 | register_series_accessor,
12 | )
13 | from dask.utils import Dispatch
14 |
15 | make_array_nonempty = Dispatch("make_array_nonempty")
16 | make_scalar = Dispatch("make_scalar")
17 |
18 |
19 | __all__ = [
20 | "make_array_nonempty",
21 | "make_scalar",
22 | "register_dataframe_accessor",
23 | "register_index_accessor",
24 | "register_series_accessor",
25 | ]
26 |
--------------------------------------------------------------------------------
/dask/array/tests/test_cupy_gufunc.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pytest
5 |
6 | pytestmark = pytest.mark.gpu
7 |
8 | import dask.array as da
9 | from dask.array.gufunc import apply_gufunc
10 | from dask.array.utils import assert_eq
11 |
12 | cupy = pytest.importorskip("cupy")
13 |
14 |
15 | def test_apply_gufunc_axis():
16 | def mydiff(x):
17 | return np.diff(x)
18 |
19 | a = cupy.random.default_rng().standard_normal((3, 6, 4))
20 | da_ = da.from_array(a, chunks=2, asarray=False)
21 |
22 | m = np.diff(a, axis=1)
23 | dm = apply_gufunc(
24 | mydiff, "(i)->(i)", da_, axis=1, output_sizes={"i": 5}, allow_rechunk=True
25 | )
26 | assert_eq(m, dm)
27 |
--------------------------------------------------------------------------------
/dask/_compatibility.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 | import warnings
5 |
6 | from importlib_metadata import entry_points as _entry_points
7 | from packaging.version import parse as parse_version
8 |
9 | PY_VERSION = parse_version(".".join(map(str, sys.version_info[:3])))
10 |
11 | EMSCRIPTEN = sys.platform == "emscripten"
12 |
13 |
14 | def entry_points(group=None):
15 | warnings.warn(
16 | "`dask._compatibility.entry_points` has been replaced by `importlib_metadata.entry_points` and will be removed "
17 | "in a future version. Please use `importlib_metadata.entry_points` instead.",
18 | DeprecationWarning,
19 | stacklevel=2,
20 | )
21 | return _entry_points(group=group)
22 |
--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # .git-blame-ignore-revs
2 | # absolufy-imports - No relative - PEP8 (#8796)
3 | cccb9d8d8e33a891396b1275c2448c352ef40c27
4 |
5 | # Update `pre-commit` version (#8691)
6 | 510bbc380531cbf56a409f1ae68e6fd84a9599e6
7 |
8 | # Run pyupgrade in CI (#8246)
9 | 80a82008d5b02a08f6ff59d802defcc43247eb1a
10 |
11 | # Bump pre-commit hook versions (#7676)
12 | d6bbbb08c92652eae2820e93edc2f3fe502391d3
13 |
14 | # Start adding isort (#7370)
15 | a31c0fc72e1cc59b8b0254965824abb0718c5f56
16 |
17 | # Rerun with latest black release (#6568)
18 | 64e2a9b3b9992503221a074a547827501927d1fa
19 |
20 | # LINT: Fixup black string normalization (#5227)
21 | d92f4015a1da3da10c04c682ed2acae8469e9576
22 |
23 | # Apply Black formatting (#4983)
24 | 7e4beffb339c69278091d4e305c2ae18ddf8c74f
25 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | codecov:
2 | require_ci_to_pass: yes
3 | # codecov pushes a failing status update to github actions before all the
4 | # test runs have completed (this is later updated to passing after more test
5 | # runs pass, but the initial red X is annoying). As far as I can tell from
6 | # https://docs.codecov.com/docs/merging-reports this shouldn't be happening,
7 | # but it is. Here we set a minimum number of builds before notifying in the
8 | # hopes that it will stop this behavior.
9 | notify:
10 | after_n_builds: 10
11 |
12 | coverage:
13 | precision: 2
14 | round: down
15 | range: "90...100"
16 |
17 | status:
18 | project:
19 | default:
20 | target: 90%
21 | threshold: 1%
22 | patch: no
23 | changes: no
24 |
25 | comment: off
26 |
--------------------------------------------------------------------------------
/.github/workflows/stale-bot.yaml:
--------------------------------------------------------------------------------
1 | name: 'Label stale issues and PRs'
2 | on:
3 | schedule:
4 | - cron: '30 1 * * 1' # runs once a week
5 |
6 | jobs:
7 | stale:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - uses: actions/stale@v8
11 | with:
12 | stale-issue-message: '' # no comment left if string is empty
13 | stale-pr-message: '' # no comment left if string is empty
14 | days-before-stale: 30
15 | days-before-close: -1
16 | stale-issue-label: 'needs attention'
17 | stale-pr-label: 'needs attention'
18 | exempt-issue-labels: 'good intro to dask,good first issue,Good First Issue,good second issue,feature request'
19 | exempt-draft-pr: true
20 | start-date: '2020-04-18T00:00:00Z' # ignore before this date, ISO 8601 or RFC 2822
21 |
--------------------------------------------------------------------------------
/dask/distributed.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from __future__ import annotations
4 |
5 | _import_error_message = (
6 | "dask.distributed is not installed.\n\n"
7 | "Please either conda or pip install distributed:\n\n"
8 | " conda install dask distributed # either conda install\n"
9 | ' python -m pip install "dask[distributed]" --upgrade # or pip install'
10 | )
11 |
12 | try:
13 | from distributed import *
14 | except ImportError as e:
15 | if e.msg == "No module named 'distributed'":
16 | raise ImportError(_import_error_message) from e
17 | else:
18 | raise
19 |
20 |
21 | def __getattr__(value):
22 | try:
23 | import distributed
24 | except ImportError as e:
25 | raise ImportError(_import_error_message) from e
26 | return getattr(distributed, value)
27 |
--------------------------------------------------------------------------------
/dask/dataframe/io/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from dask.dataframe.io import demo
4 | from dask.dataframe.io.csv import read_csv, read_fwf, read_table, to_csv
5 | from dask.dataframe.io.hdf import read_hdf, to_hdf
6 | from dask.dataframe.io.io import (
7 | from_array,
8 | from_dask_array,
9 | from_delayed,
10 | from_dict,
11 | from_map,
12 | from_pandas,
13 | to_backend,
14 | to_bag,
15 | to_records,
16 | )
17 | from dask.dataframe.io.json import read_json, to_json
18 | from dask.dataframe.io.sql import read_sql, read_sql_query, read_sql_table, to_sql
19 |
20 | try:
21 | from dask.dataframe.io.parquet import read_parquet, to_parquet
22 | except ImportError:
23 | pass
24 |
25 | try:
26 | from dask.dataframe.io.orc import read_orc, to_orc
27 | except ImportError:
28 | pass
29 |
--------------------------------------------------------------------------------
/dask/tests/test_backends.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | import dask
6 |
7 |
8 | @pytest.mark.gpu
9 | @pytest.mark.parametrize("backend", ["pandas", "cudf"])
10 | def test_CreationDispatch_error_informative_message(backend):
11 | # Check that an informative error is emitted when a backend dispatch
12 | # method fails
13 | pytest.importorskip(backend)
14 | dd = pytest.importorskip("dask.dataframe")
15 | data = {"a": [1, 2, 3, 4], "B": [10, 11, 12, 13]}
16 | with dask.config.set({"dataframe.backend": backend}):
17 | with pytest.raises(TypeError) as excinfo:
18 | dd.from_dict(data, npartitions=2, unsupported_kwarg=True)
19 |
20 | msg = str(excinfo.value)
21 | assert "error occurred while calling the from_dict method" in msg
22 | assert backend in msg
23 |
--------------------------------------------------------------------------------
/dask/widgets/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | try:
4 | from dask.widgets.widgets import (
5 | FILTERS,
6 | TEMPLATE_PATHS,
7 | get_environment,
8 | get_template,
9 | )
10 |
11 | except ImportError as e:
12 | msg = (
13 | "Dask diagnostics requirements are not installed.\n\n"
14 | "Please either conda or pip install as follows:\n\n"
15 | " conda install dask # either conda install\n"
16 | ' python -m pip install "dask[diagnostics]" --upgrade # or python -m pip install'
17 | )
18 | exception = e # Explicit reference for e as it will be lost outside the try block
19 | FILTERS = {}
20 | TEMPLATE_PATHS = []
21 |
22 | def get_environment():
23 | raise ImportError(msg) from exception
24 |
25 | def get_template(name: str):
26 | raise ImportError(msg) from exception
27 |
--------------------------------------------------------------------------------
/dask/bag/chunk.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 |
4 | def barrier(*args):
5 | return None
6 |
7 |
8 | def getitem(x, key):
9 | """Like :func:`operator.getitem`, but allows setting key using partial
10 | ``partial(chunk.getitem, key=key)``
11 | """
12 | return x[key]
13 |
14 |
15 | def foldby_combine2(combine, acc, x):
16 | return combine(acc, x[1])
17 |
18 |
19 | def groupby_tasks_group_hash(x, hash, grouper):
20 | return hash(grouper(x)), x
21 |
22 |
23 | def var_chunk(seq):
24 | squares, total, n = 0.0, 0.0, 0
25 | for x in seq:
26 | squares += x**2
27 | total += x
28 | n += 1
29 | return squares, total, n
30 |
31 |
32 | def var_aggregate(x, ddof):
33 | squares, totals, counts = list(zip(*x))
34 | x2, x, n = float(sum(squares)), float(sum(totals)), sum(counts)
35 | result = (x2 / n) - (x / n) ** 2
36 | return result * n / (n - ddof)
37 |
--------------------------------------------------------------------------------
/docs/source/how-to/setup-prometheus.rst:
--------------------------------------------------------------------------------
1 | .. When modifying the contents of this page, please adjust the corresponding page in the dask.distributed documentation accordingly.
2 |
3 | Setup Prometheus monitoring
4 | ===========================
5 |
6 | Prometheus_ is a widely popular tool for monitoring and alerting a wide variety of
7 | systems. A distributed cluster offers a number of Prometheus metrics if the
8 | prometheus_client_ package is installed. The metrics are exposed in Prometheus'
9 | text-based format at the ``/metrics`` endpoint on both schedulers and workers.
10 |
11 |
12 | Available metrics
13 | -----------------
14 |
15 | Apart from the metrics exposed per default by the prometheus_client_, schedulers and
16 | workers expose a number of Dask-specific metrics.
17 | See the `dask.distributed documentation
18 | `_ for details.
19 |
20 |
21 | .. _Prometheus: https://prometheus.io
22 | .. _prometheus_client: https://github.com/prometheus/client_python
23 |
--------------------------------------------------------------------------------
/dask/bag/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | try:
4 | from dask.bag.avro import read_avro
5 | from dask.bag.core import Bag, Item
6 | from dask.bag.core import bag_map as map
7 | from dask.bag.core import bag_range as range
8 | from dask.bag.core import bag_zip as zip
9 | from dask.bag.core import (
10 | concat,
11 | from_delayed,
12 | from_sequence,
13 | from_url,
14 | map_partitions,
15 | to_textfiles,
16 | )
17 | from dask.bag.text import read_text
18 | from dask.bag.utils import assert_eq
19 | from dask.base import compute
20 | except ImportError as e:
21 | msg = (
22 | "Dask bag requirements are not installed.\n\n"
23 | "Please either conda or pip install as follows:\n\n"
24 | " conda install dask # either conda install\n"
25 | ' python -m pip install "dask[bag]" --upgrade # or python -m pip install'
26 | )
27 | raise ImportError(str(e) + "\n\n" + msg) from e
28 |
--------------------------------------------------------------------------------
/docs/source/logos.rst:
--------------------------------------------------------------------------------
1 | :orphan:
2 |
3 | Images and Logos
4 | ================
5 |
6 | Here are some commonly used Dask icons and logos
7 | (see the Dask style guide for more details).
8 |
9 | .. image:: images/dask_icon.svg
10 | :alt: Primary Dask icon.
11 |
12 | .. image:: images/dask_icon_black.svg
13 | :alt: Dask icon in black.
14 |
15 | .. image:: images/dask_icon_white.svg
16 | :alt: Dask icon in white.
17 |
18 | .. image:: images/dask_icon_on_pink.svg
19 | :alt: Dask icon to use on a pink background.
20 |
21 | .. image:: images/dask_horizontal.svg
22 | :alt: Primary Dask logo.
23 |
24 | .. image:: images/dask_horizontal_black.svg
25 | :alt: Dask logo in black.
26 |
27 | .. image:: images/dask_horizontal_white.svg
28 | :alt: Dask logo in white.
29 |
30 | .. image:: images/dask_horizontal_on_pink.svg
31 | :alt: Dask logo to use on a pink background.
32 |
33 | .. image:: images/dask_horizontal_on_blue.svg
34 | :alt: Dask logo to use on a blue background.
35 |
--------------------------------------------------------------------------------
/dask/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 |
5 | import pytest
6 |
7 |
8 | def test_development_guidelines_matches_ci():
9 | """When the environment.yaml changes in CI, make sure to change it in the docs as well"""
10 | root_dir = Path(__file__).parent.parent.parent
11 |
12 | if not (root_dir / ".github" / "workflows").exists():
13 | pytest.skip("Test can only be run on an editable install")
14 |
15 | development_doc_file = root_dir / "docs" / "source" / "develop.rst"
16 | additional_ci_file = root_dir / ".github" / "workflows" / "additional.yml"
17 | upstream_ci_file = root_dir / ".github" / "workflows" / "upstream.yml"
18 | latest_env = "environment-3.10.yaml"
19 |
20 | for filename in [development_doc_file, additional_ci_file, upstream_ci_file]:
21 | with open(filename, encoding="utf8") as f:
22 | assert any(
23 | latest_env in line for line in f
24 | ), f"{latest_env} not found in {filename}"
25 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_boolean.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 |
5 | import dask.dataframe as dd
6 |
7 |
8 | def test_meta():
9 | values = pd.array([True, False, None], dtype="boolean")
10 | ds = dd.from_pandas(pd.Series(values), 2)
11 | assert ds.dtype == pd.BooleanDtype()
12 |
13 | dd.utils.assert_eq(ds._meta_nonempty, pd.Series([True, pd.NA], dtype="boolean"))
14 |
15 | ddf = dd.from_pandas(pd.DataFrame({"A": values}), 2)
16 | assert ddf.dtypes["A"] == pd.BooleanDtype()
17 |
18 | dd.utils.assert_eq(
19 | ddf._meta_nonempty,
20 | pd.DataFrame({"A": pd.array([True, pd.NA], dtype="boolean")}),
21 | )
22 |
23 |
24 | def test_ops():
25 | s1 = pd.Series(pd.array([True, False, None] * 3, dtype="boolean"))
26 | s2 = pd.Series(pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean"))
27 |
28 | ds1 = dd.from_pandas(s1, 2)
29 | ds2 = dd.from_pandas(s2, 2)
30 |
31 | dd.utils.assert_eq(ds1 | ds2, s1 | s2)
32 | dd.utils.assert_eq(ds1 & ds2, s1 & s2)
33 | dd.utils.assert_eq(ds1 ^ ds2, s1 ^ s2)
34 |
--------------------------------------------------------------------------------
/docs/source/dashboard-progress-script.py:
--------------------------------------------------------------------------------
1 | """
2 | This script was run to produce some of the screenshots on https://docs.dask.org/en/stable/dashboard.html
3 | """
4 | from __future__ import annotations
5 |
6 | import time
7 |
8 | from dask import delayed
9 | from dask.distributed import Client, wait
10 |
11 |
12 | @delayed
13 | def inc(x):
14 | time.sleep(0.1)
15 | return x + 1
16 |
17 |
18 | @delayed
19 | def double(x):
20 | time.sleep(0.1)
21 | return 2 * x
22 |
23 |
24 | @delayed
25 | def add(x, y):
26 | time.sleep(0.1)
27 | return x + y
28 |
29 |
30 | if __name__ == "__main__":
31 | with Client(n_workers=4, threads_per_worker=2, memory_limit="4 GiB") as client:
32 | while True:
33 | data = list(range(1000))
34 | output = []
35 | for x in data:
36 | a = inc(x)
37 | b = double(x)
38 | c = add(a, b)
39 | output.append(c)
40 |
41 | total = delayed(sum)(output)
42 | total = total.persist()
43 | wait(total)
44 | time.sleep(5)
45 | del total
46 | time.sleep(2)
47 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | # flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234
2 | [flake8]
3 | # References:
4 | # https://flake8.readthedocs.io/en/latest/user/configuration.html
5 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html
6 | # https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
7 | exclude = __init__.py
8 | ignore =
9 | # Extra space in brackets
10 | E20
11 | # Multiple spaces around ","
12 | E231,E241
13 | # Comments
14 | E26
15 | # Import formatting
16 | E4
17 | # Comparing types instead of isinstance
18 | E721
19 | # Assigning lambda expression
20 | E731
21 | # Ambiguous variable names
22 | E741
23 | # Line break before binary operator
24 | W503
25 | # Line break after binary operator
26 | W504
27 | # Redefinition of unused 'loop' from line 10
28 | F811
29 | # No explicit stacklevel in warnings.warn. FIXME we should correct this in the code
30 | B028
31 |
32 | max-line-length = 120
33 | per-file-ignores =
34 | *_test.py:
35 | # Do not call assert False since python -O removes these calls
36 | B011,
37 | **/tests/*:
38 | # Do not call assert False since python -O removes these calls
39 | B011,
40 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_optimize_dataframe.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 |
5 | import dask
6 | import dask.dataframe as dd
7 |
8 | dsk = {
9 | ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]),
10 | ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]),
11 | ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]),
12 | }
13 | dfs = list(dsk.values())
14 |
15 |
16 | def test_fuse_ave_width():
17 | df = pd.DataFrame({"x": range(10)})
18 | df = dd.from_pandas(df, npartitions=5)
19 |
20 | s = (df.x + 1) + (df.x + 2)
21 |
22 | with dask.config.set({"optimization.fuse.ave-width": 4}):
23 | a = s.__dask_optimize__(s.dask, s.__dask_keys__())
24 |
25 | b = s.__dask_optimize__(s.dask, s.__dask_keys__())
26 |
27 | assert len(a) <= 15
28 | assert len(b) <= 15
29 |
30 |
31 | def test_optimize_blockwise():
32 | from dask.array.optimization import optimize_blockwise
33 |
34 | df = pd.DataFrame({"x": range(10), "y": range(10)})
35 | ddf = dd.from_pandas(df, npartitions=2)
36 |
37 | for _ in range(10):
38 | ddf["x"] = ddf.x + 1 + ddf.y
39 |
40 | graph = optimize_blockwise(ddf.dask)
41 |
42 | assert len(graph) <= 4
43 |
--------------------------------------------------------------------------------
/dask/array/tests/test_numpy_compat.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pytest
5 |
6 | import dask.array as da
7 | from dask.array.utils import assert_eq
8 |
9 |
10 | @pytest.fixture(
11 | params=[
12 | [("A", ("f4", (3, 2))), ("B", ("f4", 3)), ("C", ("f8", 3))],
13 | [("A", ("i4", (3, 2))), ("B", ("f4", 3)), ("C", ("S4", 3))],
14 | ]
15 | )
16 | def dtype(request):
17 | return np.dtype(request.param)
18 |
19 |
20 | @pytest.fixture(params=[["A"], ["A", "B"], ["A", "B", "C"]])
21 | def index(request):
22 | return request.param
23 |
24 |
25 | def test_basic():
26 | # sanity check
27 | dtype = [("a", "f8"), ("b", "f8"), ("c", "f8")]
28 | x = np.ones((5, 3), dtype=dtype)
29 | dx = da.ones((5, 3), dtype=dtype, chunks=3)
30 | result = dx[["a", "b"]]
31 | expected = x[["a", "b"]]
32 | assert_eq(result, expected)
33 |
34 |
35 | def test_min_max_round_funcs():
36 | # Regression test for gh-5031
37 | image = da.from_array(np.array([[0, 1], [1, 2]]), chunks=(1, 2))
38 | # These use __array_function__ (and min/max/round are aliased,
39 | # to amin/amax/round_ in numpy)
40 | assert int(np.min(image)) == 0
41 | assert int(np.max(image)) == 2
42 | assert np.round(image)[1, 1] == 2
43 |
--------------------------------------------------------------------------------
/dask/tests/test_hashing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | from dask.hashing import hash_buffer, hash_buffer_hex, hashers
6 |
7 | np = pytest.importorskip("numpy")
8 |
9 | buffers = [
10 | b"abc",
11 | bytearray(b"123"),
12 | memoryview(b"456"),
13 | np.array(42),
14 | np.ones((100, 100)),
15 | np.zeros((100, 100), dtype=[("a", "i4"), ("b", "i2")]),
16 | np.ones(10000, dtype=np.int8)[1:], # unaligned
17 | ]
18 |
19 |
20 | @pytest.mark.parametrize("x", buffers)
21 | def test_hash_buffer(x):
22 | for hasher in [None] + hashers:
23 | h = hash_buffer(x, hasher=hasher)
24 | assert isinstance(h, bytes)
25 | assert 8 <= len(h) < 32
26 | assert h == hash_buffer(x, hasher=hasher)
27 |
28 |
29 | @pytest.mark.parametrize("x", buffers)
30 | def test_hash_buffer_hex(x):
31 | for hasher in [None] + hashers:
32 | h = hash_buffer_hex(x, hasher=hasher)
33 | assert isinstance(h, str)
34 | assert 16 <= len(h) < 64
35 | assert h == hash_buffer_hex(x, hasher=hasher)
36 |
37 |
38 | @pytest.mark.parametrize("hasher", hashers)
39 | def test_hashers(hasher):
40 | # Sanity check
41 | x = b"x"
42 | h = hasher(x)
43 | assert isinstance(h, bytes)
44 | assert 8 <= len(h) < 32
45 |
--------------------------------------------------------------------------------
/dask/widgets/widgets.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | import html
5 | import os.path
6 |
7 | from jinja2 import Environment, FileSystemLoader, Template
8 | from jinja2.exceptions import TemplateNotFound
9 |
10 | from dask.utils import format_bytes, format_time, format_time_ago, key_split, typename
11 |
12 | FILTERS = {
13 | "datetime_from_timestamp": datetime.datetime.fromtimestamp,
14 | "format_bytes": format_bytes,
15 | "format_time": format_time,
16 | "format_time_ago": format_time_ago,
17 | "html_escape": html.escape,
18 | "key_split": key_split,
19 | "type": type,
20 | "typename": typename,
21 | }
22 |
23 | TEMPLATE_PATHS = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates")]
24 |
25 |
26 | def get_environment() -> Environment:
27 | loader = FileSystemLoader(TEMPLATE_PATHS)
28 | environment = Environment(loader=loader)
29 | environment.filters.update(FILTERS)
30 |
31 | return environment
32 |
33 |
34 | def get_template(name: str) -> Template:
35 | try:
36 | return get_environment().get_template(name)
37 | except TemplateNotFound as e:
38 | raise TemplateNotFound(
39 | f"Unable to find {name} in dask.widgets.TEMPLATE_PATHS {TEMPLATE_PATHS}"
40 | ) from e
41 |
--------------------------------------------------------------------------------
/docs/source/array-stats.rst:
--------------------------------------------------------------------------------
1 | Stats
2 | =====
3 |
4 | Dask Array implements a subset of the `scipy.stats`_ package.
5 |
6 | Statistical Functions
7 | ---------------------
8 |
9 | You can calculate various measures of an array including skewness, kurtosis, and arbitrary moments.
10 |
11 | .. code-block:: python
12 |
13 |    >>> import dask, dask.array as da, dask.array.stats as stats
14 |    >>> rng = da.random.default_rng()
15 |    >>> x = rng.beta(1, 1, size=(1000,), chunks=10)
16 |    >>> k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)]
17 |    >>> dask.compute(k, s, m)
18 |    (1.7612340817172787, -0.064073498030693302, -0.00054523780628304799)
19 |
20 |
21 | Statistical Tests
22 | -----------------
23 |
24 | You can perform basic statistical tests on Dask arrays.
25 | Each of these tests returns a ``dask.delayed`` object wrapping one of the scipy
26 | ``namedtuple`` results.
27 |
28 |
29 | .. code-block:: python
30 |
31 |    >>> rng = da.random.default_rng()
32 |    >>> a = rng.uniform(size=(50,), chunks=(25,))
33 |    >>> b = a + rng.uniform(low=-0.15, high=0.15, size=(50,), chunks=(25,))
34 |    >>> result = stats.ttest_rel(a, b)
35 |    >>> result.compute()
36 |    Ttest_relResult(statistic=-1.5102104380013242, pvalue=0.13741197274874514)
37 |
38 | .. _scipy.stats: https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html
39 |
--------------------------------------------------------------------------------
/dask/tests/test_datasets.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | import dask
6 |
7 |
8 | def test_mimesis():
9 | pytest.importorskip("mimesis")
10 |
11 | b = dask.datasets.make_people()
12 | assert b.take(5)
13 |
14 | assert b.take(3) == b.take(3)
15 |
16 |
17 | def test_full_dataset():
18 | pytest.importorskip("mimesis")
19 | b = dask.datasets.make_people(npartitions=2, records_per_partition=10)
20 | assert b.count().compute() == 20
21 |
22 |
23 | def test_make_dataset_with_processes():
24 | pytest.importorskip("mimesis")
25 | b = dask.datasets.make_people(npartitions=2)
26 | try:
27 | b.compute(scheduler="processes")
28 | except TypeError:
29 | pytest.fail("Failed to execute make_people using processes")
30 |
31 |
32 | def test_no_mimesis():
33 | try:
34 | import mimesis # noqa: F401
35 | except ImportError:
36 | with pytest.raises(Exception) as info:
37 | dask.datasets.make_people()
38 |
39 | assert "python -m pip install mimesis" in str(info.value)
40 |
41 |
42 | def test_deterministic():
43 | pytest.importorskip("mimesis")
44 |
45 | a = dask.datasets.make_people(seed=123)
46 | b = dask.datasets.make_people(seed=123)
47 |
48 | assert a.take(1)[0]["name"] == b.take(1)[0]["name"]
49 |
--------------------------------------------------------------------------------
/dask/tests/test_ci.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 |
5 | import importlib_metadata
6 | import pytest
7 | from packaging.version import Version
8 |
9 |
10 | @pytest.mark.xfail(reason="https://github.com/dask/dask/issues/9735", strict=False)
11 | @pytest.mark.skipif(
12 | not os.environ.get("UPSTREAM_DEV", False),
13 | reason="Only check for dev packages in `upstream` CI build",
14 | )
15 | def test_upstream_packages_installed():
16 | # List of packages should match those specified in
17 | # `continuous_integration/scripts/install.sh`
18 |
19 |     # FIXME: This test isn't sensitive to projects that use git tags
20 |     # to determine versions (e.g. versioneer) when installed
21 | # directly from GitHub as the latest `main` branch can sometimes
22 | # be pointing to a released version of the project.
23 | packages = [
24 | "bokeh",
25 | # "dask",
26 | # "distributed",
27 | # "fastparquet",
28 | # "fsspec",
29 | "numpy",
30 | "pandas",
31 | # "partd",
32 | "pyarrow",
33 | # "s3fs",
34 | "scipy",
35 | # "sparse",
36 | # "zarr",
37 | # "zict",
38 | ]
39 | for package in packages:
40 | v = Version(importlib_metadata.version(package))
41 | assert v.is_prerelease or v.local is not None, (package, str(v))
42 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Dask
2 | ====
3 |
4 | |Build Status| |Coverage| |Doc Status| |Discourse| |Version Status| |NumFOCUS|
5 |
6 | Dask is a flexible parallel computing library for analytics. See
7 | documentation_ for more information.
8 |
9 |
10 | LICENSE
11 | -------
12 |
13 | New BSD. See `License File <https://github.com/dask/dask/blob/main/LICENSE.txt>`__.
14 |
15 | .. _documentation: https://dask.org
16 | .. |Build Status| image:: https://github.com/dask/dask/actions/workflows/tests.yml/badge.svg
17 | :target: https://github.com/dask/dask/actions/workflows/tests.yml
18 | .. |Coverage| image:: https://codecov.io/gh/dask/dask/branch/main/graph/badge.svg
19 | :target: https://codecov.io/gh/dask/dask/branch/main
20 | :alt: Coverage status
21 | .. |Doc Status| image:: https://readthedocs.org/projects/dask/badge/?version=latest
22 | :target: https://dask.org
23 | :alt: Documentation Status
24 | .. |Discourse| image:: https://img.shields.io/discourse/users?logo=discourse&server=https%3A%2F%2Fdask.discourse.group
25 | :alt: Discuss Dask-related things and ask for help
26 | :target: https://dask.discourse.group
27 | .. |Version Status| image:: https://img.shields.io/pypi/v/dask.svg
28 | :target: https://pypi.python.org/pypi/dask/
29 | .. |NumFOCUS| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A
30 | :target: https://www.numfocus.org/
31 |
--------------------------------------------------------------------------------
/dask/tests/test_context.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | import dask
6 | from dask.context import globalmethod
7 |
8 |
9 | def test_with_get():
10 | da = pytest.importorskip("dask.array")
11 | var = [0]
12 |
13 | def myget(dsk, keys, **kwargs):
14 | var[0] = var[0] + 1
15 | return dask.get(dsk, keys, **kwargs)
16 |
17 | x = da.ones(10, chunks=(5,))
18 |
19 | assert x.sum().compute() == 10
20 | assert var[0] == 0
21 |
22 | with dask.config.set(scheduler=myget):
23 | assert x.sum().compute() == 10
24 | assert var[0] == 1
25 |
26 | # Make sure we've cleaned up
27 | assert x.sum().compute() == 10
28 | assert var[0] == 1
29 |
30 |
31 | def foo():
32 | return "foo"
33 |
34 |
35 | def bar():
36 | return "bar"
37 |
38 |
39 | class Foo:
40 | @globalmethod(key="f")
41 | def f(): # type: ignore
42 | return 1
43 |
44 | g = globalmethod(foo, key="g", falsey=bar)
45 |
46 |
47 | def test_globalmethod():
48 | x = Foo()
49 |
50 | assert x.f() == 1
51 |
52 | with dask.config.set(f=lambda: 2):
53 | assert x.f() == 2
54 |
55 | with dask.config.set(f=foo):
56 | assert x.f is foo
57 | assert x.f() == "foo"
58 |
59 | assert x.g is foo
60 | assert x.g() == "foo"
61 |
62 | with dask.config.set(g=False):
63 | assert x.g is bar
64 | assert x.g() == "bar"
65 |
--------------------------------------------------------------------------------
/dask/widgets/templates/array.html.j2:
--------------------------------------------------------------------------------
1 | <table>
2 |     <tr>
3 |         <td>
4 |             <table>
5 |                 <thead>
6 |                     <tr>
7 |                         <td> </td>
8 |                         <th> Array </th>
9 |                         <th> Chunk </th>
10 |                     </tr>
11 |                 </thead>
12 |                 <tbody>
13 |                     {% if nbytes %}
14 |                     <tr>
15 |                         <th> Bytes </th>
16 |                         <td> {{ nbytes }} </td>
17 |                         <td> {{ cbytes }} </td>
18 |                     </tr>
19 |                     {% endif %}
20 |                     <tr>
21 |                         <th> Shape </th>
22 |                         <td> {{ array.shape }} </td>
23 |                         <td> {{ array.chunksize }} </td>
24 |                     </tr>
25 |                     <tr>
26 |                         <th> Dask graph </th>
27 |                         <td colspan="2"> {{ array.npartitions }} chunks in {{ layers }} </td>
28 |                     </tr>
29 |                     <tr>
30 |                         <th> Data type </th>
31 |                         <td colspan="2"> {{ array.dtype }} {{ array._meta | type | typename }} </td>
32 |                     </tr>
33 |                 </tbody>
34 |             </table>
35 |         </td>
36 |         <td>
37 |             {{grid}}
38 |         </td>
39 |     </tr>
40 | </table>
41 |
--------------------------------------------------------------------------------
/continuous_integration/scripts/test_imports.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -o errexit
3 |
4 |
5 | test_import () {
6 | echo "Create environment: python=$PYTHON_VERSION $1"
7 | # Create an empty environment
8 | mamba create -q -y -n test-imports -c conda-forge python=$PYTHON_VERSION packaging pyyaml fsspec toolz partd click cloudpickle importlib-metadata $1
9 | conda activate test-imports
10 | if [[ $1 =~ "distributed" ]]; then
11 | # dask[distributed] depends on the latest version of distributed
12 | python -m pip install git+https://github.com/dask/distributed
13 | fi
14 | python -m pip install -e .
15 | mamba list
16 | echo "python -c '$2'"
17 | python -c "$2"
18 | # Ensure that no non-deterministic objects are tokenized at init time,
19 | # which can prevent the library from being imported at all.
20 | echo "python -c '$2' (ensure deterministic)"
21 | DASK_TOKENIZE__ENSURE_DETERMINISTIC=True python -c "$2"
22 | conda deactivate
23 | mamba env remove -n test-imports
24 | }
25 |
26 | test_import "" "import dask, dask.base, dask.multiprocessing, dask.threaded, dask.optimization, dask.bag, dask.delayed, dask.graph_manipulation, dask.layers"
27 | test_import "numpy" "import dask.array"
28 | test_import "pandas" "import dask.dataframe"
29 | test_import "bokeh" "import dask.diagnostics"
30 | test_import "distributed" "import dask.distributed"
31 |
--------------------------------------------------------------------------------
/continuous_integration/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set major_minor_patch = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').split('.') %}
2 | {% set new_patch = major_minor_patch[2] | int + 1 %}
3 | {% set version = (major_minor_patch[:2] + [new_patch]) | join('.') + environ.get('VERSION_SUFFIX', '') %}
4 |
5 |
6 | package:
7 | name: dask-core
8 | version: {{ version }}
9 |
10 | source:
11 | git_url: ../..
12 |
13 | build:
14 | number: {{ GIT_DESCRIBE_NUMBER }}
15 | noarch: python
16 | string: py_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
17 | script: {{ PYTHON }} -m pip install . -vv
18 | entry_points:
19 | - dask = dask.__main__:main
20 |
21 | requirements:
22 | host:
23 | - python >=3.9
24 | - pip
25 | - versioneer =0.28
26 | - tomli # [py<311]
27 |
28 | run:
29 | - python >=3.9
30 | - click >=8.0
31 | - cloudpickle >=1.5.0
32 | - fsspec >=2021.09.0
33 | - packaging >=20.0
34 | - partd >=1.2.0
35 | - pyyaml >=5.3.1
36 | - toolz >=0.10.0
37 | - importlib_metadata >=4.13.0
38 |
39 | test:
40 | imports:
41 | - dask
42 | commands:
43 | - pip check
44 | - dask docs --help
45 | - dask info --help
46 | - dask info versions --help
47 | requires:
48 | - pip
49 |
50 | about:
51 | home: https://github.com/dask/dask/
52 | license: BSD-3-Clause
53 | license_file:
54 | - LICENSE.txt
55 | - dask/array/NUMPY_LICENSE.txt
56 | summary: Parallel Python with task scheduling
57 | doc_url: https://dask.org/
58 | dev_url: https://github.com/dask/dask
59 |
--------------------------------------------------------------------------------
/dask/array/tests/test_xarray.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | import dask.array as da
6 | from dask.array.utils import assert_eq
7 |
8 | xr = pytest.importorskip("xarray")
9 |
10 |
11 | def test_mean():
12 | y = da.mean(xr.DataArray([1, 2, 3.0]))
13 | assert isinstance(y, da.Array)
14 | assert_eq(y, y)
15 |
16 |
17 | def test_asarray():
18 | y = da.asarray(xr.DataArray([1, 2, 3.0]))
19 | assert isinstance(y, da.Array)
20 | assert_eq(y, y)
21 |
22 |
23 | def test_asanyarray():
24 | y = da.asanyarray(xr.DataArray([1, 2, 3.0]))
25 | assert isinstance(y, da.Array)
26 | assert_eq(y, y)
27 |
28 |
29 | def test_asarray_xarray_intersphinx_workaround():
30 | # test that the intersphinx workaround in https://github.com/pydata/xarray/issues/4279 works
31 | module = xr.DataArray.__module__
32 | try:
33 | xr.DataArray.__module__ = "xarray"
34 | y = da.asarray(xr.DataArray([1, 2, 3.0]))
35 | assert isinstance(y, da.Array)
36 | assert type(y._meta).__name__ == "ndarray"
37 | assert_eq(y, y)
38 | finally:
39 | xr.DataArray.__module__ = module
40 |
41 |
42 | def test_fft():
43 | # Regression test for https://github.com/dask/dask/issues/9679
44 | coord = da.arange(8, chunks=-1)
45 | data = da.random.random((8, 8), chunks=-1) + 1
46 | x = xr.DataArray(data, coords={"x": coord, "y": coord}, dims=["x", "y"])
47 | result = da.fft.fft(x)
48 | expected = da.fft.fft(x.data)
49 | assert_eq(result, expected)
50 |
--------------------------------------------------------------------------------
/dask/widgets/tests/test_widgets.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os.path
4 |
5 | import pytest
6 |
7 | jinja2 = pytest.importorskip("jinja2")
8 |
9 | from dask.utils import format_bytes
10 | from dask.widgets import FILTERS, TEMPLATE_PATHS, get_environment, get_template
11 |
12 |
13 | @pytest.fixture(autouse=True)
14 | def setup_testing():
15 | TEMPLATE_PATHS.append(
16 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates")
17 | )
18 | FILTERS["custom_filter"] = lambda x: "baz"
19 |
20 |
21 | def test_widgets():
22 | template = get_template("example.html.j2")
23 | assert isinstance(template, jinja2.Template)
24 | rendered = template.render(foo="bar")
25 | assert "Hello bar" in rendered
26 |
27 |
28 | def test_environment():
29 | environment = get_environment()
30 | assert isinstance(environment, jinja2.Environment)
31 |
32 |
33 | def test_unknown_template():
34 | with pytest.raises(jinja2.TemplateNotFound) as e:
35 | get_template("does_not_exist.html.j2")
36 |
37 | # The error should contain all the registered template directories to help the user
38 | # understand where jinja2 is looking. Including the one we registered in the fixture.
39 | assert os.path.dirname(os.path.abspath(__file__)) in str(e)
40 |
41 |
42 | def test_filters():
43 | template = get_template("bytes.html.j2")
44 | assert format_bytes in FILTERS.values()
45 | assert format_bytes(2e9) in template.render(foo=2e9)
46 |
47 | template = get_template("custom_filter.html.j2")
48 | assert "baz" in template.render(foo=None)
49 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2014, Anaconda, Inc. and contributors
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | * Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | * Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/continuous_integration/environment-mindeps-optional.yaml:
--------------------------------------------------------------------------------
1 | name: test-environment
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | # required dependencies
6 | - packaging=20.0
7 | - python=3.9
8 | - pyyaml=5.3.1
9 | - click=8.0
10 | - cloudpickle=1.5.0
11 | - partd=1.2.0
12 | - fsspec=2021.09.0
13 | - importlib-metadata=4.13.0
14 | - toolz=0.10.0
15 | # optional dependencies pulled in by pip install dask[array,dataframe]
16 | - numpy=1.21
17 | - pandas=1.3
18 | # optional dependencies pulled in by pip install dask[diagnostics]
19 | - bokeh=2.4.2
20 | - jinja2=2.10.3
21 | # optional dependencies pulled in by pip install dask[complete]
22 | - pyarrow=7.0
23 | - lz4=4.3.2
24 | # optional dependencies used by dask
25 | - cachey=0.1.1
26 | - crick=0.0.3
27 | - cytoolz=0.11.0
28 | - dask-ml=1.4.0
29 | - fastavro=1.1.0
30 | - fastparquet=0.8.2
31 | - h5py=2.10.0
32 | - ipycytoscape=1.0.1
33 | - IPython=7.16.1
34 | - matplotlib=3.4.1
35 | - mimesis=5.3.0
36 | - mmh3=2.5.1
37 | - psutil=5.7.2
38 | - python-cityhash=0.4.6
39 | - python-graphviz=0.8.4
40 | - python-snappy=0.5.4
41 | - python-xxhash=2.0.0
42 | - s3fs=2021.9.0
43 | - scikit-image=0.17.2
44 | - scipy=1.5.2
45 | - sparse=0.12.0
46 | - sqlalchemy=1.4.16
47 | - tblib=1.6.0
48 | - tiledb-py=0.8.1
49 | - zarr=2.12.0
50 | - pip
51 | - pip:
52 | # optional dependencies pulled in by pip install dask[distributed]
53 | - git+https://github.com/dask/distributed
54 | # test dependencies
55 | - pre-commit
56 | - pytest
57 | - pytest-cov
58 | - pytest-rerunfailures
59 | - pytest-xdist
60 |
--------------------------------------------------------------------------------
/dask/tests/test_system.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import builtins
4 | import io
5 | import os
6 | import sys
7 |
8 | import pytest
9 |
10 | from dask.system import cpu_count
11 |
12 | psutil = pytest.importorskip("psutil")
13 |
14 |
15 | def test_cpu_count():
16 | count = cpu_count()
17 | assert isinstance(count, int)
18 | assert count <= os.cpu_count()
19 | assert count >= 1
20 |
21 |
22 | @pytest.mark.parametrize("dirname", ["cpuacct,cpu", "cpu,cpuacct", None])
23 | def test_cpu_count_cgroups(dirname, monkeypatch):
24 | def mycpu_count():
25 | # Absurdly high, unlikely to match real value
26 | return 250
27 |
28 | monkeypatch.setattr(os, "cpu_count", mycpu_count)
29 |
30 | class MyProcess:
31 | def cpu_affinity(self):
32 | # No affinity set
33 | return []
34 |
35 | monkeypatch.setattr(psutil, "Process", MyProcess)
36 |
37 | if dirname:
38 | paths = {
39 | "/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname: io.StringIO("2005"),
40 | "/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname: io.StringIO("10"),
41 | }
42 | builtin_open = builtins.open
43 |
44 | def myopen(path, *args, **kwargs):
45 | if path in paths:
46 | return paths.get(path)
47 | return builtin_open(path, *args, **kwargs)
48 |
49 | monkeypatch.setattr(builtins, "open", myopen)
50 | monkeypatch.setattr(sys, "platform", "linux")
51 |
52 | count = cpu_count()
53 | if dirname:
54 | # Rounds up
55 | assert count == 201
56 | else:
57 | assert count == 250
58 |
--------------------------------------------------------------------------------
/dask/array/NUMPY_LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2005-2015, NumPy Developers.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are
6 | met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above
12 | copyright notice, this list of conditions and the following
13 | disclaimer in the documentation and/or other materials provided
14 | with the distribution.
15 |
16 | * Neither the name of the NumPy Developers nor the names of any
17 | contributors may be used to endorse or promote products derived
18 | from this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/docs/source/delayed-collections.rst:
--------------------------------------------------------------------------------
1 | Working with Collections
2 | ========================
3 |
4 | Often we want to do a bit of custom work with ``dask.delayed`` (for example,
5 | for complex data ingest), then leverage the algorithms in ``dask.array`` or
6 | ``dask.dataframe``, and then switch back to custom work. To this end, all
7 | collections support ``from_delayed`` functions and ``to_delayed``
8 | methods.
9 |
10 | As an example, consider the case where we store tabular data in a custom format
11 | not known by Dask DataFrame. This format is naturally broken apart into
12 | pieces and we have a function that reads one piece into a Pandas DataFrame.
13 | We use ``dask.delayed`` to lazily read these files into Pandas DataFrames,
14 | use ``dd.from_delayed`` to wrap these pieces up into a single
15 | Dask DataFrame, use the complex algorithms within the DataFrame
16 | (groupby, join, etc.), and then switch back to ``dask.delayed`` to save our results
17 | back to the custom format:
18 |
19 | .. code-block:: python
20 |
21 | import dask.dataframe as dd
22 | from dask.delayed import delayed
23 |
24 | from my_custom_library import load, save
25 |
26 | filenames = ...
27 | dfs = [delayed(load)(fn) for fn in filenames]
28 |
29 | df = dd.from_delayed(dfs)
30 | df = ... # do work with dask.dataframe
31 |
32 | dfs = df.to_delayed()
33 | writes = [delayed(save)(df, fn) for df, fn in zip(dfs, filenames)]
34 |
35 | dd.compute(*writes)
36 |
37 | Data science is often complex, and ``dask.delayed`` provides a release valve for
38 | users to manage this complexity on their own, and solve the last mile problem
39 | for custom formats and complex situations.
40 |
--------------------------------------------------------------------------------
/docs/source/images/dask_icon_black.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_extensions.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from decimal import Decimal
4 |
5 | import pytest
6 |
7 | import dask.dataframe as dd
8 | from dask.dataframe.utils import assert_eq
9 |
10 | pd = pytest.importorskip("pandas")
11 |
12 | from pandas.tests.extension.decimal.array import DecimalArray, DecimalDtype
13 |
14 | from dask.dataframe.extensions import make_array_nonempty, make_scalar
15 |
16 |
17 | @make_array_nonempty.register(DecimalDtype)
18 | def _(dtype):
19 | return DecimalArray._from_sequence([Decimal("0"), Decimal("NaN")], dtype=dtype)
20 |
21 |
22 | @make_scalar.register(Decimal)
23 | def _(x):
24 | return Decimal("1")
25 |
26 |
27 | def test_register_extension_type():
28 | arr = DecimalArray._from_sequence([Decimal("1.0")] * 10)
29 | ser = pd.Series(arr)
30 | dser = dd.from_pandas(ser, 2)
31 | assert_eq(ser, dser)
32 |
33 | df = pd.DataFrame({"A": ser})
34 | ddf = dd.from_pandas(df, 2)
35 | assert_eq(df, ddf)
36 |
37 |
38 | def test_reduction():
39 | ser = pd.Series(DecimalArray._from_sequence([Decimal("0"), Decimal("1")]))
40 | dser = dd.from_pandas(ser, 2)
41 | assert_eq(ser.mean(skipna=False), dser.mean(skipna=False))
42 |
43 | # It's unclear whether this can be reliably provided, at least with the current
44 |     # implementation, which uses pandas.DataFrame.sum(), returning a (homogeneous)
45 | # series which has potentially cast values.
46 |
47 | # assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False))
48 |
49 |
50 | def test_scalar():
51 | result = dd.utils.make_meta(Decimal("1.0"), parent_meta=pd.DataFrame())
52 | assert result == Decimal("1.0")
53 |
--------------------------------------------------------------------------------
/docs/source/images/dask_icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/source/images/dask_icon_on_pink.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/docs/source/images/dask_icon_white.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dask/array/tests/test_image.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from contextlib import contextmanager
5 |
6 | import pytest
7 |
8 | pytest.importorskip("skimage")
9 | import numpy as np
10 | from skimage.io import imsave
11 |
12 | from dask.array.image import imread as da_imread
13 | from dask.utils import tmpdir
14 |
15 |
16 | @contextmanager
17 | def random_images(n, shape):
18 | with tmpdir() as dirname:
19 | for i in range(n):
20 | fn = os.path.join(dirname, "image.%d.png" % i)
21 | x = np.random.randint(0, 255, size=shape).astype("u1")
22 | imsave(fn, x, check_contrast=False)
23 |
24 | yield os.path.join(dirname, "*.png")
25 |
26 |
27 | def test_imread():
28 | with random_images(4, (5, 6, 3)) as globstring:
29 | im = da_imread(globstring)
30 | assert im.shape == (4, 5, 6, 3)
31 | assert im.chunks == ((1, 1, 1, 1), (5,), (6,), (3,))
32 | assert im.dtype == "uint8"
33 |
34 | assert im.compute().shape == (4, 5, 6, 3)
35 | assert im.compute().dtype == "uint8"
36 |
37 |
38 | def test_imread_with_custom_function():
39 | def imread2(fn):
40 | return np.ones((2, 3, 4), dtype="i1")
41 |
42 | with random_images(4, (5, 6, 3)) as globstring:
43 | im = da_imread(globstring, imread=imread2)
44 | assert (im.compute() == np.ones((4, 2, 3, 4), dtype="u1")).all()
45 |
46 |
47 | def test_preprocess():
48 | def preprocess(x):
49 | x[:] = 1
50 | return x[:, :, 0]
51 |
52 | with random_images(4, (2, 3, 4)) as globstring:
53 | im = da_imread(globstring, preprocess=preprocess)
54 | assert (im.compute() == np.ones((4, 2, 3), dtype="u1")).all()
55 |
--------------------------------------------------------------------------------
/continuous_integration/environment-3.9.yaml:
--------------------------------------------------------------------------------
1 | # This job includes coverage
2 | name: test-environment
3 | channels:
4 | - conda-forge
5 | - nodefaults
6 | dependencies:
7 | # required dependencies
8 | - python=3.9
9 | - packaging
10 | - pyyaml
11 | - click
12 | - cloudpickle
13 | - partd
14 | - fsspec
15 | - importlib_metadata
16 | - toolz
17 | # test dependencies
18 | - pre-commit
19 | - pytest
20 | - pytest-cov
21 | - pytest-rerunfailures
22 | - pytest-timeout
23 | - pytest-xdist
24 | - moto
25 | # Optional dependencies
26 | - mimesis
27 | - numpy=1.22
28 | - pandas=1.4
29 | - flask
30 | - fastparquet
31 | - h5py
32 | - pytables
33 | - zarr
34 |   # `tiledb-py=0.17.5` led to strange seg faults in CI.
35 | # We should unpin when possible.
36 | # https://github.com/dask/dask/pull/9569
37 | - tiledb-py<0.17.4
38 | - pyspark
39 | - tiledb>=2.5.0
40 | - xarray
41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.4` doesn't support `sqlalchemy=2`
42 | - pyarrow=9
43 | - coverage
44 | - jsonschema
45 | # other -- IO
46 | - boto3
47 | - botocore
48 | - bokeh
49 | - httpretty
50 | - aiohttp
51 | - s3fs
52 | - crick
53 | - cytoolz
54 | - distributed
55 | - ipython
56 | - ipycytoscape
57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed
58 | - ipywidgets<8.0.5
59 | - ipykernel<6.22.0
60 | - lz4
61 | - numba
62 | - psutil
63 | - requests
64 | - scikit-image<0.20
65 | - scikit-learn
66 | - scipy
67 | - python-snappy
68 | - sparse
69 | - cachey
70 | - python-graphviz
71 | - python-xxhash
72 | - python-cityhash
73 | - mmh3
74 | - jinja2
75 | - pip
76 | - pip:
77 | - git+https://github.com/dask/distributed
78 |
--------------------------------------------------------------------------------
/continuous_integration/environment-3.10.yaml:
--------------------------------------------------------------------------------
1 | # This job includes coverage
2 | name: test-environment
3 | channels:
4 | - conda-forge
5 | - nodefaults
6 | dependencies:
7 | # required dependencies
8 | - python=3.10
9 | - packaging
10 | - pyyaml
11 | - click
12 | - cloudpickle
13 | - partd
14 | - fsspec
15 | - importlib_metadata
16 | - toolz
17 | # test dependencies
18 | - pre-commit
19 | - pytest
20 | - pytest-cov
21 | - pytest-rerunfailures
22 | - pytest-timeout
23 | - pytest-xdist
24 | - moto
25 | # Optional dependencies
26 | - mimesis
27 | - numpy=1.23
28 | - pandas=1.5
29 | - flask
30 | - fastparquet>=0.8.0
31 | - h5py
32 | - pytables
33 | - zarr
34 |   # `tiledb-py=0.17.5` led to strange seg faults in CI.
35 | # We should unpin when possible.
36 | # https://github.com/dask/dask/pull/9569
37 | - tiledb-py<0.17.4
38 | - pyspark
39 | - tiledb>=2.5.0
40 | - xarray
41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.5` doesn't support `sqlalchemy=2`
42 | - pyarrow=10
43 | - coverage
44 | - jsonschema
45 | # other -- IO
46 | - boto3
47 | - botocore
48 | - bokeh
49 | - httpretty
50 | - aiohttp
51 | - s3fs
52 | - crick
53 | - cytoolz
54 | - distributed
55 | - ipython
56 | - ipycytoscape
57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed
58 | - ipywidgets<8.0.5
59 | - ipykernel<6.22.0
60 | - lz4
61 | - numba
62 | - psutil
63 | - requests
64 | - scikit-image
65 | - scikit-learn
66 | - scipy
67 | - python-snappy
68 | - sparse
69 | - cachey
70 | - python-graphviz
71 | - python-xxhash
72 | - python-cityhash
73 | - mmh3
74 | - jinja2
75 | - pip
76 | - pip:
77 | - git+https://github.com/dask/distributed
78 |
--------------------------------------------------------------------------------
/dask/system.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import math
4 | import os
5 | import sys
6 |
7 | try:
8 | import psutil
9 | except ImportError:
10 | psutil = None # type: ignore
11 |
12 | __all__ = ("cpu_count", "CPU_COUNT")
13 |
14 |
15 | def cpu_count():
16 | """Get the available CPU count for this system.
17 |
18 | Takes the minimum value from the following locations:
19 |
20 | - Total system cpus available on the host.
21 | - CPU Affinity (if set)
22 | - Cgroups limit (if set)
23 | """
24 | count = os.cpu_count()
25 |
26 | # Check CPU affinity if available
27 | if psutil is not None:
28 | try:
29 | affinity_count = len(psutil.Process().cpu_affinity())
30 | if affinity_count > 0:
31 | count = min(count, affinity_count)
32 | except Exception:
33 | pass
34 |
35 | # Check cgroups if available
36 | if sys.platform == "linux":
37 | # The directory name isn't standardized across linux distros, check both
38 | for dirname in ["cpuacct,cpu", "cpu,cpuacct"]:
39 | try:
40 | with open("/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname) as f:
41 | quota = int(f.read())
42 | with open("/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname) as f:
43 | period = int(f.read())
44 | # We round up on fractional CPUs
45 | cgroups_count = math.ceil(quota / period)
46 | if cgroups_count > 0:
47 | count = min(count, cgroups_count)
48 | break
49 | except Exception:
50 | pass
51 |
52 | return count
53 |
54 |
55 | CPU_COUNT = cpu_count()
56 |
--------------------------------------------------------------------------------
/dask/tests/test_utils_test.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | import pytest
6 |
7 | from dask import utils_test
8 | from dask.highlevelgraph import HighLevelGraph
9 | from dask.utils_test import _check_warning
10 |
11 |
12 | def test_hlg_layer():
13 | a = {"x": 1}
14 | b = {"y": (utils_test.inc, "x")}
15 | layers = {"a-layer": a, "bee-layer": b}
16 | dependencies = {"a-layer": set(), "bee-layer": {"a-layer"}}
17 | hg = HighLevelGraph(layers, dependencies)
18 |
19 | assert utils_test.hlg_layer(hg, "a") is hg.layers["a-layer"]
20 | assert utils_test.hlg_layer(hg, "b") is hg.layers["bee-layer"]
21 | with pytest.raises(KeyError, match="No layer starts with"):
22 | utils_test.hlg_layer(hg, "foo")
23 |
24 |
25 | def test_hlg_layer_topological():
26 | a = {"x": 1}
27 | b = {"y": (utils_test.inc, "x")}
28 | c = {"z": (utils_test.inc, "x")}
29 | d = {"r": (sum, ["y", "z"])}
30 | layers = {"a": a, "b": b, "c": c, "d": d}
31 | dependencies = {"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}}
32 | hg = HighLevelGraph(layers, dependencies)
33 |
34 | assert utils_test.hlg_layer_topological(hg, -1) is hg.layers["d"]
35 | assert utils_test.hlg_layer_topological(hg, 0) is hg.layers["a"]
36 | assert utils_test.hlg_layer_topological(hg, 1) in (hg.layers["b"], hg.layers["c"])
37 |
38 |
39 | def test__check_warning():
40 | class MyWarning(Warning):
41 | pass
42 |
43 | with warnings.catch_warnings():
44 | warnings.simplefilter("error")
45 | with _check_warning(True, MyWarning, "foo"):
46 | warnings.warn("foo", MyWarning)
47 |
48 | with pytest.warns(MyWarning, match="foo"):
49 | with _check_warning(False, MyWarning, "foo"):
50 | warnings.warn("foo", MyWarning)
51 |
--------------------------------------------------------------------------------
/continuous_integration/environment-3.11.yaml:
--------------------------------------------------------------------------------
1 | # This job includes coverage
2 | name: test-environment
3 | channels:
4 | - conda-forge
5 | - nodefaults
6 | dependencies:
7 | # required dependencies
8 | - python=3.11
9 | - packaging
10 | - pyyaml
11 | - click
12 | - cloudpickle
13 | - partd
14 | - fsspec
15 | - importlib_metadata
16 | - toolz
17 | # test dependencies
18 | - pre-commit
19 | - pytest
20 | - pytest-cov
21 | - pytest-rerunfailures
22 | - pytest-timeout
23 | - pytest-xdist
24 | - moto
25 | # Optional dependencies
26 | - mimesis
27 | - numpy
28 | - pandas
29 | - flask
30 | - fastparquet>=0.8.0
31 | - h5py
32 | - pytables
33 | - zarr
34 |   # `tiledb-py=0.17.5` led to strange seg faults in CI; however, 0.18 is needed for Python 3.11
35 | # https://github.com/dask/dask/pull/9569
36 | # - tiledb-py # crashes on Python 3.11
37 | # - pyspark
38 | # - tiledb>=2.5.0 # crashes on Python 3.11
39 | - xarray
40 | - sqlalchemy>=1.4.16
41 | - pyarrow>=11
42 | - coverage
43 | - jsonschema
44 | # # other -- IO
45 | - boto3
46 | - botocore
47 | - bokeh
48 | - httpretty
49 | - aiohttp
50 | - s3fs
51 | # Need a new `crick` release with support for `numpy=1.24+`
52 | # https://github.com/dask/crick/issues/25
53 | # - crick
54 | - cytoolz
55 | - distributed
56 | - ipython
57 | - ipycytoscape
58 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed
59 | - ipywidgets<8.0.5
60 | - ipykernel<6.22.0
61 | - lz4
62 | - numba
63 | - psutil
64 | - requests
65 | - scikit-image
66 | - scikit-learn
67 | - scipy
68 | - python-snappy
69 | - sparse
70 | - cachey
71 | - python-graphviz
72 | - python-cityhash
73 | - python-xxhash
74 | - mmh3
75 | - jinja2
76 | - pip
77 | - pip:
78 | - git+https://github.com/dask/distributed
79 |
--------------------------------------------------------------------------------
/dask/dataframe/numeric.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pandas as pd
4 | from pandas.api.types import is_scalar as pd_is_scalar
5 |
6 | from dask.array import Array
7 | from dask.dataframe.core import Series
8 | from dask.delayed import delayed
9 | from dask.utils import derived_from
10 |
11 | __all__ = ("to_numeric",)
12 |
13 |
14 | @derived_from(pd, ua_args=["downcast"])
15 | def to_numeric(arg, errors="raise", meta=None):
16 | """
17 | Return type depends on input. Delayed if scalar, otherwise same as input.
18 | For errors, only "raise" and "coerce" are allowed.
19 | """
20 | if errors not in ("raise", "coerce"):
21 | raise ValueError("invalid error value specified")
22 |
23 | is_series = isinstance(arg, Series)
24 | is_array = isinstance(arg, Array)
25 | is_scalar = pd_is_scalar(arg)
26 |
27 | if not any([is_series, is_array, is_scalar]):
28 | raise TypeError(
29 | "arg must be a list, tuple, dask.array.Array, or dask.dataframe.Series"
30 | )
31 |
32 | if meta is not None:
33 | if is_scalar:
34 | raise KeyError("``meta`` is not allowed when input is a scalar.")
35 | else:
36 | if is_series or is_array:
37 | meta = pd.to_numeric(arg._meta)
38 |
39 | if is_series:
40 | return arg.map_partitions(
41 | pd.to_numeric,
42 | token=arg._name + "-to_numeric",
43 | meta=meta,
44 | enforce_metadata=False,
45 | errors=errors,
46 | )
47 | if is_array:
48 | return arg.map_blocks(
49 | pd.to_numeric,
50 | name=arg._name + "-to_numeric",
51 | meta=meta,
52 | errors=errors,
53 | )
54 | if is_scalar:
55 | return delayed(pd.to_numeric, pure=True)(arg, errors=errors)
56 |
--------------------------------------------------------------------------------
/docs/source/deploying-ssh.rst:
--------------------------------------------------------------------------------
1 | SSH
2 | ===
3 |
4 | It is easy to set up Dask on informally managed networks of machines using SSH.
5 | This can be done manually using SSH and the
6 | Dask command line interface,
7 | or automatically using either the :class:`dask.distributed.SSHCluster` Python *cluster manager* or the
8 | ``dask-ssh`` command line tool. This document describes both of these options.
9 |
10 | .. note::
11 |    Before instantiating an ``SSHCluster`` it is recommended to configure keyless SSH
12 | for your local machine and other machines. For example, on a Mac to SSH into
13 | localhost (local machine) you need to ensure the Remote Login option is set in
14 | System Preferences -> Sharing. In addition, ``id_rsa.pub`` should be in
15 | ``authorized_keys`` for keyless login.
16 |
17 | Python Interface
18 | ----------------
19 |
20 | .. currentmodule:: dask.distributed
21 |
22 | .. autofunction:: SSHCluster
23 |
24 | Command Line
25 | ------------
26 |
27 | The convenience script ``dask-ssh`` opens several SSH connections to your
28 | target computers and initializes the network accordingly. You can
29 | give it a list of hostnames or IP addresses::
30 |
31 | $ dask-ssh 192.168.0.1 192.168.0.2 192.168.0.3 192.168.0.4
32 |
33 | Or you can use normal UNIX grouping::
34 |
35 | $ dask-ssh 192.168.0.{1,2,3,4}
36 |
37 | Or you can specify a hostfile that includes a list of hosts::
38 |
39 | $ cat hostfile.txt
40 | 192.168.0.1
41 | 192.168.0.2
42 | 192.168.0.3
43 | 192.168.0.4
44 |
45 | $ dask-ssh --hostfile hostfile.txt
46 |
47 | .. note::
48 |
49 | The command line documentation here may differ depending on your installed
50 | version. We recommend referring to the output of ``dask-ssh --help``.
51 |
52 | .. click:: distributed.cli.dask_ssh:main
53 | :prog: dask-ssh
54 | :show-nested:
55 |
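56 | As a point of reference, a minimal Python session using :class:`SSHCluster` might look
57 | like the following sketch. The hostnames are placeholders; the first host runs the
58 | scheduler and the remaining hosts run workers, and keyless SSH access to each machine
59 | is assumed:
60 |
61 | .. code-block:: python
62 |
63 |    from dask.distributed import Client, SSHCluster
64 |
65 |    # The first host becomes the scheduler, the rest become workers
66 |    cluster = SSHCluster(
67 |        ["localhost", "192.168.0.1", "192.168.0.2"],
68 |        worker_options={"nthreads": 2},
69 |    )
70 |    client = Client(cluster)
71 |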
--------------------------------------------------------------------------------
/docs/source/how-to/extend-sizeof.rst:
--------------------------------------------------------------------------------
1 | Extend `sizeof`
2 | ===============
3 |
4 | When Dask needs to compute the size of an object in bytes, e.g. to determine which objects to spill to disk, it uses the ``dask.sizeof.sizeof`` registration mechanism. Users who need to define a ``sizeof`` implementation for their own objects can use ``sizeof.register``:
5 |
6 | .. code-block:: python
7 |
8 |    >>> import numpy as np
9 |    >>> from dask.sizeof import sizeof
10 |    >>> @sizeof.register(np.ndarray)
11 |    ... def sizeof_numpy_like(array):
12 |    ...     return array.nbytes
13 |
14 | This code can be placed in one of the library's modules (e.g. ``__init__.py``) so that it runs on import and registers the implementation with Dask. However, this introduces a maintenance burden on the developers of these libraries, and the registration must be imported manually on all workers if these libraries do not accept the patch.
15 |
16 | Therefore, Dask also exposes an entry point under the group ``dask.sizeof`` to enable third-party libraries to develop and maintain these ``sizeof`` implementations.
17 |
18 | For a fictitious library ``numpy_sizeof_dask.py``, the necessary ``setup.cfg`` configuration would be as follows:
19 |
20 | .. code-block:: ini
21 |
22 |    [options.entry_points]
23 |    dask.sizeof =
24 |        numpy = numpy_sizeof_dask:sizeof_plugin
25 |
26 | whilst ``numpy_sizeof_dask.py`` would contain
27 |
28 | .. code-block:: python
29 |
30 |    >>> import numpy as np
31 |    >>> def sizeof_plugin(sizeof):
32 |    ...     @sizeof.register(np.ndarray)
33 |    ...     def sizeof_numpy_like(array):
34 |    ...         return array.nbytes
35 |
36 | Upon the first import of ``dask.sizeof``, Dask calls the entrypoint (``sizeof_plugin``) with the ``dask.sizeof.sizeof`` object, which can then be used to register a sizeof implementation.
37 |
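38 | Once an implementation is registered, whether directly or via an entrypoint,
39 | ``dask.sizeof.sizeof`` dispatches to it for matching objects. As a small sketch of the
40 | dispatch (note that Dask already ships its own NumPy implementation, so this merely
41 | illustrates the call):
42 |
43 | .. code-block:: python
44 |
45 |    >>> import numpy as np
46 |    >>> from dask.sizeof import sizeof
47 |    >>> nbytes = sizeof(np.zeros(1000))  # dispatches to the registered ndarray sizeof
48 |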
--------------------------------------------------------------------------------
/continuous_integration/gpuci/build.sh:
--------------------------------------------------------------------------------
1 | ##############################################
2 | # Dask GPU build and test script for CI #
3 | ##############################################
4 | set -e
5 | NUMARGS=$#
6 | ARGS=$*
7 |
8 | # Arg parsing function
9 | function hasArg {
10 | (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
11 | }
12 |
13 | # Set path and build parallel level
14 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
15 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
16 |
17 | # Set home to the job's workspace
18 | export HOME="$WORKSPACE"
19 |
20 | # Switch to project root; also root of repo checkout
21 | cd "$WORKSPACE"
22 |
23 | # Determine CUDA release version
24 | export CUDA_REL=${CUDA_VERSION%.*}
25 |
26 | ################################################################################
27 | # SETUP - Check environment
28 | ################################################################################
29 |
30 | gpuci_logger "Check environment variables"
31 | env
32 |
33 | gpuci_logger "Check GPU usage"
34 | nvidia-smi
35 |
36 | gpuci_logger "Activate conda env"
37 | . /opt/conda/etc/profile.d/conda.sh
38 | conda activate dask
39 |
40 | gpuci_logger "Install distributed"
41 | python -m pip install git+https://github.com/dask/distributed
42 |
43 | gpuci_logger "Install dask"
44 | python -m pip install --no-deps -e .
45 |
46 | gpuci_logger "Install pytest-timeout"
47 | python -m pip install pytest-timeout
48 |
49 | gpuci_logger "Check Python version"
50 | python --version
51 |
52 | gpuci_logger "Check conda environment"
53 | conda info
54 | conda config --show-sources
55 | conda list --show-channel-urls
56 |
57 | gpuci_logger "Python py.test for dask"
58 | py.test $WORKSPACE -n 3 -v -m gpu --junitxml="$WORKSPACE/junit-dask.xml" --cov-config="$WORKSPACE/pyproject.toml" --cov=dask --cov-report=xml:"$WORKSPACE/dask-coverage.xml" --cov-report term
59 |
--------------------------------------------------------------------------------
/docs/source/delayed-api.rst:
--------------------------------------------------------------------------------
1 | API
2 | ===
3 |
4 | The ``dask.delayed`` interface consists of one function, ``delayed``:
5 |
6 | - ``delayed`` wraps functions
7 |
8 | Wraps functions. Can be used as a decorator, or around function calls
9 | directly (i.e. ``delayed(foo)(a, b, c)``). Outputs from functions wrapped in
10 | ``delayed`` are proxy objects of type ``Delayed`` that contain a graph of
11 | all operations done to get to this result.
12 |
13 | - ``delayed`` wraps objects
14 |
15 | Wraps objects. Used to create ``Delayed`` proxies directly.
16 |
17 | ``Delayed`` objects can be thought of as representing a key in the dask task
18 | graph. A ``Delayed`` supports *most* python operations, each of which creates
19 | another ``Delayed`` representing the result:
20 |
21 | - Most operators (``*``, ``-``, and so on)
22 | - Item access and slicing (``a[0]``)
23 | - Attribute access (``a.size``)
24 | - Method calls (``a.index(0)``)
25 |
26 | Operations that aren't supported include:
27 |
28 | - Mutating operators (``a += 1``)
29 | - Mutating magics such as ``__setitem__``/``__setattr__`` (``a[0] = 1``, ``a.foo = 1``)
30 | - Iteration (``for i in a: ...``)
31 | - Use as a predicate (``if a: ...``)
32 |
33 | The last two points in particular mean that ``Delayed`` objects cannot be used for
34 | control flow: no ``Delayed`` can drive a loop or an if statement.
35 | In other words, you can't iterate over a ``Delayed`` object or use it as part of
36 | a condition in an if statement, but a ``Delayed`` object can be used in the body of a loop
37 | or if statement (i.e. the example above is fine, but if ``data`` were a ``Delayed``
38 | object it wouldn't be).
39 | Even with this limitation, many workflows can easily be parallelized.
40 |
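For example, a minimal sketch of these rules in action:

.. code-block:: python

    >>> import dask
    >>> total = dask.delayed(sum)([1, 2, 3])   # wrap a function call
    >>> result = (total + 1) * 2               # operators return new Delayed objects
    >>> result.compute()
    14
    >>> # using ``total`` in an if statement or a for loop raises TypeError
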
41 | .. currentmodule:: dask.delayed
42 |
43 | .. autosummary::
44 | delayed
45 | Delayed
46 |
47 | .. autofunction:: delayed
48 | .. autoclass:: Delayed
49 |
--------------------------------------------------------------------------------
/.github/workflows/additional.yml:
--------------------------------------------------------------------------------
1 | name: Additional
2 |
3 | on: [push, pull_request]
4 |
5 | # Required shell entrypoint to have properly activated conda environments
6 | defaults:
7 | run:
8 | shell: bash -l {0}
9 |
10 | jobs:
11 | doctest:
12 | runs-on: "ubuntu-latest"
13 | timeout-minutes: 90
14 | steps:
15 | - name: Checkout source
16 | uses: actions/checkout@v3.5.3
17 |
18 | - name: Setup Conda Environment
19 | uses: conda-incubator/setup-miniconda@v2.2.0
20 | with:
21 | miniforge-variant: Mambaforge
22 | miniforge-version: latest
23 | use-mamba: true
24 | channel-priority: strict
25 | python-version: "3.10"
26 | environment-file: continuous_integration/environment-3.10.yaml
27 | activate-environment: test-environment
28 | auto-activate-base: false
29 |
30 | - name: Install
31 | run: source continuous_integration/scripts/install.sh
32 |
33 | - name: Run tests
34 | run: pytest -v --doctest-modules --ignore-glob='*/test_*.py' dask
35 |
36 | imports:
37 | runs-on: "ubuntu-latest"
38 | timeout-minutes: 90
39 | strategy:
40 | fail-fast: false
41 | matrix:
42 | python-version: ["3.9", "3.10", "3.11"]
43 | steps:
44 | - name: Checkout source
45 | uses: actions/checkout@v3.5.3
46 |
47 | - name: Setup Conda
48 | uses: conda-incubator/setup-miniconda@v2.2.0
49 | with:
50 | miniforge-variant: Mambaforge
51 | miniforge-version: latest
52 | use-mamba: true
53 | channel-priority: strict
54 | python-version: "3.9"
55 | activate-environment: test-environment
56 | auto-activate-base: false
57 |
58 | - name: Run import tests
59 | env:
60 | PYTHON_VERSION: ${{ matrix.python-version }}
61 | run: source continuous_integration/scripts/test_imports.sh
62 |
--------------------------------------------------------------------------------
/dask/tests/test_cache.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from operator import add
4 | from time import sleep
5 |
6 | import pytest
7 |
8 | from dask.cache import Cache
9 | from dask.callbacks import Callback
10 | from dask.local import get_sync
11 | from dask.threaded import get
12 |
13 | cachey = pytest.importorskip("cachey")
14 |
15 |
16 | flag = []
17 |
18 |
19 | def inc(x):
20 | flag.append(x)
21 | return x + 1
22 |
23 |
24 | def test_cache():
25 | c = cachey.Cache(10000)
26 | cc = Cache(c)
27 |
28 | with cc:
29 | assert get({"x": (inc, 1)}, "x") == 2
30 |
31 | assert flag == [1]
32 | assert c.data["x"] == 2
33 |
34 | assert not cc.starttimes
35 | assert not cc.durations
36 |
37 | while flag:
38 | flag.pop()
39 | dsk = {"x": (inc, 1), "y": (inc, 2), "z": (add, "x", "y")}
40 | with cc:
41 | assert get(dsk, "z") == 5
42 |
43 | assert flag == [2] # no x present
44 |
45 | assert not Callback.active
46 |
47 |
48 | def test_cache_with_number():
49 | c = Cache(10000, limit=1)
50 | assert isinstance(c.cache, cachey.Cache)
51 | assert c.cache.available_bytes == 10000
52 | assert c.cache.limit == 1
53 |
54 |
55 | def test_cache_correctness():
56 | # https://github.com/dask/dask/issues/3631
57 | c = Cache(10000)
58 | da = pytest.importorskip("dask.array")
59 | from numpy import ones, zeros
60 |
61 | z = da.from_array(zeros(1), chunks=10)
62 | o = da.from_array(ones(1), chunks=10)
63 | with c:
64 | assert (z.compute() == 0).all()
65 | assert (o.compute() == 1).all()
66 |
67 |
68 | def f(duration, size, *args):
69 | sleep(duration)
70 | return [0] * size
71 |
72 |
73 | def test_prefer_cheap_dependent():
74 | dsk = {"x": (f, 0.01, 10), "y": (f, 0.000001, 1, "x")}
75 | c = Cache(10000)
76 | with c:
77 | get_sync(dsk, "y")
78 |
79 | assert c.cache.scorer.cost["x"] < c.cache.scorer.cost["y"]
80 |
--------------------------------------------------------------------------------
/docs/source/understanding-performance.rst:
--------------------------------------------------------------------------------
1 | Understanding Performance
2 | =========================
3 |
4 | The first step in making computations run quickly is to understand the costs involved.
5 | In Python we often rely on tools like
6 | the CProfile module,
7 | the ``%%prun`` IPython magic,
8 | VMProf, or
9 | snakeviz
10 | to understand the costs associated with our code.
11 | However, few of these tools work well on multi-threaded or multi-process code,
12 | and fewer still on computations distributed among many machines.
13 | We also have new costs like data transfer, serialization, task scheduling overhead, and more
14 | that we may not be accustomed to tracking.
15 |
16 | Fortunately, the Dask schedulers come with diagnostics
17 | to help you understand the performance characteristics of your computations.
18 | By using these diagnostics and with some thought,
19 | we can often identify the slow parts of troublesome computations.
20 |
21 | The single-machine and distributed schedulers come with *different* diagnostic tools.
22 | These tools are deeply integrated into each scheduler,
23 | so a tool designed for one will not transfer over to the other.
24 |
25 | These pages provide four options for profiling parallel code:
26 |
27 | 1. Visualize task graphs
28 | 2. Single-threaded scheduler and a normal Python profiler (as sketched below)
29 | 3. Diagnostics for the single-machine scheduler
30 | 4. Diagnostics for the distributed scheduler and dashboard
31 |
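As a concrete illustration of option 2, the synchronous scheduler runs every task in the calling thread, so an ordinary profiler can attribute time to the real work. A minimal sketch:

.. code-block:: python

    import cProfile

    import dask.array as da

    x = da.random.random((2000, 2000), chunks=(500, 500))
    # Run on the single-threaded scheduler so cProfile sees every task
    cProfile.run("x.sum().compute(scheduler='synchronous')", sort="cumtime")
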
32 | Additionally, if you are interested in understanding the various phases where
33 | slowdown can occur, you may wish to read the following:
34 |
35 | - Phases of computation
36 |
--------------------------------------------------------------------------------
/.github/workflows/conda.yml:
--------------------------------------------------------------------------------
1 | name: Conda build
2 | on:
3 | push:
4 | branches:
5 | - main
6 | pull_request:
7 | paths:
8 | - setup.py
9 | - continuous_integration/recipe/**
10 | - .github/workflows/conda.yml
11 |
12 | # When this workflow is queued, automatically cancel any previous running
13 | # or pending jobs from the same branch
14 | concurrency:
15 | group: conda-${{ github.head_ref }}
16 | cancel-in-progress: true
17 |
18 | # Required shell entrypoint to have properly activated conda environments
19 | defaults:
20 | run:
21 | shell: bash -l {0}
22 |
23 | jobs:
24 | conda:
25 | name: Build (and upload)
26 | runs-on: ubuntu-latest
27 | steps:
28 | - uses: actions/checkout@v3.5.3
29 | with:
30 | fetch-depth: 0
31 | - name: Set up Python
32 | uses: conda-incubator/setup-miniconda@v2.2.0
33 | with:
34 | miniforge-variant: Mambaforge
35 | use-mamba: true
36 | python-version: 3.9
37 | channel-priority: strict
38 | - name: Install dependencies
39 | run: |
40 | mamba install -c conda-forge boa conda-verify
41 |
42 | which python
43 | pip list
44 | mamba list
45 | - name: Build conda package
46 | run: |
47 | # suffix for nightly package versions
48 | export VERSION_SUFFIX=a`date +%y%m%d`
49 |
50 | conda mambabuild continuous_integration/recipe \
51 | --no-anaconda-upload \
52 | --output-folder .
53 | - name: Upload conda package
54 | if: |
55 | github.event_name == 'push'
56 | && github.ref == 'refs/heads/main'
57 | && github.repository == 'dask/dask'
58 | env:
59 | ANACONDA_API_TOKEN: ${{ secrets.DASK_CONDA_TOKEN }}
60 | run: |
61 | # install anaconda for upload
62 | mamba install -c conda-forge anaconda-client
63 |
64 | anaconda upload --label dev noarch/*.tar.bz2
65 |
--------------------------------------------------------------------------------
/dask/context.py:
--------------------------------------------------------------------------------
1 | """
2 | Control global computation context
3 | """
4 | from __future__ import annotations
5 |
6 | import threading
7 | from functools import partial
8 |
9 | from dask import config
10 |
11 | _globals = config.config
12 |
13 |
14 | thread_state = threading.local()
15 |
16 |
17 | def globalmethod(default=None, key=None, falsey=None):
18 | """Allow function to be taken over by globals
19 |
20 | This modifies a method so that occurrences of it may be taken over by
21 | functions registered in the global options. Can be used as a decorator or a
22 | function.
23 |
24 | Parameters
25 | ----------
26 | default : callable
27 | The default callable to use.
28 | key : str
29 | Key under which we register this function in the global parameters
30 | falsey : callable, None, optional
31 | A function to use if the option is falsey. If not provided, the default
32 | is used instead.
33 |
34 | Examples
35 | --------
36 | >>> import dask
37 | >>> class Foo:
38 | ... @globalmethod(key='bar', falsey=lambda: 3)
39 | ... def bar():
40 | ... return 1
41 | >>> f = Foo()
42 | >>> f.bar()
43 | 1
44 | >>> with dask.config.set(bar=lambda: 2):
45 | ... print(f.bar())
46 | 2
47 | >>> with dask.config.set(bar=False):
48 | ... print(f.bar())
49 | 3
50 | """
51 | if default is None:
52 | return partial(globalmethod, key=key, falsey=falsey)
53 | return GlobalMethod(default=default, key=key, falsey=falsey)
54 |
55 |
56 | class GlobalMethod:
57 | def __init__(self, default, key, falsey=None):
58 | self._default = default
59 | self._key = key
60 | self._falsey = falsey
61 |
62 | def __get__(self, instance, owner=None):
63 | if self._key in _globals:
64 | if _globals[self._key]:
65 | return _globals[self._key]
66 | elif self._falsey is not None:
67 | return self._falsey
68 | return self._default
69 |
--------------------------------------------------------------------------------
/dask/dataframe/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | try:
4 | import dask.dataframe._pyarrow_compat
5 | from dask.base import compute
6 | from dask.dataframe import backends, dispatch, rolling
7 | from dask.dataframe.core import (
8 | DataFrame,
9 | Index,
10 | Series,
11 | _Frame,
12 | map_partitions,
13 | repartition,
14 | to_datetime,
15 | to_timedelta,
16 | )
17 | from dask.dataframe.groupby import Aggregation
18 | from dask.dataframe.io import (
19 | demo,
20 | from_array,
21 | from_dask_array,
22 | from_delayed,
23 | from_dict,
24 | from_map,
25 | from_pandas,
26 | read_csv,
27 | read_fwf,
28 | read_hdf,
29 | read_json,
30 | read_sql,
31 | read_sql_query,
32 | read_sql_table,
33 | read_table,
34 | to_bag,
35 | to_csv,
36 | to_hdf,
37 | to_json,
38 | to_records,
39 | to_sql,
40 | )
41 | from dask.dataframe.multi import concat, merge, merge_asof
42 | from dask.dataframe.numeric import to_numeric
43 | from dask.dataframe.optimize import optimize
44 | from dask.dataframe.reshape import get_dummies, melt, pivot_table
45 | from dask.dataframe.utils import assert_eq
46 |
47 | try:
48 | from dask.dataframe.io import read_parquet, to_parquet
49 | except ImportError:
50 | pass
51 | try:
52 | from dask.dataframe.io import read_orc, to_orc
53 | except ImportError:
54 | pass
55 | try:
56 | from dask.dataframe.core import isna
57 | except ImportError:
58 | pass
59 | except ImportError as e:
60 | msg = (
61 | "Dask dataframe requirements are not installed.\n\n"
62 | "Please either conda or pip install as follows:\n\n"
63 | " conda install dask # either conda install\n"
64 | ' python -m pip install "dask[dataframe]" --upgrade # or python -m pip install'
65 | )
66 | raise ImportError(msg) from e
67 |
--------------------------------------------------------------------------------
/dask/dataframe/_pyarrow_compat.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import copyreg
4 |
5 | import pandas as pd
6 |
7 | try:
8 | import pyarrow as pa
9 | except ImportError:
10 | pa = None
11 |
12 | from dask.dataframe._compat import PANDAS_GE_150, PANDAS_GE_200
13 |
14 | # Pickling of pyarrow arrays is effectively broken - pickling a slice of an
15 | # array ends up pickling the entire backing array.
16 | #
17 | # See https://issues.apache.org/jira/browse/ARROW-10739
18 | #
19 | # This comes up when using pandas `string[pyarrow]` dtypes, which are backed by
20 | # a `pyarrow.StringArray`. To fix this, we register a *global* override for
21 | # pickling `ArrowStringArray` or `ArrowExtensionArray` types (where available).
22 | # We do this at the pandas level rather than the pyarrow level for efficiency reasons
23 | # (a pandas ArrowStringArray may contain many small pyarrow StringArray objects).
24 | #
25 | # The implementation here is based on https://github.com/pandas-dev/pandas/pull/49078
26 | # which is included in pandas=2+. We can remove all this once Dask's minimum
27 | # supported pandas version is at least 2.0.0.
28 |
29 |
30 | def rebuild_arrowextensionarray(type_, chunks):
31 | array = pa.chunked_array(chunks)
32 | return type_(array)
33 |
34 |
35 | def reduce_arrowextensionarray(x):
36 | return (rebuild_arrowextensionarray, (type(x), x._data.combine_chunks()))
37 |
38 |
39 | # `pandas=2` includes efficient serialization of `pyarrow`-backed extension arrays.
40 | # See https://github.com/pandas-dev/pandas/pull/49078 for details.
41 | # We only need to backport efficient serialization for `pandas<2`.
42 | if pa is not None and not PANDAS_GE_200:
43 | if PANDAS_GE_150:
44 | # Applies to all `pyarrow`-backed extension arrays (e.g. `string[pyarrow]`, `int64[pyarrow]`)
45 | for type_ in [pd.arrays.ArrowExtensionArray, pd.arrays.ArrowStringArray]:
46 | copyreg.dispatch_table[type_] = reduce_arrowextensionarray
47 | else:
48 | # Only `string[pyarrow]` is implemented, so just patch that
49 | copyreg.dispatch_table[pd.arrays.ArrowStringArray] = reduce_arrowextensionarray
50 |
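
A rough sketch of the effect, assuming ``pyarrow`` is installed and ``pandas<2``: pickling a small slice of a large ``string[pyarrow]`` array stays small because only the slice's combined chunks are serialized.

.. code-block:: python

    import pickle

    import pandas as pd

    s = pd.array(["x" * 10] * 100_000, dtype="string[pyarrow]")
    small = s[:10]

    # Without the dispatch-table override above, this pickle would drag along
    # the full backing buffer; with it, only the ten sliced values are serialized.
    print(len(pickle.dumps(small)))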
--------------------------------------------------------------------------------
/dask/widgets/templates/highlevelgraph_layer.html.j2:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
11 |
12 | Layer{{ layer_index }}: {{ shortname }}
13 |
14 |
15 | {{ highlevelgraph_key }}
16 |
17 |
18 |
19 |
20 |
21 |
22 | {% for key, val in info.items() %}
23 |
24 | | {{ key }} |
25 | {{ val }} |
26 |
27 | {% endfor %}
28 | {% for dep in dependencies %}
29 | {% if loop.index > 1 %}
30 |
31 | |
32 | {{ dep }} |
33 |
34 | {% else %}
35 |
36 | | depends on |
37 | {{ dep }} |
38 |
39 | {% endif %}
40 | {% endfor %}
41 |
42 | |
43 |
44 | {{ svg_repr }}
45 | |
46 |
47 |
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/docs/source/bag-api.rst:
--------------------------------------------------------------------------------
1 | API
2 | ===
3 |
4 | .. currentmodule:: dask.bag
5 |
6 | Create Bags
7 | -----------
8 |
9 | .. autosummary::
10 | :toctree: generated/
11 |
12 | from_sequence
13 | from_delayed
14 | from_url
15 | range
16 | read_text
17 | read_avro
18 |
19 | From dataframe
20 | ~~~~~~~~~~~~~~
21 |
22 | .. currentmodule:: dask.dataframe
23 |
24 | .. autosummary::
25 | :toctree: generated/
26 |
27 | DataFrame.to_bag
28 | Series.to_bag
29 |
30 | Top-level functions
31 | -------------------
32 |
33 | .. currentmodule:: dask.bag
34 |
35 | .. autosummary::
36 | :toctree: generated/
37 |
38 | concat
39 | map
40 | map_partitions
41 | to_textfiles
42 | zip
43 |
44 | Random Sampling
45 | ---------------
46 |
47 | .. autosummary::
48 | :toctree: generated/
49 |
50 | random.choices
51 | random.sample
52 |
53 |
54 | Turn Bags into other things
55 | ---------------------------
56 |
57 | .. autosummary::
58 | :toctree: generated/
59 |
60 | Bag.to_textfiles
61 | Bag.to_dataframe
62 | Bag.to_delayed
63 | Bag.to_avro
64 |
65 |
66 | Bag Methods
67 | -----------
68 |
69 | .. autosummary::
70 | :toctree: generated/
71 |
72 | Bag
73 | Bag.accumulate
74 | Bag.all
75 | Bag.any
76 | Bag.compute
77 | Bag.count
78 | Bag.distinct
79 | Bag.filter
80 | Bag.flatten
81 | Bag.fold
82 | Bag.foldby
83 | Bag.frequencies
84 | Bag.groupby
85 | Bag.join
86 | Bag.map
87 | Bag.map_partitions
88 | Bag.max
89 | Bag.mean
90 | Bag.min
91 | Bag.persist
92 | Bag.pluck
93 | Bag.product
94 | Bag.reduction
95 | Bag.random_sample
96 | Bag.remove
97 | Bag.repartition
98 | Bag.starmap
99 | Bag.std
100 | Bag.sum
101 | Bag.take
102 | Bag.to_avro
103 | Bag.to_dataframe
104 | Bag.to_delayed
105 | Bag.to_textfiles
106 | Bag.topk
107 | Bag.var
108 | Bag.visualize
109 |
110 |
111 | Item Methods
112 | ------------
113 |
114 | .. autosummary::
115 | :toctree: generated/
116 |
117 | Item
118 | Item.apply
119 | Item.compute
120 | Item.from_delayed
121 | Item.persist
122 | Item.to_delayed
123 | Item.visualize
124 |
--------------------------------------------------------------------------------
/dask/cache.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 | from numbers import Number
5 | from timeit import default_timer
6 |
7 | from dask.callbacks import Callback
8 |
9 | overhead = sys.getsizeof(1.23) * 4 + sys.getsizeof(()) * 4
10 |
11 |
12 | class Cache(Callback):
13 | """Use cache for computation
14 |
15 | Examples
16 | --------
17 |
18 | >>> cache = Cache(1e9) # doctest: +SKIP
19 |
20 | The cache can be used locally as a context manager around ``compute`` or
21 | ``get`` calls:
22 |
23 | >>> with cache: # doctest: +SKIP
24 | ... result = x.compute()
25 |
26 | You can also register a cache globally, so that it works for all
27 | computations:
28 |
29 | >>> cache.register() # doctest: +SKIP
30 | >>> cache.unregister() # doctest: +SKIP
31 | """
32 |
33 | def __init__(self, cache, *args, **kwargs):
34 | try:
35 | import cachey
36 | except ImportError as ex:
37 | raise ImportError(
38 | 'Cache requires cachey, "{ex}" problem ' "importing".format(ex=str(ex))
39 | ) from ex
40 | self._nbytes = cachey.nbytes
41 | if isinstance(cache, Number):
42 | cache = cachey.Cache(cache, *args, **kwargs)
43 | else:
44 | assert not args and not kwargs
45 | self.cache = cache
46 | self.starttimes = dict()
47 |
48 | def _start(self, dsk):
49 | self.durations = dict()
50 | overlap = set(dsk) & set(self.cache.data)
51 | for key in overlap:
52 | dsk[key] = self.cache.data[key]
53 |
54 | def _pretask(self, key, dsk, state):
55 | self.starttimes[key] = default_timer()
56 |
57 | def _posttask(self, key, value, dsk, state, id):
58 | duration = default_timer() - self.starttimes[key]
59 | deps = state["dependencies"][key]
60 | if deps:
61 | duration += max(self.durations.get(k, 0) for k in deps)
62 | self.durations[key] = duration
63 | nb = self._nbytes(value) + overhead + sys.getsizeof(key) * 4
64 | self.cache.put(key, value, cost=duration / nb / 1e9, nbytes=nb)
65 |
66 | def _finish(self, dsk, state, errored):
67 | self.starttimes.clear()
68 | self.durations.clear()
69 |
--------------------------------------------------------------------------------
/dask/array/image.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from glob import glob
5 |
6 | try:
7 | from skimage.io import imread as sk_imread
8 | except (AttributeError, ImportError):
9 | pass
10 |
11 | from dask.array.core import Array
12 | from dask.base import tokenize
13 |
14 |
15 | def add_leading_dimension(x):
16 | return x[None, ...]
17 |
18 |
19 | def imread(filename, imread=None, preprocess=None):
20 | """Read a stack of images into a dask array
21 |
22 | Parameters
23 | ----------
24 |
25 | filename: string
26 | A globstring like 'myfile.*.png'
27 | imread: function (optional)
28 | Optionally provide custom imread function.
29 | Function should expect a filename and produce a numpy array.
30 | Defaults to ``skimage.io.imread``.
31 | preprocess: function (optional)
32 | Optionally provide custom function to preprocess the image.
33 | Function should expect a numpy array for a single image.
34 |
35 | Examples
36 | --------
37 |
38 | >>> from dask.array.image import imread
39 | >>> im = imread('2015-*-*.png') # doctest: +SKIP
40 | >>> im.shape # doctest: +SKIP
41 | (365, 1000, 1000, 3)
42 |
43 | Returns
44 | -------
45 |
46 | Dask array of all images stacked along the first dimension.
47 | Each separate image file will be treated as an individual chunk.
48 | """
49 | imread = imread or sk_imread
50 | filenames = sorted(glob(filename))
51 | if not filenames:
52 | raise ValueError("No files found under name %s" % filename)
53 |
54 | name = "imread-%s" % tokenize(filenames, map(os.path.getmtime, filenames))
55 |
56 | sample = imread(filenames[0])
57 | if preprocess:
58 | sample = preprocess(sample)
59 |
60 | keys = [(name, i) + (0,) * len(sample.shape) for i in range(len(filenames))]
61 | if preprocess:
62 | values = [
63 | (add_leading_dimension, (preprocess, (imread, fn))) for fn in filenames
64 | ]
65 | else:
66 | values = [(add_leading_dimension, (imread, fn)) for fn in filenames]
67 | dsk = dict(zip(keys, values))
68 |
69 | chunks = ((1,) * len(filenames),) + tuple((d,) for d in sample.shape)
70 |
71 | return Array(dsk, name, chunks, sample.dtype)
72 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.4.0
4 | hooks:
5 | - id: end-of-file-fixer
6 | - id: debug-statements
7 | - repo: https://github.com/MarcoGorelli/absolufy-imports
8 | rev: v0.3.1
9 | hooks:
10 | - id: absolufy-imports
11 | name: absolufy-imports
12 | - repo: https://github.com/pycqa/isort
13 | rev: 5.12.0
14 | hooks:
15 | - id: isort
16 | language_version: python3
17 | - repo: https://github.com/asottile/pyupgrade
18 | rev: v3.4.0
19 | hooks:
20 | - id: pyupgrade
21 | args:
22 | - --py39-plus
23 | - repo: https://github.com/psf/black
24 | rev: 23.3.0
25 | hooks:
26 | - id: black
27 | language_version: python3
28 | args:
29 | - --target-version=py39
30 | - repo: https://github.com/pycqa/flake8
31 | rev: 6.0.0
32 | hooks:
33 | - id: flake8
34 | language_version: python3
35 | additional_dependencies:
36 | # NOTE: autoupdate does not pick up flake8-bugbear since it is a transitive
37 | # dependency. Make sure to update flake8-bugbear manually on a regular basis.
38 | - flake8-bugbear==23.2.13
39 | - repo: https://github.com/codespell-project/codespell
40 | rev: v2.2.4
41 | hooks:
42 | - id: codespell
43 | types_or: [rst, markdown]
44 | files: docs
45 | additional_dependencies:
46 | - tomli
47 | - repo: https://github.com/pre-commit/mirrors-mypy
48 | # pinned due to
49 | # https://github.com/python/typeshed/pull/9771 and
50 | # https://github.com/python/mypy/issues/15257 for DaskCollection.__dask_scheduler__
51 | rev: v1.1.1
52 | hooks:
53 | - id: mypy
54 | # Override default --ignore-missing-imports
55 | # Use pyproject.toml if possible instead of adding command line parameters here
56 | args: [--warn-unused-configs]
57 | additional_dependencies:
58 | # Type stubs
59 | # - pandas-stubs # TODO
60 | - types-docutils
61 | - types-PyYAML
62 | - types-psutil
63 | - types-requests
64 | - types-setuptools
65 | # Typed libraries
66 | - numpy
67 | - pytest
68 |
--------------------------------------------------------------------------------
/dask/dask.yaml:
--------------------------------------------------------------------------------
1 | temporary-directory: null # Directory for local disk like /tmp, /scratch, or /local
2 |
3 | visualization:
4 | engine: null # Default visualization engine to use when calling `.visualize()` on a collection
5 |
6 | tokenize:
7 | ensure-deterministic: false # If true, tokenize will error instead of falling back to uuids
8 |
9 | dataframe:
10 | backend: "pandas" # Backend dataframe library for input IO and data creation
11 | shuffle:
12 | method: null
13 | compression: null # compression for on disk-shuffling. Partd supports ZLib, BZ2, SNAPPY
14 | parquet:
15 | metadata-task-size-local: 512 # Number of files per local metadata-processing task
16 | metadata-task-size-remote: 1 # Number of files per remote metadata-processing task
17 | convert-string: null # Whether to convert string-like data to pyarrow strings
18 |
19 | array:
20 | backend: "numpy" # Backend array library for input IO and data creation
21 | chunk-size: "128MiB"
22 | rechunk:
23 | method: "tasks" # Rechunking method to use
24 | threshold: 4
25 | svg:
26 | size: 120 # pixels
27 | slicing:
28 | split-large-chunks: null # How to handle large output chunks in slicing. Warns by default.
29 |
30 | optimization:
31 | annotations:
32 | fuse: true # Automatically fuse compatible annotations on layers
33 | fuse:
34 | active: null # Treat as false for dask.dataframe, true for everything else
35 | ave-width: 1
36 | max-width: null # 1.5 + ave_width * log(ave_width + 1)
37 | max-height: .inf
38 | max-depth-new-edges: null # ave_width * 1.5
39 | subgraphs: null # true for dask.dataframe, false for everything else
40 | rename-keys: true
41 |
42 | admin:
43 | traceback:
44 | shorten:
45 | when:
46 | - dask[\\\/]base.py
47 | - distributed[\\\/]client.py
48 | what:
49 | - dask[\\\/]base.py
50 | - dask[\\\/]core.py
51 | - dask[\\\/]array[\\\/]core.py
52 | - dask[\\\/]optimization.py
53 | - dask[\\\/]dataframe[\\\/]core.py
54 | - dask[\\\/]dataframe[\\\/]methods.py
55 | - dask[\\\/]utils.py
56 | - distributed[\\\/]worker.py
57 | - distributed[\\\/]scheduler.py
58 | - distributed[\\\/]client.py
59 | - distributed[\\\/]utils.py
60 | - tornado[\\\/]gen.py
61 | - pandas[\\\/]core[\\\/]
62 |
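
These defaults can be inspected and overridden at runtime through ``dask.config``; a small sketch:

.. code-block:: python

    import dask
    import dask.array as da

    print(dask.config.get("array.chunk-size"))   # "128MiB" by default

    # Override a value for a block of code only
    with dask.config.set({"array.chunk-size": "64MiB"}):
        x = da.ones((20_000, 20_000))            # "auto" chunking reads the active config
        print(x.chunksize)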
--------------------------------------------------------------------------------
/docs/source/array-stack.rst:
--------------------------------------------------------------------------------
1 | Stack, Concatenate, and Block
2 | =============================
3 |
4 | Often we have many arrays stored on disk that we want to stack together and
5 | think of as one large array. This is common with geospatial data in which we
6 | might have many HDF5/NetCDF files on disk, one for every day, but we want to do
7 | operations that span multiple days.
8 |
9 | To solve this problem, we use the functions ``da.stack``, ``da.concatenate``,
10 | and ``da.block``.
11 |
12 | Stack
13 | -----
14 |
15 | We stack many existing Dask arrays into a new array, creating a new dimension
16 | as we go.
17 |
18 | .. code-block:: python
19 |
20 | >>> import dask.array as da
21 | >>> import numpy as np
22 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2))
23 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2))
24 |
25 | >>> data = [arr0, arr1]
26 |
27 | >>> x = da.stack(data, axis=0)
28 | >>> x.shape
29 | (2, 3, 4)
30 |
31 | >>> da.stack(data, axis=1).shape
32 | (3, 2, 4)
33 |
34 | >>> da.stack(data, axis=-1).shape
35 | (3, 4, 2)
36 |
37 | This creates a new dimension with length equal to the number of slices
38 |
39 | Concatenate
40 | -----------
41 |
42 | We concatenate existing arrays into a new array, extending them along an
43 | existing dimension
44 |
45 | .. code-block:: python
46 |
47 | >>> import dask.array as da
48 | >>> import numpy as np
49 |
50 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2))
51 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2))
52 |
53 | >>> data = [arr0, arr1]
54 |
55 | >>> x = da.concatenate(data, axis=0)
56 | >>> x.shape
57 | (6, 4)
58 |
59 | >>> da.concatenate(data, axis=1).shape
60 | (3, 8)
61 |
62 | Block
63 | -----
64 |
65 | We can handle a larger variety of cases with ``da.block`` as it allows
66 | concatenation to be applied over multiple dimensions at once. This is useful if
67 | your chunks tile a space, for example if small squares tile a larger 2-D plane.
68 |
69 | .. code-block:: python
70 |
71 | >>> import dask.array as da
72 | >>> import numpy as np
73 |
74 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2))
75 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2))
76 |
77 | >>> data = [
78 | ... [arr0, arr1],
79 | ... [arr1, arr0]
80 | ... ]
81 |
82 | >>> x = da.block(data)
83 | >>> x.shape
84 | (6, 8)
85 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_numeric.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 |
7 | from dask.array import Array, from_array
8 | from dask.dataframe import Series, from_pandas, to_numeric
9 | from dask.dataframe.utils import pyarrow_strings_enabled
10 | from dask.delayed import Delayed
11 |
12 |
13 | @pytest.mark.parametrize("arg", ["5", 5, "5 "])
14 | def test_to_numeric_on_scalars(arg):
15 | output = to_numeric(arg)
16 | assert isinstance(output, Delayed)
17 | assert output.compute() == 5
18 |
19 |
20 | def test_to_numeric_on_dask_array():
21 | arg = from_array(["1.0", "2", "-3", "5.1"])
22 | expected = np.array([1.0, 2.0, -3.0, 5.1])
23 | output = to_numeric(arg)
24 | assert isinstance(output, Array)
25 | assert list(output.compute()) == list(expected)
26 |
27 |
28 | def test_to_numeric_on_dask_dataframe_series():
29 | s = pd.Series(["1.0", "2", -3, -5.1])
30 | arg = from_pandas(s, npartitions=2)
31 | expected = pd.to_numeric(s)
32 | output = to_numeric(arg)
33 | expected_dtype = "int64"
34 | if pyarrow_strings_enabled():
35 | # `to_numeric` output depends on input dtype
36 | expected_dtype = "Int64"
37 | assert output.dtype == expected_dtype
38 | assert isinstance(output, Series)
39 | assert list(output.compute()) == list(expected)
40 |
41 |
42 | def test_to_numeric_on_dask_dataframe_series_with_meta():
43 | s = pd.Series(["1.0", "2", -3, -5.1])
44 | arg = from_pandas(s, npartitions=2)
45 | expected = pd.to_numeric(s)
46 | output = to_numeric(arg, meta=pd.Series([], dtype="float64"))
47 | assert output.dtype == "float64"
48 | assert isinstance(output, Series)
49 | assert list(output.compute()) == list(expected)
50 |
51 |
52 | def test_to_numeric_on_dask_dataframe_dataframe_raises_error():
53 | s = pd.Series(["1.0", "2", -3, -5.1])
54 | df = pd.DataFrame({"a": s, "b": s})
55 | arg = from_pandas(df, npartitions=2)
56 | with pytest.raises(TypeError, match="arg must be a list, tuple, dask."):
57 | to_numeric(arg)
58 |
59 |
60 | def test_to_numeric_raises():
61 | with pytest.raises(ValueError, match="invalid error value"):
62 | to_numeric("10", errors="invalid")
63 | with pytest.raises(KeyError, match="``meta`` is not allowed"):
64 | to_numeric("10", meta=pd.Series([], dtype="float64"))
65 |
--------------------------------------------------------------------------------
/dask/array/cupy_entry_point.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import dask.array as da
4 | from dask import config
5 | from dask.array.backends import ArrayBackendEntrypoint, register_cupy
6 | from dask.array.core import Array
7 | from dask.array.dispatch import to_cupy_dispatch
8 |
9 |
10 | def _cupy(strict=True):
11 | try:
12 | import cupy
13 | except ImportError:
14 | if strict:
15 | raise ImportError("Please install `cupy` to use `CupyBackendEntrypoint`")
16 | return None
17 | return cupy
18 |
19 |
20 | def _da_with_cupy_meta(attr, *args, meta=None, **kwargs):
21 | # Call the dask.array api with cupy-based meta
22 | meta = _cupy().empty(()) if meta is None else meta
23 | with config.set({"array.backend": "numpy"}):
24 | return getattr(da, attr)(*args, meta=meta, **kwargs)
25 |
26 |
27 | class CupyBackendEntrypoint(ArrayBackendEntrypoint):
28 | def __init__(self):
29 | """Register data-directed dispatch functions"""
30 | if _cupy(strict=False):
31 | register_cupy()
32 |
33 | @classmethod
34 | def to_backend_dispatch(cls):
35 | return to_cupy_dispatch
36 |
37 | @classmethod
38 | def to_backend(cls, data: Array, **kwargs):
39 | if isinstance(data._meta, _cupy().ndarray):
40 | # Already a cupy-backed collection
41 | return data
42 | return data.map_blocks(cls.to_backend_dispatch(), **kwargs)
43 |
44 | @property
45 | def RandomState(self):
46 | return _cupy().random.RandomState
47 |
48 | @property
49 | def default_bit_generator(self):
50 | return _cupy().random.XORWOW
51 |
52 | @staticmethod
53 | def ones(*args, **kwargs):
54 | return _da_with_cupy_meta("ones", *args, **kwargs)
55 |
56 | @staticmethod
57 | def zeros(*args, **kwargs):
58 | return _da_with_cupy_meta("zeros", *args, **kwargs)
59 |
60 | @staticmethod
61 | def empty(*args, **kwargs):
62 | return _da_with_cupy_meta("empty", *args, **kwargs)
63 |
64 | @staticmethod
65 | def full(*args, **kwargs):
66 | return _da_with_cupy_meta("full", *args, **kwargs)
67 |
68 | @staticmethod
69 | def arange(*args, like=None, **kwargs):
70 | like = _cupy().empty(()) if like is None else like
71 | with config.set({"array.backend": "numpy"}):
72 | return da.arange(*args, like=like, **kwargs)
73 |
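
A minimal sketch of how this entrypoint is selected through the ``array.backend`` config option (requires ``cupy`` and a GPU):

.. code-block:: python

    import dask
    import dask.array as da

    with dask.config.set({"array.backend": "cupy"}):
        x = da.ones((1000, 1000), chunks=(250, 250))  # dispatches to CupyBackendEntrypoint.ones
        print(type(x._meta))                          # cupy.ndarray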
--------------------------------------------------------------------------------
/dask/dataframe/_dtypes.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from datetime import date, time
4 | from decimal import Decimal
5 |
6 | import pandas as pd
7 |
8 | from dask.dataframe._compat import PANDAS_GE_150
9 | from dask.dataframe.extensions import make_array_nonempty, make_scalar
10 |
11 |
12 | @make_array_nonempty.register(pd.DatetimeTZDtype)
13 | def _(dtype):
14 | return pd.array([pd.Timestamp(1), pd.NaT], dtype=dtype)
15 |
16 |
17 | @make_scalar.register(pd.DatetimeTZDtype)
18 | def _(x):
19 | return pd.Timestamp(1, tz=x.tz, unit=x.unit)
20 |
21 |
22 | @make_array_nonempty.register(pd.StringDtype)
23 | def _(dtype):
24 | return pd.array(["a", pd.NA], dtype=dtype)
25 |
26 |
27 | if PANDAS_GE_150:
28 |
29 | @make_array_nonempty.register(pd.ArrowDtype)
30 | def _make_array_nonempty_pyarrow_dtype(dtype):
31 | import pyarrow as pa
32 |
33 | if pa.types.is_integer(dtype.pyarrow_dtype):
34 | data = [1, 2]
35 | elif pa.types.is_floating(dtype.pyarrow_dtype):
36 | data = [1.5, 2.5]
37 | elif pa.types.is_boolean(dtype.pyarrow_dtype):
38 | data = [True, False]
39 | elif pa.types.is_string(dtype.pyarrow_dtype) or pa.types.is_large_string(
40 | dtype.pyarrow_dtype
41 | ):
42 | data = ["a", "b"]
43 | elif pa.types.is_timestamp(dtype.pyarrow_dtype):
44 | data = [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-02")]
45 | elif pa.types.is_date(dtype.pyarrow_dtype):
46 | data = [date(1970, 1, 1), date(1970, 1, 2)]
47 | elif pa.types.is_binary(dtype.pyarrow_dtype) or pa.types.is_large_binary(
48 | dtype.pyarrow_dtype
49 | ):
50 | data = [b"a", b"b"]
51 | elif pa.types.is_decimal(dtype.pyarrow_dtype):
52 | data = [Decimal("1"), Decimal("0.0")]
53 | elif pa.types.is_duration(dtype.pyarrow_dtype):
54 | data = [pd.Timedelta("1 day"), pd.Timedelta("2 days")]
55 | elif pa.types.is_time(dtype.pyarrow_dtype):
56 | data = [time(12, 0), time(0, 12)]
57 | else:
58 | data = dtype.empty(2)
59 | return pd.array(data, dtype=dtype)
60 |
61 |
62 | @make_scalar.register(str)
63 | def _(x):
64 | return "s"
65 |
66 |
67 | @make_array_nonempty.register(pd.BooleanDtype)
68 | def _(dtype):
69 | return pd.array([True, pd.NA], dtype=dtype)
70 |
71 |
72 | @make_scalar.register(bool)
73 | def _(x):
74 | return True
75 |
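
A small sketch of what these registrations produce; ``make_array_nonempty`` is used when Dask needs a non-empty, two-element stand-in for extension-dtype data (the dtype below is chosen arbitrarily):

.. code-block:: python

    import pandas as pd

    from dask.dataframe.extensions import make_array_nonempty

    # Dispatches on the dtype's type, hitting the registration above
    sample = make_array_nonempty(pd.BooleanDtype())
    print(sample)   # [True, <NA>]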
--------------------------------------------------------------------------------
/dask/array/tests/test_cupy_reductions.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import warnings
4 |
5 | import numpy as np
6 | import pytest
7 |
8 | pytestmark = pytest.mark.gpu
9 |
10 | import dask
11 | import dask.array as da
12 | from dask.array.utils import assert_eq
13 |
14 | cupy = pytest.importorskip("cupy")
15 |
16 |
17 | @pytest.mark.parametrize(
18 | ["dfunc", "func"],
19 | [
20 | (da.argmin, np.argmin),
21 | (da.argmax, np.argmax),
22 | (da.nanargmin, np.nanargmin),
23 | (da.nanargmax, np.nanargmax),
24 | ],
25 | )
26 | def test_arg_reductions(dfunc, func):
27 | x = cupy.random.default_rng().random((10, 10, 10))
28 | a = da.from_array(x, chunks=(3, 4, 5))
29 |
30 | assert_eq(dfunc(a), func(x))
31 | assert_eq(dfunc(a, 0), func(x, 0))
32 | assert_eq(dfunc(a, 1), func(x, 1))
33 | assert_eq(dfunc(a, 2), func(x, 2))
34 | with dask.config.set(split_every=2):
35 | assert_eq(dfunc(a), func(x))
36 | assert_eq(dfunc(a, 0), func(x, 0))
37 | assert_eq(dfunc(a, 1), func(x, 1))
38 | assert_eq(dfunc(a, 2), func(x, 2))
39 |
40 | pytest.raises(ValueError, lambda: dfunc(a, 3))
41 | pytest.raises(TypeError, lambda: dfunc(a, (0, 1)))
42 |
43 | x2 = cupy.arange(10)
44 | a2 = da.from_array(x2, chunks=3)
45 | assert_eq(dfunc(a2), func(x2))
46 | assert_eq(dfunc(a2, 0), func(x2, 0))
47 | assert_eq(dfunc(a2, 0, split_every=2), func(x2, 0))
48 |
49 |
50 | @pytest.mark.parametrize(
51 | ["dfunc", "func"], [(da.nanargmin, np.nanargmin), (da.nanargmax, np.nanargmax)]
52 | )
53 | def test_nanarg_reductions(dfunc, func):
54 | x = cupy.random.default_rng().random((10, 10, 10))
55 | x[5] = cupy.nan
56 | a = da.from_array(x, chunks=(3, 4, 5))
57 | assert_eq(dfunc(a), func(x))
58 | assert_eq(dfunc(a, 0), func(x, 0))
59 |
60 | with warnings.catch_warnings():
61 | warnings.simplefilter("ignore", RuntimeWarning) # All-NaN slice encountered
62 | with pytest.raises(ValueError):
63 | dfunc(a, 1).compute()
64 |
65 | with pytest.raises(ValueError):
66 | dfunc(a, 2).compute()
67 |
68 | x[:] = cupy.nan
69 | a = da.from_array(x, chunks=(3, 4, 5))
70 | with pytest.raises(ValueError):
71 | dfunc(a).compute()
72 |
73 |
74 | @pytest.mark.parametrize("func", [np.cumsum, np.cumprod])
75 | def test_cumreduction_with_cupy(func):
76 | a = cupy.ones((10, 10))
77 | b = da.from_array(a, chunks=(4, 4))
78 | result = func(b, axis=0)
79 | assert_eq(result, func(a, axis=0))
80 |
--------------------------------------------------------------------------------
/dask/widgets/templates/highlevelgraph.html.j2:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
14 |
15 |
16 | HighLevelGraph
17 |
18 | {{ type }} with {{ layers | length }} layers and {{ n_outputs }} keys from all layers.
19 |
20 | {% for layer in toposort %}
21 | {{ layers[layer]._repr_html_(layer_index=loop.index, highlevelgraph_key=layer, dependencies=layer_dependencies[layer])}}
22 | {% endfor %}
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/.github/workflows/upstream.yml:
--------------------------------------------------------------------------------
1 | name: Upstream
2 |
3 | on:
4 | schedule:
5 | - cron: "0 1 * * *"
6 | push:
7 | pull_request:
8 | workflow_dispatch:
9 |
10 | # Required shell entrypoint to have properly activated conda environments
11 | defaults:
12 | run:
13 | shell: bash -l {0}
14 |
15 | jobs:
16 |
17 | check:
18 | runs-on: ubuntu-latest
19 | if: github.event_name == 'push' || github.event_name == 'pull_request'
20 | outputs:
21 | test-upstream: ${{ steps.detect-trigger.outputs.trigger-found }}
22 | steps:
23 | - uses: actions/checkout@v3.5.3
24 | with:
25 | fetch-depth: 2
26 | - uses: xarray-contrib/ci-trigger@v1
27 | id: detect-trigger
28 | with:
29 | keyword: "test-upstream"
30 |
31 | build:
32 | needs: check
33 | runs-on: ubuntu-latest
34 | if: |
35 | always()
36 | && (
37 | needs.check.outputs.test-upstream == 'true'
38 | || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'upstream'))
39 | || (github.repository == 'dask/dask' && github.event_name != 'pull_request')
40 | )
41 | timeout-minutes: 90
42 |
43 | env:
44 | COVERAGE: "true"
45 | PARALLEL: "true"
46 | UPSTREAM_DEV: 1
47 |
48 | steps:
49 | - name: Checkout source
50 | uses: actions/checkout@v3.5.3
51 |
52 | - name: Setup Conda Environment
53 | uses: conda-incubator/setup-miniconda@v2.2.0
54 | with:
55 | miniforge-variant: Mambaforge
56 | miniforge-version: latest
57 | use-mamba: true
58 | channel-priority: strict
59 | python-version: "3.10"
60 | environment-file: continuous_integration/environment-3.10.yaml
61 | activate-environment: test-environment
62 | auto-activate-base: false
63 |
64 | - name: Install
65 | run: source continuous_integration/scripts/install.sh
66 |
67 | - name: Run tests
68 | id: run_tests
69 | env:
70 | XTRATESTARGS: "--report-log output-log.jsonl"
71 | run: source continuous_integration/scripts/run_tests.sh
72 |
73 | - name: Open or update issue on failure
74 | if: |
75 | failure()
76 | && github.event_name != 'pull_request'
77 | && github.repository == 'dask/dask'
78 | && steps.run_tests.outcome == 'failure'
79 | uses: xarray-contrib/issue-from-pytest-log@v1.2.6
80 | with:
81 | log-path: output-log.jsonl
82 | issue-title: ⚠️ Upstream CI failed ⚠️
83 | issue-label: upstream
84 |
85 | - name: Coverage
86 | uses: codecov/codecov-action@v3
87 |
--------------------------------------------------------------------------------
/continuous_integration/scripts/install.sh:
--------------------------------------------------------------------------------
1 | set -xe
2 |
3 | if [[ ${UPSTREAM_DEV} ]]; then
4 |
5 | # NOTE: `dask/tests/test_ci.py::test_upstream_packages_installed` should be
6 | # updated when packages here are updated.
7 |
8 | # FIXME https://github.com/mamba-org/mamba/issues/412
9 | # mamba uninstall --force ...
10 | conda uninstall --force bokeh
11 | mamba install -y -c bokeh/label/dev bokeh
12 |
13 | # FIXME https://github.com/mamba-org/mamba/issues/412
14 | # mamba uninstall --force ...
15 | conda uninstall --force pyarrow
16 | python -m pip install --no-deps \
17 | --extra-index-url https://pypi.fury.io/arrow-nightlies/ \
18 | --prefer-binary --pre pyarrow
19 |
20 | # FIXME https://github.com/mamba-org/mamba/issues/412
21 | # mamba uninstall --force ...
22 | conda uninstall --force fastparquet
23 | python -m pip install \
24 | --upgrade \
25 | locket \
26 | git+https://github.com/pydata/sparse \
27 | git+https://github.com/dask/s3fs \
28 | git+https://github.com/intake/filesystem_spec \
29 | git+https://github.com/dask/partd \
30 | git+https://github.com/dask/zict \
31 | git+https://github.com/dask/distributed \
32 | git+https://github.com/dask/fastparquet \
33 | git+https://github.com/zarr-developers/zarr-python
34 |
35 | # FIXME https://github.com/mamba-org/mamba/issues/412
36 | # mamba uninstall --force ...
37 | conda uninstall --force numpy pandas scipy
38 | python -m pip install --no-deps --pre --retries 10 \
39 | -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
40 | numpy \
41 | pandas \
42 | scipy
43 |
44 | # Used when automatically opening an issue when the `upstream` CI build fails
45 | mamba install pytest-reportlog
46 |
47 | # Crick doesn't work with latest nightly `numpy`. Temporarily remove
48 | # `crick` from the upstream CI environment as a workaround.
49 | # Can restore `crick` once https://github.com/dask/crick/issues/25 is closed.
50 |
51 | # Tiledb is causing segfaults. Temporarily remove `tiledb` and `tiledb-py`
52 | # as a workaround.
53 |
54 | # FIXME https://github.com/mamba-org/mamba/issues/412
55 | # mamba uninstall --force ...
56 | conda uninstall --force crick tiledb tiledb-py
57 |
58 |
59 | fi
60 |
61 | # Install dask
62 | python -m pip install --quiet --no-deps -e .[complete]
63 | echo mamba list
64 | mamba list
65 |
66 | # For debugging
67 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name <name> -f <file>\`)\n--"
68 | mamba env export | grep -E -v '^prefix:.*$' > env.yaml
69 | cat env.yaml
70 |
71 | set +xe
72 |
--------------------------------------------------------------------------------
/docs/source/images/unoverlapping-neighbors.svg:
--------------------------------------------------------------------------------
[SVG markup not preserved in this dump]
--------------------------------------------------------------------------------
/.github/workflows/update-gpuci.yml:
--------------------------------------------------------------------------------
1 | name: Check for gpuCI updates
2 |
3 | on:
4 | schedule:
5 | - cron: "0 0 * * *" # Daily “At 00:00” UTC
6 | workflow_dispatch:
7 |
8 | jobs:
9 | update-gpuci:
10 | runs-on: ubuntu-latest
11 | if: github.repository == 'dask/dask'
12 |
13 | steps:
14 | - uses: actions/checkout@v3.5.3
15 |
16 | - name: Parse current axis YAML
17 | id: rapids_current
18 | uses: the-coding-turtle/ga-yaml-parser@v0.1.2
19 | with:
20 | file: continuous_integration/gpuci/axis.yaml
21 |
22 | - name: Get latest cuDF nightly version
23 | id: cudf_latest
24 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3
25 | with:
26 | org: "rapidsai-nightly"
27 | package: "cudf"
28 | version_system: "CalVer"
29 |
30 | - name: Get latest UCX-Py nightly version
31 | id: ucx_py_latest
32 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3
33 | with:
34 | org: "rapidsai-nightly"
35 | package: "ucx-py"
36 | version_system: "CalVer"
37 |
38 | - name: Get old RAPIDS / UCX-Py versions
39 | env:
40 | FULL_RAPIDS_VER: ${{ steps.cudf_latest.outputs.version }}
41 | FULL_UCX_PY_VER: ${{ steps.ucx_py_latest.outputs.version }}
42 | run: |
43 | echo RAPIDS_VER=${{ steps.rapids_current.outputs.RAPIDS_VER_0 }} >> $GITHUB_ENV
44 | echo UCX_PY_VER=$(curl -sL https://version.gpuci.io/rapids/${{ steps.rapids_current.outputs.RAPIDS_VER_0 }}) >> $GITHUB_ENV
45 | echo NEW_RAPIDS_VER=${FULL_RAPIDS_VER::-4} >> $GITHUB_ENV
46 | echo NEW_UCX_PY_VER=${FULL_UCX_PY_VER::-4} >> $GITHUB_ENV
47 |
48 | - name: Update RAPIDS version
49 | uses: jacobtomlinson/gha-find-replace@v3
50 | with:
51 | include: 'continuous_integration\/gpuci\/axis\.yaml'
52 | find: "${{ env.RAPIDS_VER }}"
53 | replace: "${{ env.NEW_RAPIDS_VER }}"
54 | regex: false
55 |
56 | - name: Create Pull Request
57 | uses: peter-evans/create-pull-request@v5
58 | if: ${{ env.UCX_PY_VER != env.NEW_UCX_PY_VER }} # make sure new ucx-py nightlies are available
59 | with:
60 | token: ${{ secrets.GITHUB_TOKEN }}
61 | draft: true
62 | commit-message: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`"
63 | title: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`"
64 | author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
65 | branch: "upgrade-gpuci-rapids"
66 | body: |
67 | New cuDF and ucx-py nightly versions have been detected.
68 |
69 | Updated `axis.yaml` to use `${{ env.NEW_RAPIDS_VER }}`.
70 |
--------------------------------------------------------------------------------
/dask/dataframe/hyperloglog.py:
--------------------------------------------------------------------------------
1 | """Implementation of HyperLogLog
2 |
3 | This implements the HyperLogLog algorithm for cardinality estimation, found
4 | in
5 |
6 | Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier.
7 | "HyperLogLog: the analysis of a near-optimal cardinality estimation
8 | algorithm". 2007 Conference on Analysis of Algorithms. Nice, France
9 | (2007)
10 |
11 | """
12 | from __future__ import annotations
13 |
14 | import numpy as np
15 | import pandas as pd
16 | from pandas.util import hash_pandas_object
17 |
18 |
19 | def compute_first_bit(a):
20 | "Compute the position of the first nonzero bit for each int in an array."
21 | # TODO: consider making this less memory-hungry
22 | bits = np.bitwise_and.outer(a, 1 << np.arange(32))
23 | bits = bits.cumsum(axis=1).astype(bool)
24 | return 33 - bits.sum(axis=1)
25 |
26 |
27 | def compute_hll_array(obj, b):
28 | # b is the number of bits
29 |
30 | if not 8 <= b <= 16:
31 | raise ValueError("b should be between 8 and 16")
32 | num_bits_discarded = 32 - b
33 | m = 1 << b
34 |
35 | # Get an array of the hashes
36 | hashes = hash_pandas_object(obj, index=False)
37 | if isinstance(hashes, pd.Series):
38 | hashes = hashes._values
39 | hashes = hashes.astype(np.uint32)
40 |
41 | # Of the first b bits, which is the first nonzero?
42 | j = hashes >> num_bits_discarded
43 | first_bit = compute_first_bit(hashes)
44 |
45 | # Pandas can do the max aggregation
46 | df = pd.DataFrame({"j": j, "first_bit": first_bit})
47 | series = df.groupby("j").max()["first_bit"]
48 |
49 | # Return a dense array so we can concat them and get a result
50 | # that is easy to deal with
51 | return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8)
52 |
53 |
54 | def reduce_state(Ms, b):
55 | m = 1 << b
56 |
57 | # We concatenated all of the states, now we need to get the max
58 | # value for each j in both
59 | Ms = Ms.reshape((len(Ms) // m), m)
60 | return Ms.max(axis=0)
61 |
62 |
63 | def estimate_count(Ms, b):
64 | m = 1 << b
65 |
66 | # Combine one last time
67 | M = reduce_state(Ms, b)
68 |
69 | # Estimate cardinality, no adjustments
70 | alpha = 0.7213 / (1 + 1.079 / m)
71 | E = alpha * m / (2.0 ** -(M.astype("f8"))).sum() * m
72 | # ^^^^ starts as unsigned, need a signed type for
73 | # negation operator to do something useful
74 |
75 | # Apply adjustments for small / big cardinalities, if applicable
76 | if E < 2.5 * m:
77 | V = (M == 0).sum()
78 | if V:
79 | return m * np.log(m / V)
80 | if E > 2**32 / 30.0:
81 | return -(2**32) * np.log1p(-E / 2**32)
82 | return E
83 |
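
A small sketch exercising the functions above directly (the user-facing entry point is ``Series.nunique_approx``):

.. code-block:: python

    import pandas as pd

    from dask.dataframe import hyperloglog

    s = pd.Series(range(100_000))
    b = 14                                    # 2**14 registers
    M = hyperloglog.compute_hll_array(s, b)   # one partition's register array
    est = hyperloglog.estimate_count(M, b)    # combine registers, estimate cardinality
    print(est)                                # roughly 100_000, within a few percent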
--------------------------------------------------------------------------------
/docs/source/graph_manipulation.rst:
--------------------------------------------------------------------------------
1 | .. _graph_manipulation:
2 |
3 | Advanced graph manipulation
4 | ===========================
5 | There are some situations where computations with Dask collections will result in
6 | suboptimal memory usage (e.g. an entire Dask DataFrame is loaded into memory).
7 | This may happen when Dask’s scheduler doesn’t automatically delay the computation of
8 | nodes in a task graph to avoid occupying memory with their output for prolonged periods
9 | of time, or in scenarios where recalculating nodes is much cheaper than holding their
10 | output in memory.
11 |
12 | This page highlights a set of graph manipulation utilities which can be used to help
13 | avoid these scenarios. In particular, the utilities described below rewrite the
14 | underlying Dask graph for Dask collections, producing equivalent collections with
15 | different sets of keys.
16 |
17 | Consider the following example:
18 |
19 | .. code-block:: python
20 |
21 | >>> import dask.array as da
22 | >>> x = da.random.default_rng().normal(size=500_000_000, chunks=100_000)
23 | >>> x_mean = x.mean()
24 | >>> y = (x - x_mean).max().compute()
25 |
26 | The above example computes the largest value of a distribution after removing its bias.
27 | This involves loading the chunks of ``x`` into memory in order to compute ``x_mean``.
28 | However, since the ``x`` array is needed later in the computation to compute ``y``, the
29 | entire ``x`` array is kept in memory. For large Dask Arrays this can be very
30 | problematic.
31 |
32 | To alleviate the need for the entire ``x`` array to be kept in memory, one could rewrite
33 | the last line as follows:
34 |
35 | .. code-block:: python
36 |
37 | >>> from dask.graph_manipulation import bind
38 | >>> xb = bind(x, x_mean)
39 | >>> y = (xb - x_mean).max().compute()
40 |
41 | Here we use :func:`~dask.graph_manipulation.bind` to create a new Dask Array, ``xb``,
42 | which produces exactly the same output as ``x``, but whose underlying Dask graph has
43 | different keys than ``x``, and will only be computed after ``x_mean`` has been
44 | calculated.
45 |
46 | This results in the chunks of ``x`` being computed and immediately individually reduced
47 | by ``mean``; then recomputed and again immediately pipelined into the subtraction
48 | followed by reduction with ``max``. This results in a much smaller peak memory usage as
49 | the full ``x`` array is no longer loaded into memory. However, the tradeoff is that the
50 | compute time increases as ``x`` is computed twice.
51 |
52 |
53 | API
54 | ---
55 |
56 | .. currentmodule:: dask.graph_manipulation
57 |
58 | .. autosummary::
59 |
60 | checkpoint
61 | wait_on
62 | bind
63 | clone
64 |
65 |
66 | Definitions
67 | ~~~~~~~~~~~
68 |
69 | .. autofunction:: checkpoint
70 | .. autofunction:: wait_on
71 | .. autofunction:: bind
72 | .. autofunction:: clone
73 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | import dask
6 |
7 | # The doctests in these files fail due to either:
8 | # - Non-required dependencies not being installed
9 | # - Imported doctests due to pulling the docstrings from other packages
10 | # (e.g. `numpy`). No need to run these doctests.
11 | collect_ignore = [
12 | "dask/bytes/hdfs3.py",
13 | "dask/bytes/pyarrow.py",
14 | "dask/bytes/s3.py",
15 | "dask/array/ghost.py",
16 | "dask/array/fft.py",
17 | "dask/dataframe/io/io.py",
18 | "dask/dataframe/io/parquet/arrow.py",
19 | "dask/dot.py",
20 | "dask/ml.py",
21 | ]
22 |
23 | collect_ignore_glob = []
24 | try:
25 | import numpy # noqa: F401
26 | except ImportError:
27 | collect_ignore_glob.append("dask/array/*")
28 |
29 | try:
30 | import pandas # noqa: F401
31 | except ImportError:
32 | collect_ignore_glob.append("dask/dataframe/*")
33 |
34 | try:
35 | import scipy # noqa: F401
36 | except ImportError:
37 | collect_ignore.append("dask/array/stats.py")
38 |
39 | try:
40 | import pyarrow # noqa: F401
41 | except ImportError:
42 | collect_ignore.append("dask/dataframe/io/orc/arrow.py")
43 |
44 | try:
45 | import tiledb # noqa: F401
46 | except ImportError:
47 | collect_ignore.append("dask/array/tiledb_io.py")
48 |
49 | try:
50 | import sqlalchemy # noqa: F401
51 | except ImportError:
52 | collect_ignore.append("dask/dataframe/io/sql.py")
53 |
54 |
55 | def pytest_addoption(parser):
56 | parser.addoption("--runslow", action="store_true", help="run slow tests")
57 |
58 |
59 | def pytest_runtest_setup(item):
60 | if "slow" in item.keywords and not item.config.getoption("--runslow"):
61 | pytest.skip("need --runslow option to run")
62 |
63 |
64 | try:
65 | from dask.dataframe.utils import pyarrow_strings_enabled
66 |
67 | convert_string = pyarrow_strings_enabled()
68 | except (ImportError, RuntimeError):
69 | convert_string = False
70 |
71 | skip_with_pyarrow_strings = pytest.mark.skipif(
72 | convert_string,
73 | reason="No need to run with pyarrow strings",
74 | )
75 |
76 | xfail_with_pyarrow_strings = pytest.mark.xfail(
77 | convert_string,
78 | reason="Known failure with pyarrow strings",
79 | )
80 |
81 |
82 | def pytest_collection_modifyitems(config, items):
83 | for item in items:
84 | if "skip_with_pyarrow_strings" in item.keywords:
85 | item.add_marker(skip_with_pyarrow_strings)
86 | if "xfail_with_pyarrow_strings" in item.keywords:
87 | item.add_marker(xfail_with_pyarrow_strings)
88 |
89 |
90 | pytest.register_assert_rewrite(
91 | "dask.array.utils", "dask.dataframe.utils", "dask.bag.utils"
92 | )
93 |
94 |
95 | @pytest.fixture(params=["disk", "tasks"])
96 | def shuffle_method(request):
97 | with dask.config.set({"dataframe.shuffle.method": request.param}):
98 | yield request.param
99 |
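
A minimal sketch of how the ``--runslow`` plumbing above is consumed by a test (a hypothetical test, not part of the suite):

.. code-block:: python

    import pytest

    @pytest.mark.slow
    def test_expensive_path():
        # Skipped by pytest_runtest_setup unless ``pytest --runslow`` is passed
        assert sum(range(1_000_000)) == 499_999_500_000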
--------------------------------------------------------------------------------
/docs/source/images/optimize_dask5.svg:
--------------------------------------------------------------------------------
[SVG markup not preserved in this dump]
--------------------------------------------------------------------------------
/dask/hashing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import binascii
4 | import hashlib
5 |
6 | hashers = [] # In decreasing performance order
7 |
8 |
9 | # Timings on a largish array:
10 | # - CityHash is 2x faster than MurmurHash
11 | # - xxHash is slightly slower than CityHash
12 | # - MurmurHash is 8x faster than SHA1
13 | # - SHA1 is significantly faster than all other hashlib algorithms
14 |
15 | try:
16 | import cityhash # `python -m pip install cityhash`
17 | except ImportError:
18 | pass
19 | else:
20 | # CityHash disabled unless the reference leak in
21 | # https://github.com/escherba/python-cityhash/pull/16
22 | # is fixed.
23 | if cityhash.__version__ >= "0.2.2":
24 |
25 | def _hash_cityhash(buf):
26 | """
27 |             Produce a 16-byte hash of *buf* using CityHash.
28 | """
29 | h = cityhash.CityHash128(buf)
30 | return h.to_bytes(16, "little")
31 |
32 | hashers.append(_hash_cityhash)
33 |
34 | try:
35 | import xxhash # `python -m pip install xxhash`
36 | except ImportError:
37 | pass
38 | else:
39 |
40 | def _hash_xxhash(buf):
41 | """
42 |         Produce an 8-byte hash of *buf* using xxHash.
43 | """
44 | return xxhash.xxh64(buf).digest()
45 |
46 | hashers.append(_hash_xxhash)
47 |
48 | try:
49 | import mmh3 # `python -m pip install mmh3`
50 | except ImportError:
51 | pass
52 | else:
53 |
54 | def _hash_murmurhash(buf):
55 | """
56 |         Produce a 16-byte hash of *buf* using MurmurHash.
57 | """
58 | return mmh3.hash_bytes(buf)
59 |
60 | hashers.append(_hash_murmurhash)
61 |
62 |
63 | def _hash_sha1(buf):
64 | """
65 |     Produce a 20-byte hash of *buf* using SHA1.
66 | """
67 | return hashlib.sha1(buf).digest()
68 |
69 |
70 | hashers.append(_hash_sha1)
71 |
72 |
73 | def hash_buffer(buf, hasher=None):
74 | """
75 | Hash a bytes-like (buffer-compatible) object. This function returns
76 | a good quality hash but is not cryptographically secure. The fastest
77 | available algorithm is selected. A fixed-length bytes object is returned.
78 | """
79 | if hasher is not None:
80 | try:
81 | return hasher(buf)
82 | except (TypeError, OverflowError):
83 | # Some hash libraries may have overly-strict type checking,
84 | # not accepting all buffers
85 | pass
86 | for hasher in hashers:
87 | try:
88 | return hasher(buf)
89 | except (TypeError, OverflowError):
90 | pass
91 | raise TypeError(f"unsupported type for hashing: {type(buf)}")
92 |
93 |
94 | def hash_buffer_hex(buf, hasher=None):
95 | """
96 | Same as hash_buffer, but returns its result in hex-encoded form.
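
    Illustrative example (forcing the SHA1 fallback makes the output deterministic;
    with the default ``hasher=None`` the fastest available hasher is used and the
    digest will differ)::

        >>> hash_buffer_hex(b"abc", hasher=_hash_sha1)  # doctest: +SKIP
        'a9993e364706816aba3e25717850c26c9cd0d89d'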
97 | """
98 | h = hash_buffer(buf, hasher)
99 | s = binascii.b2a_hex(h)
100 | return s.decode()
101 |
--------------------------------------------------------------------------------
/docs/source/images/dask_horizontal.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_hashing.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 | from pandas.util import hash_pandas_object
7 |
8 | import dask.dataframe as dd
9 | from dask.dataframe import _compat
10 | from dask.dataframe._compat import tm
11 | from dask.dataframe.utils import assert_eq
12 |
13 |
14 | @pytest.mark.parametrize(
15 | "obj",
16 | [
17 | pd.Series([1, 2, 3]),
18 | pd.Series([1.0, 1.5, 3.2]),
19 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
20 | pd.Series(["a", "b", "c"]),
21 | pd.Series([True, False, True]),
22 | pd.Index([1, 2, 3]),
23 | pd.Index([True, False, True]),
24 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
25 | _compat.makeMissingDataframe(),
26 | _compat.makeMixedDataFrame(),
27 | _compat.makeTimeDataFrame(),
28 | _compat.makeTimeSeries(),
29 | _compat.makeTimedeltaIndex(),
30 | ],
31 | )
32 | def test_hash_pandas_object(obj):
33 | a = hash_pandas_object(obj)
34 | b = hash_pandas_object(obj)
35 | if isinstance(a, np.ndarray):
36 | np.testing.assert_equal(a, b)
37 | else:
38 | assert_eq(a, b)
39 |
40 |
41 | def test_categorical_consistency():
42 | # Check that categoricals hash consistent with their values, not codes
43 | # This should work for categoricals of any dtype
44 | for s1 in [
45 | pd.Series(["a", "b", "c", "d"]),
46 | pd.Series([1000, 2000, 3000, 4000]),
47 | pd.Series(pd.date_range(0, periods=4)),
48 | ]:
49 | s2 = s1.astype("category").cat.set_categories(s1)
50 | s3 = s2.cat.set_categories(list(reversed(s1)))
51 | for categorize in [True, False]:
52 | # These should all hash identically
53 | h1 = hash_pandas_object(s1, categorize=categorize)
54 | h2 = hash_pandas_object(s2, categorize=categorize)
55 | h3 = hash_pandas_object(s3, categorize=categorize)
56 | tm.assert_series_equal(h1, h2)
57 | tm.assert_series_equal(h1, h3)
58 |
59 |
60 | def test_object_missing_values():
61 | # Check that the presence of missing values doesn't change how object dtype
62 | # is hashed.
63 | s = pd.Series(["a", "b", "c", None])
64 | h1 = hash_pandas_object(s).iloc[:3]
65 | h2 = hash_pandas_object(s.iloc[:3])
66 | tm.assert_series_equal(h1, h2)
67 |
68 |
69 | @pytest.mark.parametrize(
70 | "obj",
71 | [
72 | pd.Index([1, 2, 3]),
73 | pd.Index([True, False, True]),
74 | pd.Series([1, 2, 3]),
75 | pd.Series([1.0, 1.5, 3.2]),
76 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
77 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
78 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}, index=["a", "z", "x"]),
79 | ],
80 | )
81 | def test_hash_object_dispatch(obj):
82 | result = dd.dispatch.hash_object_dispatch(obj)
83 | expected = pd.util.hash_pandas_object(obj)
84 | assert_eq(result, expected)
85 |
--------------------------------------------------------------------------------
/dask/array/tests/test_wrap.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 |
5 | pytest.importorskip("numpy")
6 |
7 | import numpy as np
8 |
9 | import dask.array as da
10 | from dask.array.utils import assert_eq
11 | from dask.array.wrap import ones
12 |
13 |
14 | def test_ones():
15 | a = ones((10, 10), dtype="i4", chunks=(4, 4))
16 | x = np.array(a)
17 | assert (x == np.ones((10, 10), "i4")).all()
18 |
19 | assert a.name.startswith("ones_like-")
20 |
21 |
22 | def test_size_as_list():
23 | a = ones([10, 10], dtype="i4", chunks=(4, 4))
24 | x = np.array(a)
25 | assert (x == np.ones((10, 10), dtype="i4")).all()
26 |
27 |
28 | def test_singleton_size():
29 | a = ones(10, dtype="i4", chunks=(4,))
30 | x = np.array(a)
31 | assert (x == np.ones(10, dtype="i4")).all()
32 |
33 |
34 | def test_kwargs():
35 | a = ones(10, dtype="i4", chunks=(4,))
36 | x = np.array(a)
37 | assert (x == np.ones(10, dtype="i4")).all()
38 |
39 |
40 | def test_full():
41 | a = da.full((3, 3), 100, chunks=(2, 2), dtype="i8")
42 |
43 | assert (a.compute() == 100).all()
44 | assert a.dtype == a.compute(scheduler="sync").dtype == "i8"
45 |
46 | assert a.name.startswith("full_like-")
47 |
48 |
49 | def test_full_error_nonscalar_fill_value():
50 | with pytest.raises(ValueError, match="fill_value must be scalar"):
51 | da.full((3, 3), [100, 100], chunks=(2, 2), dtype="i8")
52 |
53 |
54 | def test_full_detects_da_dtype():
55 | x = da.from_array(100)
56 | with pytest.warns(FutureWarning, match="not implemented by Dask array") as record:
57 |         # This should not raise a NotImplementedError due to the dtype being detected as object.
58 | a = da.full(shape=(3, 3), fill_value=x)
59 | assert a.dtype == x.dtype
60 | assert_eq(a, np.full(shape=(3, 3), fill_value=100))
61 | assert len(record) == 1
62 |
63 |
64 | def test_full_none_dtype():
65 | a = da.full(shape=(3, 3), fill_value=100, dtype=None)
66 | assert_eq(a, np.full(shape=(3, 3), fill_value=100, dtype=None))
67 |
68 |
69 | def test_full_like_error_nonscalar_fill_value():
70 | x = np.full((3, 3), 1, dtype="i8")
71 | with pytest.raises(ValueError, match="fill_value must be scalar"):
72 | da.full_like(x, [100, 100], chunks=(2, 2), dtype="i8")
73 |
74 |
75 | def test_can_make_really_big_array_of_ones():
76 | ones((1000000, 1000000), chunks=(100000, 100000))
77 | ones(shape=(1000000, 1000000), chunks=(100000, 100000))
78 |
79 |
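# Creation functions should tokenize deterministically: identical arguments yield
# identical graph keys, while a differing dtype yields different keys.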
80 | def test_wrap_consistent_names():
81 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) == sorted(
82 | ones(10, dtype="i4", chunks=(4,)).dask
83 | )
84 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) != sorted(
85 | ones(10, chunks=(4,)).dask
86 | )
87 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask) == sorted(
88 | da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask
89 | )
90 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="i2").dask) != sorted(
91 | da.full((3, 3), 100, chunks=(2, 2)).dask
92 | )
93 |
--------------------------------------------------------------------------------
/dask/array/tests/test_svg.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import xml.etree.ElementTree
4 |
5 | import pytest
6 |
7 | import dask.array as da
8 | from dask.array.svg import draw_sizes
9 |
10 |
11 | def parses(text):
12 |     cleaned = text.replace("→", "")  # XML doesn't like the rightarrow character
13 | assert xml.etree.ElementTree.fromstring(cleaned) is not None # parses cleanly
14 |
15 |
16 | def test_basic():
17 | parses(da.ones(10).to_svg())
18 | parses(da.ones((10, 10)).to_svg())
19 | parses(da.ones((10, 10, 10)).to_svg())
20 | parses(da.ones((10, 10, 10, 10)).to_svg())
21 | parses(da.ones((10, 10, 10, 10, 10)).to_svg())
22 | parses(da.ones((10, 10, 10, 10, 10, 10)).to_svg())
23 | parses(da.ones((10, 10, 10, 10, 10, 10, 10)).to_svg())
24 |
25 |
26 | def test_repr_html():
27 | pytest.importorskip("jinja2")
28 | assert da.ones([])._repr_html_()
29 | assert da.ones(10)[:0]._repr_html_()
30 | assert da.ones(10)._repr_html_()
31 | assert da.ones((10, 10))._repr_html_()
32 | assert da.ones((10, 10, 10))._repr_html_()
33 | assert da.ones((10, 10, 10, 10))._repr_html_()
34 |
35 |
36 | def test_errors():
37 | # empty arrays
38 | with pytest.raises(NotImplementedError) as excpt:
39 | da.ones([]).to_svg()
40 | assert "0 dimensions" in str(excpt.value)
41 |
42 | # Scalars
43 | with pytest.raises(NotImplementedError) as excpt:
44 | da.asarray(1).to_svg()
45 | assert "0 dimensions" in str(excpt.value)
46 |
47 | # 0-length dims arrays
48 | with pytest.raises(NotImplementedError) as excpt:
49 | da.ones(10)[:0].to_svg()
50 | assert "0-length dimensions" in str(excpt.value)
51 |
52 | # unknown chunk sizes
53 | with pytest.raises(NotImplementedError) as excpt:
54 | x = da.ones(10)
55 | x = x[x > 5]
56 | x.to_svg()
57 | assert "unknown chunk sizes" in str(excpt.value)
58 |
59 |
60 | def test_repr_html_size_units():
61 | pytest.importorskip("jinja2")
62 | x = da.ones((10000, 5000))
63 | x = da.ones((3000, 10000), chunks=(1000, 1000))
64 | text = x._repr_html_()
65 |
66 | assert "MB" in text or "MiB" in text
67 | assert str(x.shape) in text
68 | assert str(x.dtype) in text
69 |
70 | parses(text)
71 |
72 | x = da.ones((3000, 10000, 50), chunks=(1000, 1000, 10))
73 | parses(x._repr_html_())
74 |
75 |
76 | def test_draw_sizes():
77 | assert draw_sizes((10, 10), size=100) == (100, 100) # respect symmetry
78 | assert draw_sizes((10, 10), size=200) == (200, 200) # respect size keyword
79 | assert draw_sizes((10, 5), size=100) == (100, 50) # respect small ratios
80 |
81 | a, b, c = draw_sizes((1000, 100, 10))
82 | assert a > b
83 | assert b > c
84 | assert a < b * 5
85 | assert b < c * 5
86 |
87 |
88 | def test_too_many_lines_fills_sides_darker():
89 | data = da.ones((16000, 2400, 3600), chunks=(1, 2400, 3600))
90 | text = data.to_svg()
91 | assert "8B4903" in text
92 | assert text.count("\n") < 300
93 |
94 |
95 | def test_3d():
96 | text = da.ones((10, 10, 10, 10, 10)).to_svg()
97 |     assert text.count("
--------------------------------------------------------------------------------
/docs/source/deploying-docker.rst:
--------------------------------------------------------------------------------
18 | and so it is suitable for use both normally as a Jupyter server, and also as
19 | part of a JupyterHub deployment. It also includes a matching Dask software
20 | environment described above. This image is about 2GB in size.
21 |
22 | Example
23 | -------
24 |
25 | Here is a simple example on a dedicated virtual network
26 |
27 | .. code-block:: bash
28 |
29 | docker network create dask
30 |
31 | docker run --network dask -p 8787:8787 --name scheduler ghcr.io/dask/dask dask-scheduler # start scheduler
32 |
33 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker
34 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker
35 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker
36 |
37 | docker run --network dask -p 8888:8888 ghcr.io/dask/dask-notebook # start Jupyter server
38 |
39 | Then from within the notebook environment you can connect to the Dask cluster like this:
40 |
41 | .. code-block:: python
42 |
43 | from dask.distributed import Client
44 | client = Client("scheduler:8786")
45 | client
46 |
47 | Extensibility
48 | -------------
49 |
50 | Users can mildly customize the software environment by populating the
51 | environment variables ``EXTRA_APT_PACKAGES``, ``EXTRA_CONDA_PACKAGES``, and
52 | ``EXTRA_PIP_PACKAGES``. If these environment variables are set in the container,
53 | they will trigger calls to the following respectively::
54 |
55 | apt-get install $EXTRA_APT_PACKAGES
56 | conda install $EXTRA_CONDA_PACKAGES
57 | python -m pip install $EXTRA_PIP_PACKAGES
58 |
59 | For example, the following uses ``conda`` to install the ``joblib`` package into
60 | the Dask worker software environment:
61 |
62 | .. code-block:: bash
63 |
64 | docker run --network dask -e EXTRA_CONDA_PACKAGES="joblib" ghcr.io/dask/dask dask-worker scheduler:8786
65 |
66 | Note that using these can significantly delay the container from starting,
67 | especially when using ``apt`` or ``conda`` (``pip`` is relatively fast).
68 |
69 | Remember that it is important for software versions to match between Dask
70 | workers and Dask clients. As a result, it is often useful to include the same
71 | extra packages in both Jupyter and Worker images.
72 |
73 | Source
74 | ------
75 |
76 | Docker files are maintained at https://github.com/dask/dask-docker.
77 | This repository also includes a docker-compose configuration.
78 |
--------------------------------------------------------------------------------
/dask/array/tests/test_cupy_percentile.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pytest
5 |
6 | pytestmark = pytest.mark.gpu
7 |
8 | import dask.array as da
9 | from dask.array.utils import assert_eq, same_keys
10 |
11 | cupy = pytest.importorskip("cupy")
12 |
13 |
14 | def test_percentile():
15 | d = da.from_array(cupy.ones((16,)), chunks=(4,))
16 | qs = np.array([0, 50, 100])
17 |
18 | result = da.percentile(d, qs, method="midpoint")
19 | assert_eq(result, np.array([1, 1, 1], dtype=d.dtype), check_type=False)
20 |
21 | x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20])
22 | d = da.from_array(x, chunks=(3,))
23 |
24 | result = da.percentile(d, qs, method="midpoint")
25 | assert_eq(result, np.array([0, 5, 20], dtype=result.dtype), check_type=False)
26 |
27 | assert not same_keys(
28 | da.percentile(d, qs, "midpoint"),
29 | da.percentile(d, [0, 50], "midpoint"),
30 | )
31 |
32 |
33 | @pytest.mark.xfail(
34 | reason="Non-deterministic tokenize(cupy.array(...)), "
35 | "see https://github.com/dask/dask/issues/6718"
36 | )
37 | def test_percentile_tokenize():
38 | d = da.from_array(cupy.ones((16,)), chunks=(4,))
39 | qs = np.array([0, 50, 100])
40 | assert same_keys(da.percentile(d, qs), da.percentile(d, qs))
41 |
42 |
43 | def test_percentiles_with_empty_arrays():
44 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),))
45 | result = da.percentile(x, [10, 50, 90], method="midpoint")
46 | assert type(result._meta) == cupy.ndarray
47 | assert_eq(result, result) # Check that _meta and computed arrays match types
48 | assert_eq(result, np.array([1, 1, 1], dtype=x.dtype), check_type=False)
49 |
50 |
51 | def test_percentiles_with_empty_q():
52 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),))
53 | result = da.percentile(x, [], method="midpoint")
54 | assert type(result._meta) == cupy.ndarray
55 | assert_eq(result, result) # Check that _meta and computed arrays match types
56 | assert_eq(result, np.array([], dtype=x.dtype), check_type=False)
57 |
58 |
59 | @pytest.mark.parametrize("q", [5, 5.0, np.int64(5), np.float64(5)])
60 | def test_percentiles_with_scaler_percentile(q):
61 | # Regression test to ensure da.percentile works with scalar percentiles
62 | # See #3020
63 | d = da.from_array(cupy.ones((16,)), chunks=(4,))
64 | result = da.percentile(d, q, method="midpoint")
65 | assert type(result._meta) == cupy.ndarray
66 | assert_eq(result, result) # Check that _meta and computed arrays match types
67 | assert_eq(result, np.array([1], dtype=d.dtype), check_type=False)
68 |
69 |
70 | def test_percentiles_with_unknown_chunk_sizes():
71 | rng = da.random.default_rng(cupy.random.default_rng())
72 | x = rng.random(1000, chunks=(100,))
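    # Overwrite the chunk metadata with NaN to simulate unknown chunk sizes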
73 | x._chunks = ((np.nan,) * 10,)
74 |
75 | result = da.percentile(x, 50, method="midpoint").compute()
76 | assert type(result) == cupy.ndarray
77 | assert 0.1 < result < 0.9
78 |
79 | a, b = da.percentile(x, [40, 60], method="midpoint").compute()
80 | assert type(a) == cupy.ndarray
81 | assert type(b) == cupy.ndarray
82 | assert 0.1 < a < 0.9
83 | assert 0.1 < b < 0.9
84 | assert a < b
85 |
--------------------------------------------------------------------------------
/docs/source/_static/main-page.css:
--------------------------------------------------------------------------------
1 | /* GLOBAL STYLES
2 | -------------------------------------------------- */
3 | /* Padding below the footer and lighter body text */
4 |
5 | body {
6 | padding-bottom: 3rem;
7 | color: #5a5a5a;
8 | }
9 |
10 | /* navbar
11 | * ----------------------------------------*/
12 |
13 | .navbar {
14 | background-color: #000000;
15 | }
16 | .navbar li {
17 | transition: .3s background-color;
18 | text-align: center;
19 | background-color: transparent;
20 | padding: 0rem 1rem;
21 | text-decoration: none;
22 | border-radius: 0.3rem;
23 | }
24 | .navbar li:hover {
25 | background-color: #FDA061;
26 | }
27 | .navbar li .nav-link{
28 | color: #FDA061;
29 | }
30 | .navbar li:hover .nav-link{
31 | color: #212529;
32 | }
33 |
34 | .dropdown-menu {
35 | background-color: #000000d0;
36 | }
37 |
38 | .dropdown-item {
39 | color: #FDA061;
40 | }
41 |
42 | .dropdown-item:hover {
43 | background-color: #FDA061D0;
44 | }
45 |
46 | .hero {
47 | background-color: rgba(0,0,0,0.92);
48 |     color: white;
49 | }
50 |
51 |
52 | .top-image {
53 | height: 10rem;
54 | max-width: 20rem;
55 | }
56 |
57 |
58 | .outline-dask {
59 | color: #FDA061;
60 | background-color: transparent;
61 | border-color: #FDA061;
62 | }
63 |
64 |
65 | .outline-dask:hover {
66 | color: #212529;
67 | background-color: #FDA061;
68 | border-color: #FDA061;
69 | }
70 |
71 | .solid-dask {
72 | color: #212529;
73 | background-color: #FDA061;
74 | }
75 |
76 | .solid-dask:hover {
77 | color: #212529;
78 | background-color: #EC9050;
79 | }
80 |
81 |
82 | /* MARKETING CONTENT
83 | -------------------------------------------------- */
84 |
85 | /* Center align the text within the three columns below the carousel */
86 | .marketing .col-lg-4 {
87 | margin-bottom: 1.5rem;
88 | text-align: center;
89 | }
90 | .marketing .col-lg-4 p {
91 | margin-right: .75rem;
92 | margin-left: .75rem;
93 | }
94 |
95 |
96 | /* Featurettes
97 | ------------------------- */
98 |
99 | .featurette-divider {
100 |     margin: 3rem 0; /* Space out the Bootstrap <hr> more */
101 | }
102 |
103 | /* Thin out the marketing headings */
104 | .featurette-heading {
105 | font-weight: 300;
106 | line-height: 1;
107 | letter-spacing: -.05rem;
108 | }
109 |
110 | .featurette-subheading {
111 | text-transform: uppercase;
112 | font-size: 1.2rem;
113 | display: block;
114 | font-weight: 600;
115 | margin: 1.2rem 0;
116 | }
117 |
118 | /* Supporters
119 | * ----------------------------*/
120 |
121 | .supporters {
122 | text-align: center;
123 | }
124 |
125 | .supporter {
126 | margin: 0.5rem 0;
127 | width: 100%;
128 | }
129 |
130 | .supporter img{
131 | max-height: 100%;
132 | max-width: 85%;
133 | position: relative;
134 | top: 50%;
135 | transform: translateY(-50%);
136 |
137 | }
138 |
139 |
140 | /* RESPONSIVE CSS
141 | -------------------------------------------------- */
142 |
143 | @media (min-width: 40em) {
144 | .featurette-heading {
145 | font-size: 50px;
146 | }
147 | }
148 |
149 | @media (min-width: 62em) {
150 | .featurette-heading {
151 | margin-top: 3rem;
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
/docs/source/array-gufunc.rst:
--------------------------------------------------------------------------------
1 | Generalized Ufuncs
2 | ==================
3 |
4 | `NumPy `_ provides the concept of `generalized ufuncs `_. Generalized ufuncs are functions
5 | that classify the dimensions of the passed arrays into two classes: loop dimensions
6 | and core dimensions. To accomplish this, a `signature `_ is specified for NumPy generalized ufuncs.
7 |
8 | `Dask `_ integrates with NumPy's generalized ufuncs
9 | by adhering to the corresponding `ufunc protocol `_, and provides a wrapper to turn a Python function into a generalized ufunc.
10 |
11 |
12 | Usage
13 | -----
14 |
15 | NumPy Generalized UFuncs
16 | ~~~~~~~~~~~~~~~~~~~~~~~~
17 | .. note::
18 |
19 |    `NumPy `_ generalized ufuncs are currently (v1.14.3 and below) stored
20 |    inside ``np.linalg._umath_linalg`` and might change in the future.
21 |
22 |
23 | .. code-block:: python
24 |
25 | import dask.array as da
26 | import numpy as np
27 |
28 | x = da.random.default_rng().normal(size=(3, 10, 10), chunks=(2, 10, 10))
29 |
30 | w, v = np.linalg._umath_linalg.eig(x, output_dtypes=(float, float))
31 |
32 |
33 | Create Generalized UFuncs
34 | ~~~~~~~~~~~~~~~~~~~~~~~~~
35 |
36 | It can be difficult to create your own GUFuncs without going into the CPython API.
37 | However, the `Numba `_ project does provide a
38 | nice implementation with their ``numba.guvectorize`` decorator. See `Numba's
39 | documentation `_ for more information.
42 |
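A minimal sketch of such a decorated function might look like the following
(``pairwise_add`` and its signature are illustrative, not part of Dask or Numba):

.. code-block:: python

    import numpy as np
    from numba import guvectorize

    @guvectorize(["void(float64[:], float64[:], float64[:])"], "(n),(n)->(n)")
    def pairwise_add(a, b, out):
        # Loop over the core dimension; loop dimensions are broadcast automatically
        for i in range(a.shape[0]):
            out[i] = a[i] + b[i]

    x = np.arange(6, dtype="float64").reshape(2, 3)
    y = np.ones((2, 3))
    pairwise_add(x, y)  # applied over the leading loop dimension
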
43 | Wrap your own Python function
44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
45 | ``gufunc`` can be used to make a Python function behave like a generalized ufunc:
46 |
47 |
48 | .. code-block:: python
49 |
50 | x = da.random.default_rng().normal(size=(10, 5), chunks=(2, 5))
51 |
52 | def foo(x):
53 | return np.mean(x, axis=-1)
54 |
55 | gufoo = da.gufunc(foo, signature="(i)->()", output_dtypes=float, vectorize=True)
56 |
57 | y = gufoo(x)
58 |
59 |
60 | Instead of ``gufunc``, the ``as_gufunc`` decorator can also be used for convenience:
61 |
62 |
63 | .. code-block:: python
64 |
65 | x = da.random.normal(size=(10, 5), chunks=(2, 5))
66 |
67 | @da.as_gufunc(signature="(i)->()", output_dtypes=float, vectorize=True)
68 | def gufoo(x):
69 | return np.mean(x, axis=-1)
70 |
71 | y = gufoo(x)
72 |
73 |
74 | Disclaimer
75 | ----------
76 | This experimental generalized ufunc integration is not complete:
77 |
78 | * ``gufunc`` does not create a true generalized ufunc to be used with other input arrays besides Dask.
79 | I.e., at the moment, ``gufunc`` casts all input arguments to ``dask.array.Array``
80 |
81 | * Inferring ``output_dtypes`` automatically is not implemented yet
82 |
83 |
84 | API
85 | ---
86 |
87 | .. currentmodule:: dask.array.gufunc
88 |
89 | .. autosummary::
90 | apply_gufunc
91 | as_gufunc
92 | gufunc
93 |
--------------------------------------------------------------------------------
/dask/dataframe/tests/test_hyperloglog.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pytest
6 |
7 | import dask.dataframe as dd
8 |
9 | rs = np.random.RandomState(96)
10 |
11 |
12 | @pytest.mark.parametrize(
13 | "df",
14 | [
15 | pd.DataFrame(
16 | {
17 | "x": [1, 2, 3] * 3,
18 | "y": [1.2, 3.4, 5.6] * 3,
19 | "z": -(np.arange(9, dtype=np.int8)),
20 | }
21 | ),
22 | pd.DataFrame(
23 | {
24 | "x": rs.randint(0, 1000000, (10000,)),
25 | "y": rs.randn(10000),
26 | "z": rs.uniform(0, 9999999, (10000,)),
27 | }
28 | ),
29 | pd.DataFrame(
30 | {
31 | "x": np.repeat(rs.randint(0, 1000000, (1000,)), 3),
32 | "y": np.repeat(rs.randn(1000), 3),
33 | "z": np.repeat(rs.uniform(0, 9999999, (1000,)), 3),
34 | }
35 | ),
36 | pd.DataFrame({"x": rs.randint(0, 1000000, (10000,))}),
37 | pd.DataFrame(
38 | {
39 | "x": rs.randint(0, 1000000, (7,)),
40 | "y": ["a", "bet", "is", "a", "tax", "on", "bs"],
41 | }
42 | ),
43 | pd.DataFrame(
44 | {
45 | "w": np.zeros((20000,)),
46 | "x": np.zeros((20000,)),
47 | "y": np.zeros((20000,)) + 4803592,
48 | "z": np.zeros((20000,)),
49 | }
50 | ),
51 | pd.DataFrame({"x": [1, 2, 3] * 1000}),
52 | pd.DataFrame({"x": np.random.random(1000)}),
53 | pd.DataFrame(
54 | {
55 | "a": [1, 2, 3] * 3,
56 | "b": [1.2, 3.4, 5.6] * 3,
57 | "c": [1 + 2j, 3 + 4j, 5 + 6j] * 3,
58 | "d": -(np.arange(9, dtype=np.int8)),
59 | }
60 | ),
61 | pd.Series([1, 2, 3] * 1000),
62 | pd.Series(np.random.random(1000)),
63 | pd.Series(np.random.random(1000), index=np.ones(1000)),
64 | pd.Series(np.random.random(1000), index=np.random.random(1000)),
65 | ],
66 | )
67 | @pytest.mark.parametrize("npartitions", [2, 20])
68 | def test_basic(df, npartitions):
69 | ddf = dd.from_pandas(df, npartitions=npartitions)
70 |
71 | approx = ddf.nunique_approx().compute(scheduler="sync")
72 | exact = len(df.drop_duplicates())
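    # nunique_approx is HyperLogLog-based, so allow a small absolute error (2) or ~5% relative error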
73 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05
74 |
75 |
76 | @pytest.mark.parametrize("split_every", [None, 2, 10])
77 | @pytest.mark.parametrize("npartitions", [2, 20])
78 | def test_split_every(split_every, npartitions):
79 | df = pd.Series([1, 2, 3] * 1000)
80 | ddf = dd.from_pandas(df, npartitions=npartitions)
81 |
82 | approx = ddf.nunique_approx(split_every=split_every).compute(scheduler="sync")
83 | exact = len(df.drop_duplicates())
84 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05
85 |
86 |
87 | def test_larger_data():
88 | df = dd.demo.make_timeseries(
89 | "2000-01-01",
90 | "2000-04-01",
91 | {"value": float, "id": int},
92 | freq="10s",
93 | partition_freq="1D",
94 | seed=1,
95 | )
96 | assert df.nunique_approx().compute() > 1000
97 |
--------------------------------------------------------------------------------
/docs/source/deploying-cloud.rst:
--------------------------------------------------------------------------------
1 | Cloud
2 | =====
3 |
4 | There are a variety of ways to deploy Dask on cloud providers.
5 | Cloud providers provide managed services,
6 | like VMs, Kubernetes, Yarn, or custom APIs with which Dask can connect easily.
7 | You may want to consider the following options:
8 |
9 | 1. A managed Kubernetes service and Dask's
10 | :doc:`Kubernetes integration `.
11 | 2. A managed Yarn service,
12 | like `Amazon EMR `_
13 | or `Google Cloud DataProc `_
14 | and `Dask-Yarn `_.
15 |
16 |    Specific documentation for the popular Amazon EMR service can be found
17 |    `here `_.
18 | 3. Directly launching cloud resources such as VMs or containers via a cluster manager with
19 | `Dask Cloud Provider `_.
20 | 4. A commercial Dask deployment option like `Coiled `_ to handle the creation and management of Dask clusters on a cloud computing environment (AWS and GCP).
21 |
22 | Cloud Deployment Example
23 | ------------------------
24 |
25 | Using `Dask Cloud Provider `_ to launch a cluster of
26 | VMs on a platform like `DigitalOcean `_ can be as convenient as
27 | launching a local cluster.
28 |
29 | .. code-block:: python
30 |
31 | >>> import dask.config
32 |
33 | >>> dask.config.set({"cloudprovider.digitalocean.token": "yourAPItoken"})
34 |
35 | >>> from dask_cloudprovider.digitalocean import DropletCluster
36 |
37 | >>> cluster = DropletCluster(n_workers=1)
38 | Creating scheduler instance
39 | Created droplet dask-38b817c1-scheduler
40 | Waiting for scheduler to run
41 | Scheduler is running
42 | Creating worker instance
43 | Created droplet dask-38b817c1-worker-dc95260d
44 |
45 | Many of the cluster managers in Dask Cloud Provider work by launching VMs with a startup script
46 | that pulls down the :doc:`Dask Docker image ` and runs Dask components within that container.
47 | As with all cluster managers, the VM resources, Docker image, etc. are all configurable.
48 |
49 | You can then connect a client and work with the cluster as if it were on your local machine.
50 |
51 | .. code-block:: python
52 |
53 | >>> from dask.distributed import Client
54 |
55 | >>> client = Client(cluster)
56 |
57 | Data Access
58 | -----------
59 |
60 | You may want to install additional libraries in your Jupyter and worker images
61 | to access the object stores of each cloud (see :doc:`how-to/connect-to-remote-data`):
62 |
63 | - `s3fs `_ for Amazon's S3
64 | - `gcsfs `_ for Google's GCS
65 | - `adlfs `_ for Microsoft's ADL
66 |
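For example, with ``s3fs`` installed you can read remote data directly; the bucket
name below is a hypothetical placeholder:

.. code-block:: python

    import dask.dataframe as dd

    df = dd.read_parquet(
        "s3://my-bucket/data/",           # hypothetical bucket and prefix
        storage_options={"anon": False},  # credentials are picked up from the environment
    )
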
67 | Historical Libraries
68 | --------------------
69 |
70 | Dask previously maintained libraries for deploying Dask on
71 | Amazon's EC2 and Google GKE.
72 | Due to sporadic interest,
73 | and churn both within the Dask library and EC2 itself,
74 | these were not well maintained.
75 | They have since been deprecated in favor of the
76 | :doc:`Kubernetes ` solutions.
77 |
--------------------------------------------------------------------------------
/docs/source/deploying-python.rst:
--------------------------------------------------------------------------------
1 | Python API
2 | ==========
3 |
4 | You can create a ``dask.distributed`` scheduler by importing and creating a
5 | ``Client`` with no arguments. This overrides whatever default was previously
6 | set.
7 |
8 | .. code-block:: python
9 |
10 | from dask.distributed import Client
11 | client = Client()
12 |
13 | You can navigate to ``http://localhost:8787/status`` to see the diagnostic
14 | dashboard if you have Bokeh installed.
15 |
16 | Client
17 | ------
18 |
19 | You can trivially set up a local cluster on your machine by instantiating a Dask
20 | Client with no arguments
21 |
22 | .. code-block:: python
23 |
24 | from dask.distributed import Client
25 | client = Client()
26 |
27 | This sets up a scheduler in your local process along with a number of workers and
28 | threads per worker related to the number of cores in your machine.
29 |
30 | If you want to run workers in the same process, you can pass the
31 | ``processes=False`` keyword argument.
32 |
33 | .. code-block:: python
34 |
35 | client = Client(processes=False)
36 |
37 | This is sometimes preferable if you want to avoid inter-worker communication
38 | and your computations release the GIL. This is common when primarily using
39 | NumPy or Dask Array.
40 |
41 |
42 | LocalCluster
43 | ------------
44 |
45 | The ``Client()`` call described above is shorthand for creating a LocalCluster
46 | and then passing that to your client.
47 |
48 | .. code-block:: python
49 |
50 | from dask.distributed import Client, LocalCluster
51 | cluster = LocalCluster()
52 | client = Client(cluster)
53 |
54 | This is equivalent, but somewhat more explicit.
55 |
56 | You may want to look at the
57 | keyword arguments available on ``LocalCluster`` to understand the options for
58 | controlling the mixture of threads and processes, like specifying explicit
59 | ports, and so on.
60 |
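For instance, here is a sketch using a few common keyword arguments (the values shown
are illustrative, not recommendations):

.. code-block:: python

    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(
        n_workers=4,                # number of worker processes
        threads_per_worker=2,       # threads within each worker
        dashboard_address=":8787",  # where to serve the diagnostic dashboard
    )
    client = Client(cluster)
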
61 | To create a local cluster with all workers running in dedicated subprocesses,
62 | ``dask.distributed`` also offers the experimental ``SubprocessCluster``.
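
A minimal sketch (assuming the experimental class is importable from the top-level
``distributed`` namespace in your version):

.. code-block:: python

    from dask.distributed import Client
    from distributed import SubprocessCluster  # import location is an assumption

    cluster = SubprocessCluster()
    client = Client(cluster)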
63 |
64 | Cluster manager features
65 | ------------------------
66 |
67 | Instantiating a cluster manager class like ``LocalCluster`` and then passing it to the
68 | ``Client`` is a common pattern. Cluster managers also provide useful utilities to help
69 | you understand what is going on.
70 |
71 | For example you can retrieve the Dashboard URL.
72 |
73 | .. code-block:: python
74 |
75 | >>> cluster.dashboard_link
76 | 'http://127.0.0.1:8787/status'
77 |
78 | You can retrieve logs from cluster components.
79 |
80 | .. code-block:: python
81 |
82 | >>> cluster.get_logs()
83 | {'Cluster': '',
84 | 'Scheduler': "distributed.scheduler - INFO - Clear task state\ndistributed.scheduler - INFO - S...
85 |
86 | If you are using a cluster manager that supports scaling you can modify the number of workers manually
87 | or automatically based on workload.
88 |
89 | .. code-block:: python
90 |
91 | >>> cluster.scale(10) # Sets the number of workers to 10
92 |
93 |     >>> cluster.adapt(minimum=1, maximum=10)  # Automatically scale between 1 and 10 workers based on load
94 |
95 | Reference
96 | ---------
97 |
98 | .. currentmodule:: distributed.deploy.local
99 |
100 | .. autoclass:: LocalCluster
101 | :members:
102 |
--------------------------------------------------------------------------------
/docs/source/array-assignment.rst:
--------------------------------------------------------------------------------
1 | .. _array.assignment:
2 |
3 | Assignment
4 | ==========
5 |
6 | Dask Array supports most of the NumPy assignment indexing syntax. In
7 | particular, it supports combinations of the following:
8 |
9 | * Indexing by integers: ``x[1] = y``
10 | * Indexing by slices: ``x[2::-1] = y``
11 | * Indexing by a list of integers: ``x[[0, -1, 1]] = y``
12 | * Indexing by a 1-d :class:`numpy` array of integers: ``x[np.arange(3)] = y``
13 | * Indexing by a 1-d :class:`~dask.array.Array` of integers: ``x[da.arange(3)] = y``, ``x[da.from_array([0, -1, 1])] = y``, ``x[da.where(np.array([1, 2, 3]) < 3)[0]] = y``
14 | * Indexing by a list of booleans: ``x[[False, True, True]] = y``
15 | * Indexing by a 1-d :class:`numpy` array of booleans: ``x[np.arange(3) > 0] = y``
16 |
17 | It also supports:
18 |
19 | * Indexing by one broadcastable :class:`~dask.array.Array` of
20 | booleans: ``x[x > 0] = y``.
21 |
22 | However, it does not currently support the following:
23 |
24 | * Indexing with lists in multiple axes: ``x[[1, 2, 3], [3, 1, 2]] = y``
25 |
26 |
27 | .. _array.assignment.broadcasting:
28 |
29 | Broadcasting
30 | ------------
31 |
32 | The normal NumPy broadcasting rules apply:
33 |
34 | .. code-block:: python
35 |
36 | >>> x = da.zeros((2, 6))
37 | >>> x[0] = 1
38 | >>> x[..., 1] = 2.0
39 | >>> x[:, 2] = [3, 4]
40 | >>> x[:, 5:2:-2] = [[6, 5]]
41 | >>> x.compute()
42 | array([[1., 2., 3., 5., 1., 6.],
43 | [0., 2., 4., 5., 0., 6.]])
44 | >>> x[1] = -x[0]
45 | >>> x.compute()
46 | array([[ 1., 2., 3., 5., 1., 6.],
47 | [-1., -2., -3., -5., -1., -6.]])
48 |
49 | .. _array.assignment.masking:
50 |
51 | Masking
52 | -------
53 |
54 | Elements may be masked by assigning to the NumPy masked value, or to an
55 | array with masked values:
56 |
57 | .. code-block:: python
58 |
59 | >>> x = da.ones((2, 6))
60 | >>> x[0, [1, -2]] = np.ma.masked
61 | >>> x[1] = np.ma.array([0, 1, 2, 3, 4, 5], mask=[0, 1, 1, 0, 0, 0])
62 | >>> print(x.compute())
63 | [[1.0 -- 1.0 1.0 -- 1.0]
64 | [0.0 -- -- 3.0 4.0 5.0]]
69 | >>> x[:, 0] = x[:, 1]
70 | >>> print(x.compute())
71 | [[-- -- 1.0 1.0 -- 1.0]
72 | [-- -- -- 3.0 4.0 5.0]]
73 |
74 | If, and only if, a single broadcastable :class:`~dask.array.Array` of
75 | booleans is provided then masked array assignment does not yet work as
76 | expected. In this case the data underlying the mask are assigned:
77 |
78 | .. code-block:: python
79 |
80 | >>> x = da.arange(12).reshape(2, 6)
81 | >>> x[x > 7] = np.ma.array(-99, mask=True)
82 | >>> print(x.compute())
83 | [[ 0 1 2 3 4 5]
84 | [ 6 7 -99 -99 -99 -99]]
85 |
86 | Note that masked assignments do work when a boolean
87 | :class:`~dask.array.Array` index is used in a tuple, or implicit tuple,
88 | of indices:
89 |
90 | .. code-block:: python
91 |
92 | >>> x = da.arange(12).reshape(2, 6)
93 | >>> x[1, x[0] > 3] = np.ma.masked
94 | >>> print(x.compute())
95 | [[0 1 2 3 4 5]
96 | [6 7 8 9 -- --]]
97 | >>> x = da.arange(12).reshape(2, 6)
98 | >>> print(x.compute())
99 | [[ 0 1 2 3 4 5]
100 | [ 6 7 8 9 10 11]]
101 | >>> x[(x[:, 2] < 4,)] = np.ma.masked
102 | >>> print(x.compute())
103 | [[-- -- -- -- -- --]
104 | [6 7 8 9 10 11]]
105 |
--------------------------------------------------------------------------------