├── dask ├── py.typed ├── tests │ ├── __init__.py │ ├── warning_aliases.py │ ├── test_compatibility.py │ ├── test_ml.py │ ├── test_backends.py │ ├── test_docs.py │ ├── test_hashing.py │ ├── test_datasets.py │ ├── test_ci.py │ ├── test_context.py │ ├── test_system.py │ ├── test_utils_test.py │ ├── test_cache.py │ └── test_callbacks.py ├── array │ ├── tests │ │ ├── __init__.py │ │ ├── test_testing.py │ │ ├── test_cupy_gufunc.py │ │ ├── test_numpy_compat.py │ │ ├── test_xarray.py │ │ ├── test_image.py │ │ ├── test_cupy_reductions.py │ │ ├── test_wrap.py │ │ ├── test_svg.py │ │ └── test_cupy_percentile.py │ ├── lib │ │ ├── __init__.py │ │ └── stride_tricks.py │ ├── dispatch.py │ ├── NUMPY_LICENSE.txt │ ├── image.py │ └── cupy_entry_point.py ├── bag │ ├── tests │ │ └── __init__.py │ ├── utils.py │ ├── chunk.py │ └── __init__.py ├── bytes │ ├── tests │ │ ├── __init__.py │ │ └── test_compression.py │ ├── __init__.py │ └── utils.py ├── dataframe │ ├── tests │ │ ├── __init__.py │ │ ├── test_methods.py │ │ ├── test_boolean.py │ │ ├── test_optimize_dataframe.py │ │ ├── test_extensions.py │ │ ├── test_numeric.py │ │ ├── test_hashing.py │ │ └── test_hyperloglog.py │ ├── io │ │ ├── tests │ │ │ └── __init__.py │ │ ├── orc │ │ │ ├── __init__.py │ │ │ └── utils.py │ │ ├── parquet │ │ │ └── __init__.py │ │ └── __init__.py │ ├── tseries │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── extensions.py │ ├── numeric.py │ ├── __init__.py │ ├── _pyarrow_compat.py │ ├── _dtypes.py │ └── hyperloglog.py ├── diagnostics │ ├── tests │ │ └── __init__.py │ └── __init__.py ├── widgets │ ├── tests │ │ ├── templates │ │ │ ├── example.html.j2 │ │ │ ├── bytes.html.j2 │ │ │ └── custom_filter.html.j2 │ │ └── test_widgets.py │ ├── templates │ │ ├── dataframe.html.j2 │ │ ├── array.html.j2 │ │ ├── highlevelgraph_layer.html.j2 │ │ └── highlevelgraph.html.j2 │ ├── __init__.py │ └── widgets.py ├── __main__.py ├── ml.py ├── __init__.py ├── compatibility.py ├── _compatibility.py ├── distributed.py ├── system.py ├── context.py ├── cache.py ├── dask.yaml └── hashing.py ├── docs ├── source │ ├── daskcheatsheet.pdf │ ├── images │ │ ├── reshape.png │ │ ├── gputester-msg.png │ │ ├── merge_chunks.png │ │ ├── order-failure.png │ │ ├── order-success.png │ │ ├── scaling-edges.png │ │ ├── scaling-nodes.png │ │ ├── simple-dask.png │ │ ├── dashboard_link.png │ │ ├── reshape_problem.png │ │ ├── HHMI_Janelia_Color.png │ │ ├── async-embarrassing.gif │ │ ├── dashboard_memory.png │ │ ├── dashboard_progress.png │ │ ├── dashboard_status.png │ │ ├── merge_chunks_false.png │ │ ├── reshape_rechunked.png │ │ ├── 10_minutes_bag_graph.png │ │ ├── dashboard_jupyterlab.png │ │ ├── dashboard_memory_new.gif │ │ ├── growth_of_languages.png │ │ ├── growth_of_libraries.png │ │ ├── map_blocks_drop_axis.png │ │ ├── 10_minutes_array_graph.png │ │ ├── transpose-hlg-html-repr.png │ │ ├── dashboard_task_processing.png │ │ ├── 10_minutes_dataframe_graph.png │ │ ├── concurrent-futures-threaded.webp │ │ ├── dashboard_taskstream_healthy.png │ │ ├── transpose-hlg-hovertooltip.png │ │ ├── dashboard_task_stream_unhealthy.png │ │ ├── dask_icon_black.svg │ │ ├── dask_icon.svg │ │ ├── dask_icon_on_pink.svg │ │ ├── dask_icon_white.svg │ │ ├── unoverlapping-neighbors.svg │ │ ├── optimize_dask5.svg │ │ └── dask_horizontal.svg │ ├── _static │ │ ├── dask-simple.png │ │ ├── theme_overrides.css │ │ ├── style.css │ │ └── main-page.css │ ├── _templates │ │ └── layout.html │ ├── cheatsheet.rst │ ├── internals.rst │ ├── debugging-performance.rst │ ├── how-to │ │ ├── index.rst │ │ 
├── setup-prometheus.rst │ │ └── extend-sizeof.rst │ ├── logos.rst │ ├── dashboard-progress-script.py │ ├── array-stats.rst │ ├── delayed-collections.rst │ ├── deploying-ssh.rst │ ├── delayed-api.rst │ ├── understanding-performance.rst │ ├── bag-api.rst │ ├── array-stack.rst │ ├── graph_manipulation.rst │ ├── deploying-docker.rst │ ├── array-gufunc.rst │ ├── deploying-cloud.rst │ ├── deploying-python.rst │ └── array-assignment.rst ├── requirements-docs.txt └── README.rst ├── .github ├── PULL_REQUEST_TEMPLATE.md ├── CONTRIBUTING.md ├── dependabot.yml ├── workflows │ ├── label-prs.yml │ ├── label-all.yml │ ├── pre-commit.yml │ ├── stale-bot.yaml │ ├── additional.yml │ ├── conda.yml │ ├── upstream.yml │ └── update-gpuci.yml ├── labeler.yml └── release.yml ├── continuous_integration ├── gpuci │ ├── axis.yaml │ └── build.sh ├── scripts │ ├── run_tests.sh │ ├── test_imports.sh │ └── install.sh ├── environment-mindeps-non-optional.yaml ├── environment-mindeps-array.yaml ├── environment-mindeps-dataframe.yaml ├── environment-mindeps-distributed.yaml ├── recipe │ └── meta.yaml ├── environment-mindeps-optional.yaml ├── environment-3.9.yaml ├── environment-3.10.yaml └── environment-3.11.yaml ├── setup.py ├── CONTRIBUTING.md ├── .readthedocs.yaml ├── MANIFEST.in ├── .gitignore ├── .git-blame-ignore-revs ├── codecov.yml ├── .flake8 ├── README.rst ├── LICENSE.txt ├── .pre-commit-config.yaml └── conftest.py /dask/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/array/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/bag/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/bytes/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/io/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tseries/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/diagnostics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/dataframe/tseries/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/example.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | Hello {{ foo }}! 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/bytes.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | {{ foo | format_bytes }} 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/widgets/tests/templates/custom_filter.html.j2: -------------------------------------------------------------------------------- 1 |
<div>
2 | {{ foo | custom_filter }} 3 | </div>
4 | -------------------------------------------------------------------------------- /dask/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.bytes.core import read_bytes 4 | -------------------------------------------------------------------------------- /docs/source/daskcheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/daskcheatsheet.pdf -------------------------------------------------------------------------------- /docs/source/images/reshape.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape.png -------------------------------------------------------------------------------- /dask/array/lib/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.array.lib import stride_tricks 4 | -------------------------------------------------------------------------------- /docs/source/_static/dask-simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/_static/dask-simple.png -------------------------------------------------------------------------------- /docs/source/images/gputester-msg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/gputester-msg.png -------------------------------------------------------------------------------- /docs/source/images/merge_chunks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks.png -------------------------------------------------------------------------------- /docs/source/images/order-failure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-failure.png -------------------------------------------------------------------------------- /docs/source/images/order-success.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/order-success.png -------------------------------------------------------------------------------- /docs/source/images/scaling-edges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-edges.png -------------------------------------------------------------------------------- /docs/source/images/scaling-nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/scaling-nodes.png -------------------------------------------------------------------------------- /docs/source/images/simple-dask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/simple-dask.png -------------------------------------------------------------------------------- 
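The dask/array/lib/__init__.py module listed above only re-exports stride_tricks, whose sliding_window_view mirrors numpy.lib.stride_tricks.sliding_window_view but returns a lazy dask array. A minimal usage sketch; the input values and window size are arbitrary illustrative choices, and da.arange/np.testing come from the wider NumPy/Dask APIs rather than from this listing:

    import numpy as np
    import dask.array as da
    from dask.array.lib.stride_tricks import sliding_window_view

    x = da.arange(10, chunks=4)             # lazy 1-D array
    windows = sliding_window_view(x, 3)     # lazy array of shape (8, 3)

    expected = np.lib.stride_tricks.sliding_window_view(np.arange(10), 3)
    np.testing.assert_array_equal(windows.compute(), expected)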
/docs/source/images/dashboard_link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_link.png -------------------------------------------------------------------------------- /docs/source/images/reshape_problem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_problem.png -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | - [ ] Closes #xxxx 2 | - [ ] Tests added / passed 3 | - [ ] Passes `pre-commit run --all-files` 4 | -------------------------------------------------------------------------------- /docs/source/images/HHMI_Janelia_Color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/HHMI_Janelia_Color.png -------------------------------------------------------------------------------- /docs/source/images/async-embarrassing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/async-embarrassing.gif -------------------------------------------------------------------------------- /docs/source/images/dashboard_memory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_progress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_progress.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_status.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_status.png -------------------------------------------------------------------------------- /docs/source/images/merge_chunks_false.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/merge_chunks_false.png -------------------------------------------------------------------------------- /docs/source/images/reshape_rechunked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/reshape_rechunked.png -------------------------------------------------------------------------------- /dask/dataframe/io/orc/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io.orc.core import read_orc, to_orc 4 | -------------------------------------------------------------------------------- /docs/source/images/10_minutes_bag_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_bag_graph.png 
-------------------------------------------------------------------------------- /docs/source/images/dashboard_jupyterlab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_jupyterlab.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_memory_new.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_memory_new.gif -------------------------------------------------------------------------------- /docs/source/images/growth_of_languages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_languages.png -------------------------------------------------------------------------------- /docs/source/images/growth_of_libraries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/growth_of_libraries.png -------------------------------------------------------------------------------- /docs/source/images/map_blocks_drop_axis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/map_blocks_drop_axis.png -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | See [developer documentation](https://docs.dask.org/en/latest/develop.html) 2 | for tips on how to get started. 
3 | -------------------------------------------------------------------------------- /docs/source/images/10_minutes_array_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_array_graph.png -------------------------------------------------------------------------------- /docs/source/images/transpose-hlg-html-repr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-html-repr.png -------------------------------------------------------------------------------- /dask/array/lib/stride_tricks.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.array.overlap import sliding_window_view # noqa: F401 4 | -------------------------------------------------------------------------------- /docs/source/images/dashboard_task_processing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_processing.png -------------------------------------------------------------------------------- /docs/source/images/10_minutes_dataframe_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/10_minutes_dataframe_graph.png -------------------------------------------------------------------------------- /docs/source/images/concurrent-futures-threaded.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/concurrent-futures-threaded.webp -------------------------------------------------------------------------------- /docs/source/images/dashboard_taskstream_healthy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_taskstream_healthy.png -------------------------------------------------------------------------------- /docs/source/images/transpose-hlg-hovertooltip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/transpose-hlg-hovertooltip.png -------------------------------------------------------------------------------- /docs/source/images/dashboard_task_stream_unhealthy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/dask/main/docs/source/images/dashboard_task_stream_unhealthy.png -------------------------------------------------------------------------------- /dask/widgets/templates/dataframe.html.j2: -------------------------------------------------------------------------------- 1 |
Dask DataFrame Structure:
2 | {{ data }} 3 |
Dask Name: {{ name | key_split }}, {{ layers }}
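This dataframe.html.j2 template relies on the key_split and format_bytes Jinja filters that dask.widgets registers when building its template environment (see dask/widgets/widgets.py, not included in this excerpt). As a rough standalone sketch, the same two filters can be wired into a plain Jinja2 environment with the public dask.utils helpers; the template string and example values below are illustrative only:

    import jinja2
    from dask.utils import format_bytes, key_split

    env = jinja2.Environment()
    env.filters["format_bytes"] = format_bytes
    env.filters["key_split"] = key_split

    template = env.from_string(
        "Dask Name: {{ name | key_split }}, {{ nbytes | format_bytes }}"
    )
    print(template.render(name="sum-aggregate-1234", nbytes=2**20))
    # roughly: "Dask Name: sum-aggregate, 1.00 MiB" (exact formatting may vary by version)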
4 | -------------------------------------------------------------------------------- /dask/__main__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.cli import run_cli 4 | 5 | 6 | def main(): 7 | run_cli() 8 | 9 | 10 | if __name__ == "__main__": 11 | main() 12 | -------------------------------------------------------------------------------- /continuous_integration/gpuci/axis.yaml: -------------------------------------------------------------------------------- 1 | PYTHON_VER: 2 | - "3.9" 3 | - "3.10" 4 | 5 | CUDA_VER: 6 | - "11.5" 7 | 8 | LINUX_VER: 9 | - ubuntu18.04 10 | 11 | RAPIDS_VER: 12 | - "23.10" 13 | 14 | excludes: 15 | -------------------------------------------------------------------------------- /docs/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% set css_files = css_files + ["_static/style.css"] %} 3 | {% set script_files = script_files + ["_static/yaml.min.js", "_static/config_converter.js"] %} 4 | -------------------------------------------------------------------------------- /dask/dataframe/io/parquet/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io.parquet.core import ( 4 | create_metadata_file, 5 | read_parquet, 6 | read_parquet_part, 7 | to_parquet, 8 | ) 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import annotations 4 | 5 | import versioneer 6 | from setuptools import setup 7 | 8 | setup( 9 | version=versioneer.get_version(), 10 | cmdclass=versioneer.get_cmdclass(), 11 | ) 12 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Dask is a community maintained project. We welcome contributions in the form of bug reports, documentation, code, design proposals, and more. 2 | 3 | For general information on how to contribute see https://docs.dask.org/en/latest/develop.html. 4 | -------------------------------------------------------------------------------- /docs/source/cheatsheet.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Dask Cheat Sheet 4 | ================ 5 | 6 | The 300KB pdf :download:`Dask cheat sheet ` 7 | is a single page summary about using Dask. 8 | It is commonly distributed at conferences and trade shows. 
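dask/dataframe/io/parquet/__init__.py, listed a little earlier, re-exports read_parquet and to_parquet, which form the usual round-trip API. A minimal sketch, assuming a Parquet engine such as pyarrow or fastparquet is installed; the output directory name is a hypothetical placeholder:

    import pandas as pd
    import dask.dataframe as dd

    pdf = pd.DataFrame({"x": range(8), "y": list("abcdefgh")})
    ddf = dd.from_pandas(pdf, npartitions=2)

    ddf.to_parquet("example_parquet/")           # hypothetical local path
    roundtrip = dd.read_parquet("example_parquet/")
    assert roundtrip.x.sum().compute() == pdf.x.sum()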
9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Set update schedule for GitHub Actions 2 | 3 | version: 2 4 | updates: 5 | - package-ecosystem: "github-actions" 6 | directory: "/" 7 | schedule: 8 | # Check for updates to GitHub Actions every weekday 9 | interval: "weekly" 10 | -------------------------------------------------------------------------------- /dask/tests/warning_aliases.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from sqlalchemy.exc import RemovedIn20Warning 5 | except ImportError: 6 | 7 | class _RemovedIn20Warning(Warning): 8 | pass 9 | 10 | RemovedIn20Warning = _RemovedIn20Warning 11 | -------------------------------------------------------------------------------- /.github/workflows/label-prs.yml: -------------------------------------------------------------------------------- 1 | name: "PR Labeler" 2 | on: 3 | - pull_request_target 4 | 5 | jobs: 6 | label: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/labeler@main 10 | with: 11 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 12 | sync-labels: false 13 | -------------------------------------------------------------------------------- /dask/bag/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def assert_eq(a, b, scheduler="sync"): 5 | if hasattr(a, "compute"): 6 | a = a.compute(scheduler=scheduler) 7 | if hasattr(b, "compute"): 8 | b = b.compute(scheduler=scheduler) 9 | 10 | assert a == b 11 | -------------------------------------------------------------------------------- /dask/diagnostics/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.callbacks import Callback 4 | from dask.diagnostics.profile import CacheProfiler, Profiler, ResourceProfiler 5 | from dask.diagnostics.profile_visualize import visualize 6 | from dask.diagnostics.progress import ProgressBar 7 | -------------------------------------------------------------------------------- /.github/workflows/label-all.yml: -------------------------------------------------------------------------------- 1 | name: "Issue and PR Labeler" 2 | on: 3 | pull_request: 4 | types: [opened] 5 | issues: 6 | types: [opened, reopened] 7 | jobs: 8 | label-all-on-open: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: andymckay/labeler@1.0.4 12 | with: 13 | add-labels: "needs triage" 14 | ignore-if-labeled: false 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | version: 2 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | fail_on_warning: true 11 | 12 | python: 13 | install: 14 | - requirements: docs/requirements-docs.txt 15 | - method: pip 16 | path: . 
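The dask/diagnostics/__init__.py entry above exposes the local-scheduler diagnostics (ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize). A small sketch of the two that need no optional dependencies; the array shape and chunking are arbitrary example values, and the exact fields of each profiler record may differ between versions:

    import dask.array as da
    from dask.diagnostics import ProgressBar, Profiler

    x = da.random.random((2000, 2000), chunks=(500, 500))

    with ProgressBar(), Profiler() as prof:
        x.dot(x.T).mean().compute()

    # prof.results holds one timing record per executed task
    print(prof.results[:3])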
17 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include dask *.py 2 | recursive-include dask *.j2 3 | recursive-include docs/source * 4 | include docs/Makefile docs/make.bat 5 | 6 | include setup.py 7 | include README.rst 8 | include MANIFEST.in 9 | include dask/dask.yaml 10 | include dask/dask-schema.yaml 11 | include dask/py.typed 12 | 13 | include versioneer.py 14 | include dask/_version.py 15 | 16 | include conftest.py 17 | -------------------------------------------------------------------------------- /continuous_integration/scripts/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | if [[ $PARALLEL == 'true' ]]; then 6 | export XTRATESTARGS="-n4 $XTRATESTARGS" 7 | fi 8 | 9 | if [[ $COVERAGE == 'true' ]]; then 10 | export XTRATESTARGS="--cov=dask --cov-report=xml $XTRATESTARGS" 11 | fi 12 | 13 | echo "py.test dask --runslow $XTRATESTARGS" 14 | py.test dask --runslow $XTRATESTARGS 15 | 16 | set +e 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .hypothesis 2 | *.py[cod] 3 | __pycache__/ 4 | *.egg-info 5 | .mypy_cache 6 | dask-worker-space/ 7 | docs/build 8 | docs/source/generated 9 | build/ 10 | dist/ 11 | .idea/ 12 | log.* 13 | log 14 | .pytest_cache/ 15 | .coverage 16 | .coverage.* 17 | coverage.xml 18 | .DS_Store 19 | *.sqlite 20 | *.swp 21 | *.swo 22 | .cache/ 23 | hdfs-initialized-indicator 24 | .ipynb_checkpoints 25 | .vscode/ 26 | .history 27 | -------------------------------------------------------------------------------- /.github/workflows/pre-commit.yml: -------------------------------------------------------------------------------- 1 | name: Linting 2 | 3 | on: 4 | push: 5 | branches: main 6 | pull_request: 7 | branches: main 8 | 9 | jobs: 10 | checks: 11 | name: pre-commit hooks 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3.5.3 15 | - uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.9' 18 | - uses: pre-commit/action@v3.0.0 19 | -------------------------------------------------------------------------------- /docs/source/internals.rst: -------------------------------------------------------------------------------- 1 | Dask Internals 2 | ============== 3 | 4 | This section is intended for contributors and power users who are interested in 5 | learning more about how Dask works internally. 6 | 7 | .. 
toctree:: 8 | :maxdepth: 1 9 | 10 | user-interfaces.rst 11 | understanding-performance.rst 12 | phases-of-computation.rst 13 | order.rst 14 | caching.rst 15 | shared.rst 16 | scheduling-policy.rst 17 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | numpydoc 2 | sphinx>=4.0.0 3 | dask-sphinx-theme>=3.0.0 4 | sphinx-click 5 | sphinx-copybutton 6 | sphinx-remove-toctrees 7 | sphinx_autosummary_accessors 8 | sphinx-tabs 9 | sphinx-design 10 | jupyter_sphinx 11 | toolz 12 | cloudpickle>=1.5.0 13 | pandas>=1.4.0 14 | git+https://github.com/dask/distributed 15 | fsspec 16 | scipy 17 | pytest 18 | pytest-check-links 19 | requests-cache 20 | ipython 21 | ipykernel<6.22.0 22 | -------------------------------------------------------------------------------- /docs/source/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | @media screen and (min-width: 767px) { 3 | 4 | .wy-table-responsive table td { 5 | /* !important prevents the common CSS stylesheets from overriding 6 | this as on RTD they are loaded after this stylesheet */ 7 | white-space: normal !important; 8 | } 9 | 10 | .wy-table-responsive { 11 | overflow: visible !important; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /docs/source/debugging-performance.rst: -------------------------------------------------------------------------------- 1 | Debugging and Performance 2 | ========================== 3 | 4 | This section contains resources to help you debug and understand performance. 5 | 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | how-to/debug.rst 11 | Visualize task graphs 12 | Dashboard 13 | diagnostics-local.rst 14 | diagnostics-distributed.rst 15 | Phases of computation 16 | -------------------------------------------------------------------------------- /.github/labeler.yml: -------------------------------------------------------------------------------- 1 | dataframe: 2 | - dask/dataframe/* 3 | - dask/dataframe/**/* 4 | 5 | array: 6 | - dask/array/* 7 | - dask/array/**/* 8 | 9 | io: 10 | - dask/dataframe/io/* 11 | - dask/dataframe/io/**/* 12 | 13 | documentation: 14 | - docs/* 15 | - docs/**/* 16 | 17 | dispatch: 18 | - dask/array/backends.py 19 | - dask/array/dispatch.py 20 | - dask/dataframe/backends.py 21 | - dask/dataframe/dispatch.py 22 | - dask/dataframe/extensions.py 23 | -------------------------------------------------------------------------------- /dask/tests/test_compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from dask._compatibility import entry_points 6 | 7 | 8 | def test_deprecation(): 9 | with pytest.warns(DeprecationWarning): 10 | from dask.compatibility import _EMSCRIPTEN # noqa 11 | 12 | 13 | def test_entry_points(): 14 | with pytest.warns(DeprecationWarning): 15 | assert "pytest" in [ep.name for ep in entry_points(group="console_scripts")] 16 | -------------------------------------------------------------------------------- /dask/tests/test_ml.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def test_basic(): 5 | try: 6 | import dask_ml # noqa: F401 7 | except ImportError: 8 | try: 9 | from 
dask.ml.model_selection import GridSearchCV # noqa: F401 10 | except ImportError as e: 11 | assert "conda install dask-ml" in str(e) 12 | else: 13 | assert False 14 | else: 15 | from dask.ml.model_selection import GridSearchCV # noqa: F401 16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-non-optional.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # test dependencies 16 | - pre-commit 17 | - pytest 18 | - pytest-cov 19 | - pytest-rerunfailures 20 | - pytest-xdist 21 | -------------------------------------------------------------------------------- /docs/source/how-to/index.rst: -------------------------------------------------------------------------------- 1 | How To... 2 | ========= 3 | 4 | This section contains snippets and suggestions about how to perform different actions 5 | using Dask. If you have an idea of a how-to that we should add, please 6 | `make a suggestion `_! 7 | 8 | .. Articles in this section should be short and not contain much explanation. 9 | 10 | .. toctree:: 11 | :caption: How To... 12 | :maxdepth: 1 13 | :glob: 14 | 15 | * 16 | Use GPUs <../gpu.rst> 17 | -------------------------------------------------------------------------------- /dask/ml.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def __getattr__(value): 5 | try: 6 | import dask_ml 7 | except ImportError as e: 8 | msg = ( 9 | "Dask-ML is not installed.\n\n" 10 | "Please either conda or pip install dask-ml:\n\n" 11 | " conda install dask-ml # either conda install\n" 12 | " python -m pip install dask-ml --upgrade # or pip install" 13 | ) 14 | raise ImportError(msg) from e 15 | return getattr(dask_ml, value) 16 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_methods.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import dask.dataframe.methods as methods 7 | from dask.dataframe._compat import PANDAS_GE_140 8 | 9 | 10 | def test_assign_not_modifying_array_inplace(): 11 | df = pd.DataFrame({"a": [1, 2, 3], "b": 1.5}) 12 | result = methods.assign(df, "a", 5) 13 | assert not np.shares_memory(df["a"].values, result["a"].values) 14 | if PANDAS_GE_140: 15 | assert np.shares_memory(df["b"].values, result["b"].values) 16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-array.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[array] 16 | - numpy=1.21 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - 
pytest-xdist 23 | -------------------------------------------------------------------------------- /dask/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask import config, datasets 4 | from dask._version import get_versions 5 | from dask.base import ( 6 | annotate, 7 | compute, 8 | get_annotations, 9 | is_dask_collection, 10 | optimize, 11 | persist, 12 | visualize, 13 | ) 14 | from dask.core import istask 15 | from dask.delayed import delayed 16 | from dask.local import get_sync as get 17 | 18 | versions = get_versions() 19 | __version__ = versions["version"] 20 | __git_revision__ = versions["full-revisionid"] 21 | del get_versions, versions 22 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | # .github/release.yml 2 | 3 | changelog: 4 | categories: 5 | - title: New Features 6 | labels: 7 | - feature 8 | - title: Enhancements 9 | labels: 10 | - enhancement 11 | - title: Bug Fixes 12 | labels: 13 | - bug 14 | - title: Deprecations 15 | labels: 16 | - deprecation 17 | - title: Documentation 18 | labels: 19 | - documentation 20 | - title: Maintenance 21 | labels: 22 | - tests 23 | - hygiene 24 | - title: Misc 25 | labels: 26 | - "*" 27 | -------------------------------------------------------------------------------- /dask/compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | from dask._compatibility import EMSCRIPTEN as _EMSCRIPTEN # noqa 6 | from dask._compatibility import PY_VERSION as _PY_VERSION # noqa 7 | from dask._compatibility import entry_points, parse_version # noqa 8 | 9 | warnings.warn( 10 | "`dask.compatibility` is not intended for external use and has been renamed to `dask._compatibility`. " 11 | "This backward-compatible shim will be removed in a future release. Please find an alternative.", 12 | DeprecationWarning, 13 | stacklevel=2, 14 | ) 15 | -------------------------------------------------------------------------------- /docs/README.rst: -------------------------------------------------------------------------------- 1 | To build a local copy of the Dask documentation, install the packages in 2 | ``requirements-docs.txt`` and run ``make html``. 3 | 4 | Optionally create and activate a ``conda`` environment first:: 5 | 6 | conda create -n daskdocs -c conda-forge python=3.11 7 | conda activate daskdocs 8 | 9 | Install the dependencies with ``pip``:: 10 | 11 | python -m pip install -r requirements-docs.txt 12 | 13 | After running ``make html`` the generated HTML documentation can be found in 14 | the ``build/html`` directory. Open ``build/html/index.html`` to view the home 15 | page for the documentation. 
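The top-level dask/__init__.py shown earlier re-exports the core public API (delayed, compute, persist, visualize, annotate, ...). A quick sketch of the delayed/compute pair; the toy functions here are purely illustrative:

    import dask
    from dask import delayed

    @delayed
    def inc(x):
        return x + 1

    total = delayed(sum)([inc(i) for i in range(5)])

    # compute() returns a tuple with one element per collection passed in
    assert dask.compute(total) == (15,)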
16 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-dataframe.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[dataframe] 16 | - numpy=1.21 17 | - pandas=1.3 18 | # test dependencies 19 | - pre-commit 20 | - pytest 21 | - pytest-cov 22 | - pytest-rerunfailures 23 | - pytest-xdist 24 | -------------------------------------------------------------------------------- /dask/dataframe/io/orc/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | class ORCEngine: 5 | """The API necessary to provide a new ORC reader/writer""" 6 | 7 | @classmethod 8 | def read_metadata( 9 | cls, fs, paths, columns, index, split_stripes, aggregate_files, **kwargs 10 | ): 11 | raise NotImplementedError() 12 | 13 | @classmethod 14 | def read_partition(cls, fs, part, columns, **kwargs): 15 | raise NotImplementedError() 16 | 17 | @classmethod 18 | def write_partition(cls, df, path, fs, filename, **kwargs): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /dask/array/dispatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dispatch in dask.array. 3 | 4 | Also see backends.py 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from dask.utils import Dispatch 10 | 11 | concatenate_lookup = Dispatch("concatenate") 12 | tensordot_lookup = Dispatch("tensordot") 13 | einsum_lookup = Dispatch("einsum") 14 | empty_lookup = Dispatch("empty") 15 | divide_lookup = Dispatch("divide") 16 | percentile_lookup = Dispatch("percentile") 17 | numel_lookup = Dispatch("numel") 18 | nannumel_lookup = Dispatch("nannumel") 19 | to_numpy_dispatch = Dispatch("to_numpy_dispatch") 20 | to_cupy_dispatch = Dispatch("to_cupy_dispatch") 21 | -------------------------------------------------------------------------------- /dask/bytes/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import bz2 4 | import gzip 5 | import io 6 | import lzma 7 | import zipfile 8 | 9 | 10 | def zip_compress(data): 11 | """Write data into zipfile and return the bytes""" 12 | out = io.BytesIO() 13 | with zipfile.ZipFile(file=out, mode="w") as z: 14 | with z.open("myfile", "w") as zf: 15 | zf.write(data) 16 | out.seek(0) 17 | return out.read() 18 | 19 | 20 | compress = { 21 | "gzip": gzip.compress, 22 | "bz2": bz2.compress, 23 | None: lambda x: x, 24 | "xz": lzma.compress, 25 | "zip": zip_compress, 26 | } 27 | -------------------------------------------------------------------------------- /dask/array/tests/test_testing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import dask.array as da 9 | from dask.array.utils import assert_eq 10 | 11 | 12 | @pytest.mark.skipif(bool(sys.flags.optimize), reason="Assertions disabled.") 13 | def test_assert_eq_checks_scalars(): 14 | # 
https://github.com/dask/dask/issues/2680 15 | with pytest.raises(AssertionError): 16 | assert_eq(np.array(0), np.array(1)) 17 | 18 | a = da.from_array(np.array([0]), 1)[0] 19 | b = np.array([1])[0] 20 | with pytest.raises(AssertionError): 21 | assert_eq(a, b) 22 | -------------------------------------------------------------------------------- /docs/source/_static/style.css: -------------------------------------------------------------------------------- 1 | .configTextArea { 2 | font-family: SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",Courier,monospace; 3 | margin-bottom: 24px; 4 | } 5 | 6 | .classifier::before { 7 | content: ": "; 8 | } 9 | 10 | /* options for jupyter-sphinx extension */ 11 | div.jupyter_container { 12 | box-shadow: None; 13 | font-family: var(--pst-font-family-monospace); 14 | border-radius: 0.4em; 15 | } 16 | 17 | .jupyter_container div.code_cell { 18 | padding: 10px; 19 | max-width: None !important; 20 | } 21 | 22 | .jupyter_container .output { 23 | font-size: 16px; 24 | padding: 10px 25 | } 26 | -------------------------------------------------------------------------------- /dask/bytes/tests/test_compression.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from io import BytesIO 4 | 5 | import pytest 6 | from fsspec.compression import compr 7 | 8 | from dask.bytes.utils import compress 9 | 10 | 11 | @pytest.mark.parametrize("fmt,File", compr.items()) 12 | def test_files(fmt, File): 13 | if fmt not in compress: 14 | pytest.skip("compression function not provided") 15 | if fmt is None: 16 | return 17 | data = b"1234" * 1000 18 | compressed = compress[fmt](data) 19 | 20 | b = BytesIO(compressed) 21 | g = File(b, mode="rb") 22 | data2 = g.read() 23 | g.close() 24 | assert data == data2 25 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-distributed.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[distributed] 16 | - pip 17 | - pip: 18 | - git+https://github.com/dask/distributed 19 | # test dependencies 20 | - pre-commit 21 | - pytest 22 | - pytest-cov 23 | - pytest-rerunfailures 24 | - pytest-timeout 25 | - pytest-xdist 26 | -------------------------------------------------------------------------------- /dask/dataframe/extensions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Support for pandas ExtensionArray in dask.dataframe. 3 | 4 | See :ref:`extensionarrays` for more. 
5 | """ 6 | from __future__ import annotations 7 | 8 | from dask.dataframe.accessor import ( 9 | register_dataframe_accessor, 10 | register_index_accessor, 11 | register_series_accessor, 12 | ) 13 | from dask.utils import Dispatch 14 | 15 | make_array_nonempty = Dispatch("make_array_nonempty") 16 | make_scalar = Dispatch("make_scalar") 17 | 18 | 19 | __all__ = [ 20 | "make_array_nonempty", 21 | "make_scalar", 22 | "register_dataframe_accessor", 23 | "register_index_accessor", 24 | "register_series_accessor", 25 | ] 26 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_gufunc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | pytestmark = pytest.mark.gpu 7 | 8 | import dask.array as da 9 | from dask.array.gufunc import apply_gufunc 10 | from dask.array.utils import assert_eq 11 | 12 | cupy = pytest.importorskip("cupy") 13 | 14 | 15 | def test_apply_gufunc_axis(): 16 | def mydiff(x): 17 | return np.diff(x) 18 | 19 | a = cupy.random.default_rng().standard_normal((3, 6, 4)) 20 | da_ = da.from_array(a, chunks=2, asarray=False) 21 | 22 | m = np.diff(a, axis=1) 23 | dm = apply_gufunc( 24 | mydiff, "(i)->(i)", da_, axis=1, output_sizes={"i": 5}, allow_rechunk=True 25 | ) 26 | assert_eq(m, dm) 27 | -------------------------------------------------------------------------------- /dask/_compatibility.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | import warnings 5 | 6 | from importlib_metadata import entry_points as _entry_points 7 | from packaging.version import parse as parse_version 8 | 9 | PY_VERSION = parse_version(".".join(map(str, sys.version_info[:3]))) 10 | 11 | EMSCRIPTEN = sys.platform == "emscripten" 12 | 13 | 14 | def entry_points(group=None): 15 | warnings.warn( 16 | "`dask._compatibility.entry_points` has been replaced by `importlib_metadata.entry_points` and will be removed " 17 | "in a future version. 
Please use `importlib_metadata.entry_points` instead.", 18 | DeprecationWarning, 19 | stacklevel=2, 20 | ) 21 | return _entry_points(group=group) 22 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # .git-blame-ignore-revs 2 | # absolufy-imports - No relative - PEP8 (#8796) 3 | cccb9d8d8e33a891396b1275c2448c352ef40c27 4 | 5 | # Update `pre-commit` version (#8691) 6 | 510bbc380531cbf56a409f1ae68e6fd84a9599e6 7 | 8 | # Run pyupgrade in CI (#8246) 9 | 80a82008d5b02a08f6ff59d802defcc43247eb1a 10 | 11 | # Bump pre-commit hook versions (#7676) 12 | d6bbbb08c92652eae2820e93edc2f3fe502391d3 13 | 14 | # Start adding isort (#7370) 15 | a31c0fc72e1cc59b8b0254965824abb0718c5f56 16 | 17 | # Rerun with latest black release (#6568) 18 | 64e2a9b3b9992503221a074a547827501927d1fa 19 | 20 | # LINT: Fixup black string normalization (#5227) 21 | d92f4015a1da3da10c04c682ed2acae8469e9576 22 | 23 | # Apply Black formatting (#4983) 24 | 7e4beffb339c69278091d4e305c2ae18ddf8c74f 25 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | require_ci_to_pass: yes 3 | # codecov pushes a failing status update to github actions before all the 4 | # test runs have completed (this is later updated to passing after more test 5 | # runs pass, but the initial red X is annoying). As far as I can tell from 6 | # https://docs.codecov.com/docs/merging-reports this shouldn't be happening, 7 | # but it is. Here we set a minimum number of builds before notifying in the 8 | # hopes that it will stop this behavior. 9 | notify: 10 | after_n_builds: 10 11 | 12 | coverage: 13 | precision: 2 14 | round: down 15 | range: "90...100" 16 | 17 | status: 18 | project: 19 | default: 20 | target: 90% 21 | threshold: 1% 22 | patch: no 23 | changes: no 24 | 25 | comment: off 26 | -------------------------------------------------------------------------------- /.github/workflows/stale-bot.yaml: -------------------------------------------------------------------------------- 1 | name: 'Label stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * 1' # runs once a week 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v8 11 | with: 12 | stale-issue-message: '' # no comment left if string is empty 13 | stale-pr-message: '' # no comment left if string is empty 14 | days-before-stale: 30 15 | days-before-close: -1 16 | stale-issue-label: 'needs attention' 17 | stale-pr-label: 'needs attention' 18 | exempt-issue-labels: 'good intro to dask,good first issue,Good First Issue,good second issue,feature request' 19 | exempt-draft-pr: true 20 | start-date: '2020-04-18T00:00:00Z' # ignore before this date, ISO 8601 or RFC 2822 21 | -------------------------------------------------------------------------------- /dask/distributed.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from __future__ import annotations 4 | 5 | _import_error_message = ( 6 | "dask.distributed is not installed.\n\n" 7 | "Please either conda or pip install distributed:\n\n" 8 | " conda install dask distributed # either conda install\n" 9 | ' python -m pip install "dask[distributed]" --upgrade # or pip install' 10 | ) 11 | 12 | try: 13 | from distributed import * 14 | except ImportError as e: 15 | if 
e.msg == "No module named 'distributed'": 16 | raise ImportError(_import_error_message) from e 17 | else: 18 | raise 19 | 20 | 21 | def __getattr__(value): 22 | try: 23 | import distributed 24 | except ImportError as e: 25 | raise ImportError(_import_error_message) from e 26 | return getattr(distributed, value) 27 | -------------------------------------------------------------------------------- /dask/dataframe/io/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from dask.dataframe.io import demo 4 | from dask.dataframe.io.csv import read_csv, read_fwf, read_table, to_csv 5 | from dask.dataframe.io.hdf import read_hdf, to_hdf 6 | from dask.dataframe.io.io import ( 7 | from_array, 8 | from_dask_array, 9 | from_delayed, 10 | from_dict, 11 | from_map, 12 | from_pandas, 13 | to_backend, 14 | to_bag, 15 | to_records, 16 | ) 17 | from dask.dataframe.io.json import read_json, to_json 18 | from dask.dataframe.io.sql import read_sql, read_sql_query, read_sql_table, to_sql 19 | 20 | try: 21 | from dask.dataframe.io.parquet import read_parquet, to_parquet 22 | except ImportError: 23 | pass 24 | 25 | try: 26 | from dask.dataframe.io.orc import read_orc, to_orc 27 | except ImportError: 28 | pass 29 | -------------------------------------------------------------------------------- /dask/tests/test_backends.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | 8 | @pytest.mark.gpu 9 | @pytest.mark.parametrize("backend", ["pandas", "cudf"]) 10 | def test_CreationDispatch_error_informative_message(backend): 11 | # Check that an informative error is emitted when a backend dispatch 12 | # method fails 13 | pytest.importorskip(backend) 14 | dd = pytest.importorskip("dask.dataframe") 15 | data = {"a": [1, 2, 3, 4], "B": [10, 11, 12, 13]} 16 | with dask.config.set({"dataframe.backend": backend}): 17 | with pytest.raises(TypeError) as excinfo: 18 | dd.from_dict(data, npartitions=2, unsupported_kwarg=True) 19 | 20 | msg = str(excinfo.value) 21 | assert "error occurred while calling the from_dict method" in msg 22 | assert backend in msg 23 | -------------------------------------------------------------------------------- /dask/widgets/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from dask.widgets.widgets import ( 5 | FILTERS, 6 | TEMPLATE_PATHS, 7 | get_environment, 8 | get_template, 9 | ) 10 | 11 | except ImportError as e: 12 | msg = ( 13 | "Dask diagnostics requirements are not installed.\n\n" 14 | "Please either conda or pip install as follows:\n\n" 15 | " conda install dask # either conda install\n" 16 | ' python -m pip install "dask[diagnostics]" --upgrade # or python -m pip install' 17 | ) 18 | exception = e # Explicit reference for e as it will be lost outside the try block 19 | FILTERS = {} 20 | TEMPLATE_PATHS = [] 21 | 22 | def get_environment(): 23 | raise ImportError(msg) from exception 24 | 25 | def get_template(name: str): 26 | raise ImportError(msg) from exception 27 | -------------------------------------------------------------------------------- /dask/bag/chunk.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def barrier(*args): 5 | return None 6 | 7 | 8 | def getitem(x, key): 9 | """Like 
:func:`operator.getitem`, but allows setting key using partial 10 | ``partial(chunk.getitem, key=key) 11 | """ 12 | return x[key] 13 | 14 | 15 | def foldby_combine2(combine, acc, x): 16 | return combine(acc, x[1]) 17 | 18 | 19 | def groupby_tasks_group_hash(x, hash, grouper): 20 | return hash(grouper(x)), x 21 | 22 | 23 | def var_chunk(seq): 24 | squares, total, n = 0.0, 0.0, 0 25 | for x in seq: 26 | squares += x**2 27 | total += x 28 | n += 1 29 | return squares, total, n 30 | 31 | 32 | def var_aggregate(x, ddof): 33 | squares, totals, counts = list(zip(*x)) 34 | x2, x, n = float(sum(squares)), float(sum(totals)), sum(counts) 35 | result = (x2 / n) - (x / n) ** 2 36 | return result * n / (n - ddof) 37 | -------------------------------------------------------------------------------- /docs/source/how-to/setup-prometheus.rst: -------------------------------------------------------------------------------- 1 | .. When modifying the contents of this page, please adjust the corresponding page in the dask.distributed documentation accordingly. 2 | 3 | Setup Prometheus monitoring 4 | =========================== 5 | 6 | Prometheus_ is a widely popular tool for monitoring and alerting a wide variety of 7 | systems. A distributed cluster offers a number of Prometheus metrics if the 8 | prometheus_client_ package is installed. The metrics are exposed in Prometheus' 9 | text-based format at the ``/metrics`` endpoint on both schedulers and workers. 10 | 11 | 12 | Available metrics 13 | ----------------- 14 | 15 | Apart from the metrics exposed per default by the prometheus_client_, schedulers and 16 | workers expose a number of Dask-specific metrics. 17 | See the `dask.distributed documentation 18 | `_ for details. 19 | 20 | 21 | .. _Prometheus: https://prometheus.io 22 | .. _prometheus_client: https://github.com/prometheus/client_python 23 | -------------------------------------------------------------------------------- /dask/bag/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | from dask.bag.avro import read_avro 5 | from dask.bag.core import Bag, Item 6 | from dask.bag.core import bag_map as map 7 | from dask.bag.core import bag_range as range 8 | from dask.bag.core import bag_zip as zip 9 | from dask.bag.core import ( 10 | concat, 11 | from_delayed, 12 | from_sequence, 13 | from_url, 14 | map_partitions, 15 | to_textfiles, 16 | ) 17 | from dask.bag.text import read_text 18 | from dask.bag.utils import assert_eq 19 | from dask.base import compute 20 | except ImportError as e: 21 | msg = ( 22 | "Dask bag requirements are not installed.\n\n" 23 | "Please either conda or pip install as follows:\n\n" 24 | " conda install dask # either conda install\n" 25 | ' python -m pip install "dask[bag]" --upgrade # or python -m pip install' 26 | ) 27 | raise ImportError(str(e) + "\n\n" + msg) from e 28 | -------------------------------------------------------------------------------- /docs/source/logos.rst: -------------------------------------------------------------------------------- 1 | :orphan: 2 | 3 | Images and Logos 4 | ================ 5 | 6 | Here are some commonly used Dask icons and logos 7 | (see the `Dask style guide `_ for more details). 8 | 9 | .. image:: images/dask_icon.svg 10 | :alt: Primary Dask icon. 11 | 12 | .. image:: images/dask_icon_black.svg 13 | :alt: Dask icon in black. 14 | 15 | .. image:: images/dask_icon_white.svg 16 | :alt: Dask icon in white. 17 | 18 | .. 
image:: images/dask_icon_on_pink.svg 19 | :alt: Dask icon to use on a pink background. 20 | 21 | .. image:: images/dask_horizontal.svg 22 | :alt: Primary Dask logo. 23 | 24 | .. image:: images/dask_horizontal_black.svg 25 | :alt: Dask logo in black. 26 | 27 | .. image:: images/dask_horizontal_white.svg 28 | :alt: Dask logo in white. 29 | 30 | .. image:: images/dask_horizontal_on_pink.svg 31 | :alt: Dask logo to use on a pink background. 32 | 33 | .. image:: images/dask_horizontal_on_blue.svg 34 | :alt: Dask logo to use on a blue background. 35 | -------------------------------------------------------------------------------- /dask/tests/test_docs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | 8 | def test_development_guidelines_matches_ci(): 9 | """When the environment.yaml changes in CI, make sure to change it in the docs as well""" 10 | root_dir = Path(__file__).parent.parent.parent 11 | 12 | if not (root_dir / ".github" / "workflows").exists(): 13 | pytest.skip("Test can only be run on an editable install") 14 | 15 | development_doc_file = root_dir / "docs" / "source" / "develop.rst" 16 | additional_ci_file = root_dir / ".github" / "workflows" / "additional.yml" 17 | upstream_ci_file = root_dir / ".github" / "workflows" / "upstream.yml" 18 | latest_env = "environment-3.10.yaml" 19 | 20 | for filename in [development_doc_file, additional_ci_file, upstream_ci_file]: 21 | with open(filename, encoding="utf8") as f: 22 | assert any( 23 | latest_env in line for line in f 24 | ), f"{latest_env} not found in {filename}" 25 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_boolean.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | import dask.dataframe as dd 6 | 7 | 8 | def test_meta(): 9 | values = pd.array([True, False, None], dtype="boolean") 10 | ds = dd.from_pandas(pd.Series(values), 2) 11 | assert ds.dtype == pd.BooleanDtype() 12 | 13 | dd.utils.assert_eq(ds._meta_nonempty, pd.Series([True, pd.NA], dtype="boolean")) 14 | 15 | ddf = dd.from_pandas(pd.DataFrame({"A": values}), 2) 16 | assert ddf.dtypes["A"] == pd.BooleanDtype() 17 | 18 | dd.utils.assert_eq( 19 | ddf._meta_nonempty, 20 | pd.DataFrame({"A": pd.array([True, pd.NA], dtype="boolean")}), 21 | ) 22 | 23 | 24 | def test_ops(): 25 | s1 = pd.Series(pd.array([True, False, None] * 3, dtype="boolean")) 26 | s2 = pd.Series(pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")) 27 | 28 | ds1 = dd.from_pandas(s1, 2) 29 | ds2 = dd.from_pandas(s2, 2) 30 | 31 | dd.utils.assert_eq(ds1 | ds2, s1 | s2) 32 | dd.utils.assert_eq(ds1 & ds2, s1 & s2) 33 | dd.utils.assert_eq(ds1 ^ ds2, s1 ^ s2) 34 | -------------------------------------------------------------------------------- /docs/source/dashboard-progress-script.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script was run to produce some of the screenshots on https://docs.dask.org/en/stable/dashboard.html 3 | """ 4 | from __future__ import annotations 5 | 6 | import time 7 | 8 | from dask import delayed 9 | from dask.distributed import Client, wait 10 | 11 | 12 | @delayed 13 | def inc(x): 14 | time.sleep(0.1) 15 | return x + 1 16 | 17 | 18 | @delayed 19 | def double(x): 20 | time.sleep(0.1) 21 | return 2 * x 22 | 23 | 24 | 
@delayed 25 | def add(x, y): 26 | time.sleep(0.1) 27 | return x + y 28 | 29 | 30 | if __name__ == "__main__": 31 | with Client(n_workers=4, threads_per_worker=2, memory_limit="4 GiB") as client: 32 | while True: 33 | data = list(range(1000)) 34 | output = [] 35 | for x in data: 36 | a = inc(x) 37 | b = double(x) 38 | c = add(a, b) 39 | output.append(c) 40 | 41 | total = delayed(sum)(output) 42 | total = total.persist() 43 | wait(total) 44 | time.sleep(5) 45 | del total 46 | time.sleep(2) 47 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | # flake8 doesn't support pyproject.toml yet https://github.com/PyCQA/flake8/issues/234 2 | [flake8] 3 | # References: 4 | # https://flake8.readthedocs.io/en/latest/user/configuration.html 5 | # https://flake8.readthedocs.io/en/latest/user/error-codes.html 6 | # https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes 7 | exclude = __init__.py 8 | ignore = 9 | # Extra space in brackets 10 | E20 11 | # Multiple spaces around "," 12 | E231,E241 13 | # Comments 14 | E26 15 | # Import formatting 16 | E4 17 | # Comparing types instead of isinstance 18 | E721 19 | # Assigning lambda expression 20 | E731 21 | # Ambiguous variable names 22 | E741 23 | # Line break before binary operator 24 | W503 25 | # Line break after binary operator 26 | W504 27 | # Redefinition of unused 'loop' from line 10 28 | F811 29 | # No explicit stacklevel in warnings.warn. FIXME we should correct this in the code 30 | B028 31 | 32 | max-line-length = 120 33 | per-file-ignores = 34 | *_test.py: 35 | # Do not call assert False since python -O removes these calls 36 | B011, 37 | **/tests/*: 38 | # Do not call assert False since python -O removes these calls 39 | B011, 40 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_optimize_dataframe.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | import dask 6 | import dask.dataframe as dd 7 | 8 | dsk = { 9 | ("x", 0): pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[0, 1, 3]), 10 | ("x", 1): pd.DataFrame({"a": [4, 5, 6], "b": [3, 2, 1]}, index=[5, 6, 8]), 11 | ("x", 2): pd.DataFrame({"a": [7, 8, 9], "b": [0, 0, 0]}, index=[9, 9, 9]), 12 | } 13 | dfs = list(dsk.values()) 14 | 15 | 16 | def test_fuse_ave_width(): 17 | df = pd.DataFrame({"x": range(10)}) 18 | df = dd.from_pandas(df, npartitions=5) 19 | 20 | s = (df.x + 1) + (df.x + 2) 21 | 22 | with dask.config.set({"optimization.fuse.ave-width": 4}): 23 | a = s.__dask_optimize__(s.dask, s.__dask_keys__()) 24 | 25 | b = s.__dask_optimize__(s.dask, s.__dask_keys__()) 26 | 27 | assert len(a) <= 15 28 | assert len(b) <= 15 29 | 30 | 31 | def test_optimize_blockwise(): 32 | from dask.array.optimization import optimize_blockwise 33 | 34 | df = pd.DataFrame({"x": range(10), "y": range(10)}) 35 | ddf = dd.from_pandas(df, npartitions=2) 36 | 37 | for _ in range(10): 38 | ddf["x"] = ddf.x + 1 + ddf.y 39 | 40 | graph = optimize_blockwise(ddf.dask) 41 | 42 | assert len(graph) <= 4 43 | -------------------------------------------------------------------------------- /dask/array/tests/test_numpy_compat.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | import dask.array as da 7 | from 
dask.array.utils import assert_eq 8 | 9 | 10 | @pytest.fixture( 11 | params=[ 12 | [("A", ("f4", (3, 2))), ("B", ("f4", 3)), ("C", ("f8", 3))], 13 | [("A", ("i4", (3, 2))), ("B", ("f4", 3)), ("C", ("S4", 3))], 14 | ] 15 | ) 16 | def dtype(request): 17 | return np.dtype(request.param) 18 | 19 | 20 | @pytest.fixture(params=[["A"], ["A", "B"], ["A", "B", "C"]]) 21 | def index(request): 22 | return request.param 23 | 24 | 25 | def test_basic(): 26 | # sanity check 27 | dtype = [("a", "f8"), ("b", "f8"), ("c", "f8")] 28 | x = np.ones((5, 3), dtype=dtype) 29 | dx = da.ones((5, 3), dtype=dtype, chunks=3) 30 | result = dx[["a", "b"]] 31 | expected = x[["a", "b"]] 32 | assert_eq(result, expected) 33 | 34 | 35 | def test_min_max_round_funcs(): 36 | # Regression test for gh-5031 37 | image = da.from_array(np.array([[0, 1], [1, 2]]), chunks=(1, 2)) 38 | # These use __array_function__ (and min/max/round are aliased, 39 | # to amin/amax/round_ in numpy) 40 | assert int(np.min(image)) == 0 41 | assert int(np.max(image)) == 2 42 | assert np.round(image)[1, 1] == 2 43 | -------------------------------------------------------------------------------- /dask/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from dask.hashing import hash_buffer, hash_buffer_hex, hashers 6 | 7 | np = pytest.importorskip("numpy") 8 | 9 | buffers = [ 10 | b"abc", 11 | bytearray(b"123"), 12 | memoryview(b"456"), 13 | np.array(42), 14 | np.ones((100, 100)), 15 | np.zeros((100, 100), dtype=[("a", "i4"), ("b", "i2")]), 16 | np.ones(10000, dtype=np.int8)[1:], # unaligned 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("x", buffers) 21 | def test_hash_buffer(x): 22 | for hasher in [None] + hashers: 23 | h = hash_buffer(x, hasher=hasher) 24 | assert isinstance(h, bytes) 25 | assert 8 <= len(h) < 32 26 | assert h == hash_buffer(x, hasher=hasher) 27 | 28 | 29 | @pytest.mark.parametrize("x", buffers) 30 | def test_hash_buffer_hex(x): 31 | for hasher in [None] + hashers: 32 | h = hash_buffer_hex(x, hasher=hasher) 33 | assert isinstance(h, str) 34 | assert 16 <= len(h) < 64 35 | assert h == hash_buffer_hex(x, hasher=hasher) 36 | 37 | 38 | @pytest.mark.parametrize("hasher", hashers) 39 | def test_hashers(hasher): 40 | # Sanity check 41 | x = b"x" 42 | h = hasher(x) 43 | assert isinstance(h, bytes) 44 | assert 8 <= len(h) < 32 45 | -------------------------------------------------------------------------------- /dask/widgets/widgets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | import html 5 | import os.path 6 | 7 | from jinja2 import Environment, FileSystemLoader, Template 8 | from jinja2.exceptions import TemplateNotFound 9 | 10 | from dask.utils import format_bytes, format_time, format_time_ago, key_split, typename 11 | 12 | FILTERS = { 13 | "datetime_from_timestamp": datetime.datetime.fromtimestamp, 14 | "format_bytes": format_bytes, 15 | "format_time": format_time, 16 | "format_time_ago": format_time_ago, 17 | "html_escape": html.escape, 18 | "key_split": key_split, 19 | "type": type, 20 | "typename": typename, 21 | } 22 | 23 | TEMPLATE_PATHS = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates")] 24 | 25 | 26 | def get_environment() -> Environment: 27 | loader = FileSystemLoader(TEMPLATE_PATHS) 28 | environment = Environment(loader=loader) 29 | environment.filters.update(FILTERS) 30 | 31 | return 
environment 32 | 33 | 34 | def get_template(name: str) -> Template: 35 | try: 36 | return get_environment().get_template(name) 37 | except TemplateNotFound as e: 38 | raise TemplateNotFound( 39 | f"Unable to find {name} in dask.widgets.TEMPLATE_PATHS {TEMPLATE_PATHS}" 40 | ) from e 41 | -------------------------------------------------------------------------------- /docs/source/array-stats.rst: -------------------------------------------------------------------------------- 1 | Stats 2 | ===== 3 | 4 | Dask Array implements a subset of the `scipy.stats`_ package. 5 | 6 | Statistical Functions 7 | --------------------- 8 | 9 | You can calculate various measures of an array including skewness, kurtosis, and arbitrary moments. 10 | 11 | .. code-block:: python 12 | 13 | >>> from dask.array import stats 14 | >>> rng = da.random.default_rng() 15 | >>> x = rng.beta(1, 1, size=(1000,), chunks=10) 16 | >>> k, s, m = [stats.kurtosis(x), stats.skew(x), stats.moment(x, 5)] 17 | >>> dask.compute(k, s, m) 18 | (1.7612340817172787, -0.064073498030693302, -0.00054523780628304799) 19 | 20 | 21 | Statistical Tests 22 | ----------------- 23 | 24 | You can perform basic statistical tests on Dask arrays. 25 | Each of these tests return a ``dask.delayed`` wrapping one of the scipy ``namedtuple`` 26 | results. 27 | 28 | 29 | .. code-block:: python 30 | 31 | >>> rng = da.random.default_rng() 32 | >>> a = rng.uniform(size=(50,), chunks=(25,)) 33 | >>> b = a + rng.uniform(low=-0.15, high=0.15, size=(50,), chunks=(25,)) 34 | >>> result = stats.ttest_rel(a, b) 35 | >>> result.compute() 36 | Ttest_relResult(statistic=-1.5102104380013242, pvalue=0.13741197274874514) 37 | 38 | .. _scipy.stats: https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html 39 | -------------------------------------------------------------------------------- /dask/tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | 8 | def test_mimesis(): 9 | pytest.importorskip("mimesis") 10 | 11 | b = dask.datasets.make_people() 12 | assert b.take(5) 13 | 14 | assert b.take(3) == b.take(3) 15 | 16 | 17 | def test_full_dataset(): 18 | pytest.importorskip("mimesis") 19 | b = dask.datasets.make_people(npartitions=2, records_per_partition=10) 20 | assert b.count().compute() == 20 21 | 22 | 23 | def test_make_dataset_with_processes(): 24 | pytest.importorskip("mimesis") 25 | b = dask.datasets.make_people(npartitions=2) 26 | try: 27 | b.compute(scheduler="processes") 28 | except TypeError: 29 | pytest.fail("Failed to execute make_people using processes") 30 | 31 | 32 | def test_no_mimesis(): 33 | try: 34 | import mimesis # noqa: F401 35 | except ImportError: 36 | with pytest.raises(Exception) as info: 37 | dask.datasets.make_people() 38 | 39 | assert "python -m pip install mimesis" in str(info.value) 40 | 41 | 42 | def test_deterministic(): 43 | pytest.importorskip("mimesis") 44 | 45 | a = dask.datasets.make_people(seed=123) 46 | b = dask.datasets.make_people(seed=123) 47 | 48 | assert a.take(1)[0]["name"] == b.take(1)[0]["name"] 49 | -------------------------------------------------------------------------------- /dask/tests/test_ci.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | import importlib_metadata 6 | import pytest 7 | from packaging.version import Version 8 | 9 | 10 | 
@pytest.mark.xfail(reason="https://github.com/dask/dask/issues/9735", strict=False) 11 | @pytest.mark.skipif( 12 | not os.environ.get("UPSTREAM_DEV", False), 13 | reason="Only check for dev packages in `upstream` CI build", 14 | ) 15 | def test_upstream_packages_installed(): 16 | # List of packages should match those specified in 17 | # `continuous_integration/scripts/install.sh` 18 | 19 | # FIXME: This test isn't sensitive to projects that use git tags 20 | # to determine versions (e.g. versioneer) when installed 21 | # directly from GitHub as the latest `main` branch can sometimes 22 | # be pointing to a released version of the project. 23 | packages = [ 24 | "bokeh", 25 | # "dask", 26 | # "distributed", 27 | # "fastparquet", 28 | # "fsspec", 29 | "numpy", 30 | "pandas", 31 | # "partd", 32 | "pyarrow", 33 | # "s3fs", 34 | "scipy", 35 | # "sparse", 36 | # "zarr", 37 | # "zict", 38 | ] 39 | for package in packages: 40 | v = Version(importlib_metadata.version(package)) 41 | assert v.is_prerelease or v.local is not None, (package, str(v)) 42 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Dask 2 | ==== 3 | 4 | |Build Status| |Coverage| |Doc Status| |Discourse| |Version Status| |NumFOCUS| 5 | 6 | Dask is a flexible parallel computing library for analytics. See 7 | documentation_ for more information. 8 | 9 | 10 | LICENSE 11 | ------- 12 | 13 | New BSD. See `License File `__. 14 | 15 | .. _documentation: https://dask.org 16 | .. |Build Status| image:: https://github.com/dask/dask/actions/workflows/tests.yml/badge.svg 17 | :target: https://github.com/dask/dask/actions/workflows/tests.yml 18 | .. |Coverage| image:: https://codecov.io/gh/dask/dask/branch/main/graph/badge.svg 19 | :target: https://codecov.io/gh/dask/dask/branch/main 20 | :alt: Coverage status 21 | .. |Doc Status| image:: https://readthedocs.org/projects/dask/badge/?version=latest 22 | :target: https://dask.org 23 | :alt: Documentation Status 24 | .. |Discourse| image:: https://img.shields.io/discourse/users?logo=discourse&server=https%3A%2F%2Fdask.discourse.group 25 | :alt: Discuss Dask-related things and ask for help 26 | :target: https://dask.discourse.group 27 | .. |Version Status| image:: https://img.shields.io/pypi/v/dask.svg 28 | :target: https://pypi.python.org/pypi/dask/ 29 | .. 
|NumFOCUS| image:: https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A 30 | :target: https://www.numfocus.org/ 31 | -------------------------------------------------------------------------------- /dask/tests/test_context.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | from dask.context import globalmethod 7 | 8 | 9 | def test_with_get(): 10 | da = pytest.importorskip("dask.array") 11 | var = [0] 12 | 13 | def myget(dsk, keys, **kwargs): 14 | var[0] = var[0] + 1 15 | return dask.get(dsk, keys, **kwargs) 16 | 17 | x = da.ones(10, chunks=(5,)) 18 | 19 | assert x.sum().compute() == 10 20 | assert var[0] == 0 21 | 22 | with dask.config.set(scheduler=myget): 23 | assert x.sum().compute() == 10 24 | assert var[0] == 1 25 | 26 | # Make sure we've cleaned up 27 | assert x.sum().compute() == 10 28 | assert var[0] == 1 29 | 30 | 31 | def foo(): 32 | return "foo" 33 | 34 | 35 | def bar(): 36 | return "bar" 37 | 38 | 39 | class Foo: 40 | @globalmethod(key="f") 41 | def f(): # type: ignore 42 | return 1 43 | 44 | g = globalmethod(foo, key="g", falsey=bar) 45 | 46 | 47 | def test_globalmethod(): 48 | x = Foo() 49 | 50 | assert x.f() == 1 51 | 52 | with dask.config.set(f=lambda: 2): 53 | assert x.f() == 2 54 | 55 | with dask.config.set(f=foo): 56 | assert x.f is foo 57 | assert x.f() == "foo" 58 | 59 | assert x.g is foo 60 | assert x.g() == "foo" 61 | 62 | with dask.config.set(g=False): 63 | assert x.g is bar 64 | assert x.g() == "bar" 65 | -------------------------------------------------------------------------------- /dask/widgets/templates/array.html.j2: -------------------------------------------------------------------------------- 1 | 2 | 3 | 36 | 39 | 40 |
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | {% if nbytes %} 14 | 15 | 16 | 17 | 18 | 19 | {% endif %} 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 |
Array Chunk
Bytes {{ nbytes }} {{ cbytes }}
Shape {{ array.shape }} {{ array.chunksize }}
Dask graph {{ array.npartitions }} chunks in {{ layers }}
Data type {{ array.dtype }} {{ array._meta | type | typename }}
35 |
37 | {{grid}} 38 |
41 | -------------------------------------------------------------------------------- /continuous_integration/scripts/test_imports.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -o errexit 3 | 4 | 5 | test_import () { 6 | echo "Create environment: python=$PYTHON_VERSION $1" 7 | # Create an empty environment 8 | mamba create -q -y -n test-imports -c conda-forge python=$PYTHON_VERSION packaging pyyaml fsspec toolz partd click cloudpickle importlib-metadata $1 9 | conda activate test-imports 10 | if [[ $1 =~ "distributed" ]]; then 11 | # dask[distributed] depends on the latest version of distributed 12 | python -m pip install git+https://github.com/dask/distributed 13 | fi 14 | python -m pip install -e . 15 | mamba list 16 | echo "python -c '$2'" 17 | python -c "$2" 18 | # Ensure that no non-deterministic objects are tokenized at init time, 19 | # which can prevent the library from being imported at all. 20 | echo "python -c '$2' (ensure deterministic)" 21 | DASK_TOKENIZE__ENSURE_DETERMINISTIC=True python -c "$2" 22 | conda deactivate 23 | mamba env remove -n test-imports 24 | } 25 | 26 | test_import "" "import dask, dask.base, dask.multiprocessing, dask.threaded, dask.optimization, dask.bag, dask.delayed, dask.graph_manipulation, dask.layers" 27 | test_import "numpy" "import dask.array" 28 | test_import "pandas" "import dask.dataframe" 29 | test_import "bokeh" "import dask.diagnostics" 30 | test_import "distributed" "import dask.distributed" 31 | -------------------------------------------------------------------------------- /continuous_integration/recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set major_minor_patch = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').split('.') %} 2 | {% set new_patch = major_minor_patch[2] | int + 1 %} 3 | {% set version = (major_minor_patch[:2] + [new_patch]) | join('.') + environ.get('VERSION_SUFFIX', '') %} 4 | 5 | 6 | package: 7 | name: dask-core 8 | version: {{ version }} 9 | 10 | source: 11 | git_url: ../.. 12 | 13 | build: 14 | number: {{ GIT_DESCRIBE_NUMBER }} 15 | noarch: python 16 | string: py_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} 17 | script: {{ PYTHON }} -m pip install . 
-vv 18 | entry_points: 19 | - dask = dask.__main__:main 20 | 21 | requirements: 22 | host: 23 | - python >=3.9 24 | - pip 25 | - versioneer =0.28 26 | - tomli # [py<311] 27 | 28 | run: 29 | - python >=3.9 30 | - click >=8.0 31 | - cloudpickle >=1.5.0 32 | - fsspec >=2021.09.0 33 | - packaging >=20.0 34 | - partd >=1.2.0 35 | - pyyaml >=5.3.1 36 | - toolz >=0.10.0 37 | - importlib_metadata >=4.13.0 38 | 39 | test: 40 | imports: 41 | - dask 42 | commands: 43 | - pip check 44 | - dask docs --help 45 | - dask info --help 46 | - dask info versions --help 47 | requires: 48 | - pip 49 | 50 | about: 51 | home: https://github.com/dask/dask/ 52 | license: BSD-3-Clause 53 | license_file: 54 | - LICENSE.txt 55 | - dask/array/NUMPY_LICENSE.txt 56 | summary: Parallel Python with task scheduling 57 | doc_url: https://dask.org/ 58 | dev_url: https://github.com/dask/dask 59 | -------------------------------------------------------------------------------- /dask/array/tests/test_xarray.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask.array as da 6 | from dask.array.utils import assert_eq 7 | 8 | xr = pytest.importorskip("xarray") 9 | 10 | 11 | def test_mean(): 12 | y = da.mean(xr.DataArray([1, 2, 3.0])) 13 | assert isinstance(y, da.Array) 14 | assert_eq(y, y) 15 | 16 | 17 | def test_asarray(): 18 | y = da.asarray(xr.DataArray([1, 2, 3.0])) 19 | assert isinstance(y, da.Array) 20 | assert_eq(y, y) 21 | 22 | 23 | def test_asanyarray(): 24 | y = da.asanyarray(xr.DataArray([1, 2, 3.0])) 25 | assert isinstance(y, da.Array) 26 | assert_eq(y, y) 27 | 28 | 29 | def test_asarray_xarray_intersphinx_workaround(): 30 | # test that the intersphinx workaround in https://github.com/pydata/xarray/issues/4279 works 31 | module = xr.DataArray.__module__ 32 | try: 33 | xr.DataArray.__module__ = "xarray" 34 | y = da.asarray(xr.DataArray([1, 2, 3.0])) 35 | assert isinstance(y, da.Array) 36 | assert type(y._meta).__name__ == "ndarray" 37 | assert_eq(y, y) 38 | finally: 39 | xr.DataArray.__module__ = module 40 | 41 | 42 | def test_fft(): 43 | # Regression test for https://github.com/dask/dask/issues/9679 44 | coord = da.arange(8, chunks=-1) 45 | data = da.random.random((8, 8), chunks=-1) + 1 46 | x = xr.DataArray(data, coords={"x": coord, "y": coord}, dims=["x", "y"]) 47 | result = da.fft.fft(x) 48 | expected = da.fft.fft(x.data) 49 | assert_eq(result, expected) 50 | -------------------------------------------------------------------------------- /dask/widgets/tests/test_widgets.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os.path 4 | 5 | import pytest 6 | 7 | jinja2 = pytest.importorskip("jinja2") 8 | 9 | from dask.utils import format_bytes 10 | from dask.widgets import FILTERS, TEMPLATE_PATHS, get_environment, get_template 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def setup_testing(): 15 | TEMPLATE_PATHS.append( 16 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "templates") 17 | ) 18 | FILTERS["custom_filter"] = lambda x: "baz" 19 | 20 | 21 | def test_widgets(): 22 | template = get_template("example.html.j2") 23 | assert isinstance(template, jinja2.Template) 24 | rendered = template.render(foo="bar") 25 | assert "Hello bar" in rendered 26 | 27 | 28 | def test_environment(): 29 | environment = get_environment() 30 | assert isinstance(environment, jinja2.Environment) 31 | 32 | 33 | def 
test_unknown_template(): 34 | with pytest.raises(jinja2.TemplateNotFound) as e: 35 | get_template("does_not_exist.html.j2") 36 | 37 | # The error should contain all the registered template directories to help the user 38 | # understand where jinja2 is looking. Including the one we registered in the fixture. 39 | assert os.path.dirname(os.path.abspath(__file__)) in str(e) 40 | 41 | 42 | def test_filters(): 43 | template = get_template("bytes.html.j2") 44 | assert format_bytes in FILTERS.values() 45 | assert format_bytes(2e9) in template.render(foo=2e9) 46 | 47 | template = get_template("custom_filter.html.j2") 48 | assert "baz" in template.render(foo=None) 49 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2014, Anaconda, Inc. and contributors 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /continuous_integration/environment-mindeps-optional.yaml: -------------------------------------------------------------------------------- 1 | name: test-environment 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | # required dependencies 6 | - packaging=20.0 7 | - python=3.9 8 | - pyyaml=5.3.1 9 | - click=8.0 10 | - cloudpickle=1.5.0 11 | - partd=1.2.0 12 | - fsspec=2021.09.0 13 | - importlib-metadata=4.13.0 14 | - toolz=0.10.0 15 | # optional dependencies pulled in by pip install dask[array,dataframe] 16 | - numpy=1.21 17 | - pandas=1.3 18 | # optional dependencies pulled in by pip install dask[diagnostics] 19 | - bokeh=2.4.2 20 | - jinja2=2.10.3 21 | # optional dependencies pulled in by pip install dask[complete] 22 | - pyarrow=7.0 23 | - lz4=4.3.2 24 | # optional dependencies used by dask 25 | - cachey=0.1.1 26 | - crick=0.0.3 27 | - cytoolz=0.11.0 28 | - dask-ml=1.4.0 29 | - fastavro=1.1.0 30 | - fastparquet=0.8.2 31 | - h5py=2.10.0 32 | - ipycytoscape=1.0.1 33 | - IPython=7.16.1 34 | - matplotlib=3.4.1 35 | - mimesis=5.3.0 36 | - mmh3=2.5.1 37 | - psutil=5.7.2 38 | - python-cityhash=0.4.6 39 | - python-graphviz=0.8.4 40 | - python-snappy=0.5.4 41 | - python-xxhash=2.0.0 42 | - s3fs=2021.9.0 43 | - scikit-image=0.17.2 44 | - scipy=1.5.2 45 | - sparse=0.12.0 46 | - sqlalchemy=1.4.16 47 | - tblib=1.6.0 48 | - tiledb-py=0.8.1 49 | - zarr=2.12.0 50 | - pip 51 | - pip: 52 | # optional dependencies pulled in by pip install dask[distributed] 53 | - git+https://github.com/dask/distributed 54 | # test dependencies 55 | - pre-commit 56 | - pytest 57 | - pytest-cov 58 | - pytest-rerunfailures 59 | - pytest-xdist 60 | -------------------------------------------------------------------------------- /dask/tests/test_system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import builtins 4 | import io 5 | import os 6 | import sys 7 | 8 | import pytest 9 | 10 | from dask.system import cpu_count 11 | 12 | psutil = pytest.importorskip("psutil") 13 | 14 | 15 | def test_cpu_count(): 16 | count = cpu_count() 17 | assert isinstance(count, int) 18 | assert count <= os.cpu_count() 19 | assert count >= 1 20 | 21 | 22 | @pytest.mark.parametrize("dirname", ["cpuacct,cpu", "cpu,cpuacct", None]) 23 | def test_cpu_count_cgroups(dirname, monkeypatch): 24 | def mycpu_count(): 25 | # Absurdly high, unlikely to match real value 26 | return 250 27 | 28 | monkeypatch.setattr(os, "cpu_count", mycpu_count) 29 | 30 | class MyProcess: 31 | def cpu_affinity(self): 32 | # No affinity set 33 | return [] 34 | 35 | monkeypatch.setattr(psutil, "Process", MyProcess) 36 | 37 | if dirname: 38 | paths = { 39 | "/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname: io.StringIO("2005"), 40 | "/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname: io.StringIO("10"), 41 | } 42 | builtin_open = builtins.open 43 | 44 | def myopen(path, *args, **kwargs): 45 | if path in paths: 46 | return paths.get(path) 47 | return builtin_open(path, *args, **kwargs) 48 | 49 | monkeypatch.setattr(builtins, "open", myopen) 50 | monkeypatch.setattr(sys, "platform", "linux") 51 | 52 | count = cpu_count() 53 | if dirname: 54 | # Rounds up 55 | assert count == 201 56 | else: 57 | assert count == 250 58 | -------------------------------------------------------------------------------- /dask/array/NUMPY_LICENSE.txt: -------------------------------------------------------------------------------- 1 | 
Copyright (c) 2005-2015, NumPy Developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /docs/source/delayed-collections.rst: -------------------------------------------------------------------------------- 1 | Working with Collections 2 | ======================== 3 | 4 | Often we want to do a bit of custom work with ``dask.delayed`` (for example, 5 | for complex data ingest), then leverage the algorithms in ``dask.array`` or 6 | ``dask.dataframe``, and then switch back to custom work. To this end, all 7 | collections support ``from_delayed`` functions and ``to_delayed`` 8 | methods. 9 | 10 | As an example, consider the case where we store tabular data in a custom format 11 | not known by Dask DataFrame. This format is naturally broken apart into 12 | pieces and we have a function that reads one piece into a Pandas DataFrame. 13 | We use ``dask.delayed`` to lazily read these files into Pandas DataFrames, 14 | use ``dd.from_delayed`` to wrap these pieces up into a single 15 | Dask DataFrame, use the complex algorithms within the DataFrame 16 | (groupby, join, etc.), and then switch back to ``dask.delayed`` to save our results 17 | back to the custom format: 18 | 19 | .. code-block:: python 20 | 21 | import dask.dataframe as dd 22 | from dask.delayed import delayed 23 | 24 | from my_custom_library import load, save 25 | 26 | filenames = ... 27 | dfs = [delayed(load)(fn) for fn in filenames] 28 | 29 | df = dd.from_delayed(dfs) 30 | df = ... # do work with dask.dataframe 31 | 32 | dfs = df.to_delayed() 33 | writes = [delayed(save)(df, fn) for df, fn in zip(dfs, filenames)] 34 | 35 | dd.compute(*writes) 36 | 37 | Data science is often complex, and ``dask.delayed`` provides a release valve for 38 | users to manage this complexity on their own, and solve the last mile problem 39 | for custom formats and complex situations. 
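The same ``from_delayed``/``to_delayed`` round trip is available for arrays. Below is a minimal sketch, assuming a hypothetical ``load_chunk`` reader that returns NumPy arrays of a known, uniform shape and dtype (the helper name and the shapes are illustrative, not a real API):

.. code-block:: python

    import dask.array as da
    from dask.delayed import delayed

    from my_custom_library import load_chunk  # hypothetical reader returning a NumPy array

    filenames = ...

    # Declare shape and dtype up front so Dask can build the array lazily
    arrays = [
        da.from_delayed(delayed(load_chunk)(fn), shape=(1000, 1000), dtype="f8")
        for fn in filenames
    ]
    x = da.concatenate(arrays, axis=0)

    x = (x - x.mean()) / x.std()  # do work with dask.array

    # Switch back to one delayed object per chunk, e.g. to save with a custom writer
    chunks = x.to_delayed().flatten().tolist()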
40 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_black.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_extensions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from decimal import Decimal 4 | 5 | import pytest 6 | 7 | import dask.dataframe as dd 8 | from dask.dataframe.utils import assert_eq 9 | 10 | pd = pytest.importorskip("pandas") 11 | 12 | from pandas.tests.extension.decimal.array import DecimalArray, DecimalDtype 13 | 14 | from dask.dataframe.extensions import make_array_nonempty, make_scalar 15 | 16 | 17 | @make_array_nonempty.register(DecimalDtype) 18 | def _(dtype): 19 | return DecimalArray._from_sequence([Decimal("0"), Decimal("NaN")], dtype=dtype) 20 | 21 | 22 | @make_scalar.register(Decimal) 23 | def _(x): 24 | return Decimal("1") 25 | 26 | 27 | def test_register_extension_type(): 28 | arr = DecimalArray._from_sequence([Decimal("1.0")] * 10) 29 | ser = pd.Series(arr) 30 | dser = dd.from_pandas(ser, 2) 31 | assert_eq(ser, dser) 32 | 33 | df = pd.DataFrame({"A": ser}) 34 | ddf = dd.from_pandas(df, 2) 35 | assert_eq(df, ddf) 36 | 37 | 38 | def test_reduction(): 39 | ser = pd.Series(DecimalArray._from_sequence([Decimal("0"), Decimal("1")])) 40 | dser = dd.from_pandas(ser, 2) 41 | assert_eq(ser.mean(skipna=False), dser.mean(skipna=False)) 42 | 43 | # It's unclear whether this can be reliably provided, at least with the current 44 | # implementation, which uses pandas.DataFrame.sum(), returning a (homogenous) 45 | # series which has potentially cast values. 
46 | 47 | # assert_eq(ser.to_frame().mean(skipna=False), dser.to_frame().mean(skipna=False)) 48 | 49 | 50 | def test_scalar(): 51 | result = dd.utils.make_meta(Decimal("1.0"), parent_meta=pd.DataFrame()) 52 | assert result == Decimal("1.0") 53 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_on_pink.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/source/images/dask_icon_white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 12 | 13 | 17 | 20 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /dask/array/tests/test_image.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from contextlib import contextmanager 5 | 6 | import pytest 7 | 8 | pytest.importorskip("skimage") 9 | import numpy as np 10 | from skimage.io import imsave 11 | 12 | from dask.array.image import imread as da_imread 13 | from dask.utils import tmpdir 14 | 15 | 16 | @contextmanager 17 | def random_images(n, shape): 18 | with tmpdir() as dirname: 19 | for i in range(n): 20 | fn = os.path.join(dirname, "image.%d.png" % i) 21 | x = np.random.randint(0, 255, size=shape).astype("u1") 22 | imsave(fn, x, check_contrast=False) 23 | 24 | yield os.path.join(dirname, "*.png") 25 | 26 | 27 | def test_imread(): 28 | with random_images(4, (5, 6, 3)) as globstring: 29 | im = da_imread(globstring) 30 | assert im.shape == (4, 5, 6, 3) 31 | assert im.chunks == ((1, 1, 1, 1), (5,), (6,), (3,)) 32 | assert im.dtype == "uint8" 33 | 34 | assert im.compute().shape == (4, 5, 6, 3) 35 | assert im.compute().dtype == "uint8" 36 | 37 | 38 | def test_imread_with_custom_function(): 39 | def imread2(fn): 40 | return np.ones((2, 3, 4), dtype="i1") 41 | 42 | with random_images(4, (5, 6, 3)) as globstring: 43 | im = da_imread(globstring, imread=imread2) 44 | assert (im.compute() == np.ones((4, 2, 3, 4), dtype="u1")).all() 45 | 46 | 47 | def test_preprocess(): 48 | def preprocess(x): 49 | x[:] = 1 50 | return x[:, :, 0] 51 | 52 | with random_images(4, (2, 3, 4)) as globstring: 53 | im = da_imread(globstring, preprocess=preprocess) 54 | assert (im.compute() == np.ones((4, 2, 3), dtype="u1")).all() 55 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.9.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.9 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | # Optional dependencies 26 | - mimesis 27 | - numpy=1.22 28 | - pandas=1.4 29 | - flask 30 | - fastparquet 31 | - h5py 32 | - 
pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI. 35 | # We should unpin when possible. 36 | # https://github.com/dask/dask/pull/9569 37 | - tiledb-py<0.17.4 38 | - pyspark 39 | - tiledb>=2.5.0 40 | - xarray 41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.4` doesn't support `sqlalchemy=2` 42 | - pyarrow=9 43 | - coverage 44 | - jsonschema 45 | # other -- IO 46 | - boto3 47 | - botocore 48 | - bokeh 49 | - httpretty 50 | - aiohttp 51 | - s3fs 52 | - crick 53 | - cytoolz 54 | - distributed 55 | - ipython 56 | - ipycytoscape 57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 58 | - ipywidgets<8.0.5 59 | - ipykernel<6.22.0 60 | - lz4 61 | - numba 62 | - psutil 63 | - requests 64 | - scikit-image<0.20 65 | - scikit-learn 66 | - scipy 67 | - python-snappy 68 | - sparse 69 | - cachey 70 | - python-graphviz 71 | - python-xxhash 72 | - python-cityhash 73 | - mmh3 74 | - jinja2 75 | - pip 76 | - pip: 77 | - git+https://github.com/dask/distributed 78 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.10.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.10 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | # Optional dependencies 26 | - mimesis 27 | - numpy=1.23 28 | - pandas=1.5 29 | - flask 30 | - fastparquet>=0.8.0 31 | - h5py 32 | - pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI. 35 | # We should unpin when possible. 36 | # https://github.com/dask/dask/pull/9569 37 | - tiledb-py<0.17.4 38 | - pyspark 39 | - tiledb>=2.5.0 40 | - xarray 41 | - sqlalchemy>=1.4.16,<2 # `pandas=1.5` doesn't support `sqlalchemy=2` 42 | - pyarrow=10 43 | - coverage 44 | - jsonschema 45 | # other -- IO 46 | - boto3 47 | - botocore 48 | - bokeh 49 | - httpretty 50 | - aiohttp 51 | - s3fs 52 | - crick 53 | - cytoolz 54 | - distributed 55 | - ipython 56 | - ipycytoscape 57 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 58 | - ipywidgets<8.0.5 59 | - ipykernel<6.22.0 60 | - lz4 61 | - numba 62 | - psutil 63 | - requests 64 | - scikit-image 65 | - scikit-learn 66 | - scipy 67 | - python-snappy 68 | - sparse 69 | - cachey 70 | - python-graphviz 71 | - python-xxhash 72 | - python-cityhash 73 | - mmh3 74 | - jinja2 75 | - pip 76 | - pip: 77 | - git+https://github.com/dask/distributed 78 | -------------------------------------------------------------------------------- /dask/system.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import os 5 | import sys 6 | 7 | try: 8 | import psutil 9 | except ImportError: 10 | psutil = None # type: ignore 11 | 12 | __all__ = ("cpu_count", "CPU_COUNT") 13 | 14 | 15 | def cpu_count(): 16 | """Get the available CPU count for this system. 17 | 18 | Takes the minimum value from the following locations: 19 | 20 | - Total system cpus available on the host. 
21 | - CPU Affinity (if set) 22 | - Cgroups limit (if set) 23 | """ 24 | count = os.cpu_count() 25 | 26 | # Check CPU affinity if available 27 | if psutil is not None: 28 | try: 29 | affinity_count = len(psutil.Process().cpu_affinity()) 30 | if affinity_count > 0: 31 | count = min(count, affinity_count) 32 | except Exception: 33 | pass 34 | 35 | # Check cgroups if available 36 | if sys.platform == "linux": 37 | # The directory name isn't standardized across linux distros, check both 38 | for dirname in ["cpuacct,cpu", "cpu,cpuacct"]: 39 | try: 40 | with open("/sys/fs/cgroup/%s/cpu.cfs_quota_us" % dirname) as f: 41 | quota = int(f.read()) 42 | with open("/sys/fs/cgroup/%s/cpu.cfs_period_us" % dirname) as f: 43 | period = int(f.read()) 44 | # We round up on fractional CPUs 45 | cgroups_count = math.ceil(quota / period) 46 | if cgroups_count > 0: 47 | count = min(count, cgroups_count) 48 | break 49 | except Exception: 50 | pass 51 | 52 | return count 53 | 54 | 55 | CPU_COUNT = cpu_count() 56 | -------------------------------------------------------------------------------- /dask/tests/test_utils_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | import pytest 6 | 7 | from dask import utils_test 8 | from dask.highlevelgraph import HighLevelGraph 9 | from dask.utils_test import _check_warning 10 | 11 | 12 | def test_hlg_layer(): 13 | a = {"x": 1} 14 | b = {"y": (utils_test.inc, "x")} 15 | layers = {"a-layer": a, "bee-layer": b} 16 | dependencies = {"a-layer": set(), "bee-layer": {"a-layer"}} 17 | hg = HighLevelGraph(layers, dependencies) 18 | 19 | assert utils_test.hlg_layer(hg, "a") is hg.layers["a-layer"] 20 | assert utils_test.hlg_layer(hg, "b") is hg.layers["bee-layer"] 21 | with pytest.raises(KeyError, match="No layer starts with"): 22 | utils_test.hlg_layer(hg, "foo") 23 | 24 | 25 | def test_hlg_layer_topological(): 26 | a = {"x": 1} 27 | b = {"y": (utils_test.inc, "x")} 28 | c = {"z": (utils_test.inc, "x")} 29 | d = {"r": (sum, ["y", "z"])} 30 | layers = {"a": a, "b": b, "c": c, "d": d} 31 | dependencies = {"a": set(), "b": {"a"}, "c": {"a"}, "d": {"b", "c"}} 32 | hg = HighLevelGraph(layers, dependencies) 33 | 34 | assert utils_test.hlg_layer_topological(hg, -1) is hg.layers["d"] 35 | assert utils_test.hlg_layer_topological(hg, 0) is hg.layers["a"] 36 | assert utils_test.hlg_layer_topological(hg, 1) in (hg.layers["b"], hg.layers["c"]) 37 | 38 | 39 | def test__check_warning(): 40 | class MyWarning(Warning): 41 | pass 42 | 43 | with warnings.catch_warnings(): 44 | warnings.simplefilter("error") 45 | with _check_warning(True, MyWarning, "foo"): 46 | warnings.warn("foo", MyWarning) 47 | 48 | with pytest.warns(MyWarning, match="foo"): 49 | with _check_warning(False, MyWarning, "foo"): 50 | warnings.warn("foo", MyWarning) 51 | -------------------------------------------------------------------------------- /continuous_integration/environment-3.11.yaml: -------------------------------------------------------------------------------- 1 | # This job includes coverage 2 | name: test-environment 3 | channels: 4 | - conda-forge 5 | - nodefaults 6 | dependencies: 7 | # required dependencies 8 | - python=3.11 9 | - packaging 10 | - pyyaml 11 | - click 12 | - cloudpickle 13 | - partd 14 | - fsspec 15 | - importlib_metadata 16 | - toolz 17 | # test dependencies 18 | - pre-commit 19 | - pytest 20 | - pytest-cov 21 | - pytest-rerunfailures 22 | - pytest-timeout 23 | - pytest-xdist 24 | - moto 25 | 
# Optional dependencies 26 | - mimesis 27 | - numpy 28 | - pandas 29 | - flask 30 | - fastparquet>=0.8.0 31 | - h5py 32 | - pytables 33 | - zarr 34 | # `tiledb-py=0.17.5` lead to strange seg faults in CI, However 0.18 is needed for 3.11 35 | # https://github.com/dask/dask/pull/9569 36 | # - tiledb-py # crashes on Python 3.11 37 | # - pyspark 38 | # - tiledb>=2.5.0 # crashes on Python 3.11 39 | - xarray 40 | - sqlalchemy>=1.4.16 41 | - pyarrow>=11 42 | - coverage 43 | - jsonschema 44 | # # other -- IO 45 | - boto3 46 | - botocore 47 | - bokeh 48 | - httpretty 49 | - aiohttp 50 | - s3fs 51 | # Need a new `crick` release with support for `numpy=1.24+` 52 | # https://github.com/dask/crick/issues/25 53 | # - crick 54 | - cytoolz 55 | - distributed 56 | - ipython 57 | - ipycytoscape 58 | # until https://github.com/jupyter-widgets/ipywidgets/issues/3731 is fixed 59 | - ipywidgets<8.0.5 60 | - ipykernel<6.22.0 61 | - lz4 62 | - numba 63 | - psutil 64 | - requests 65 | - scikit-image 66 | - scikit-learn 67 | - scipy 68 | - python-snappy 69 | - sparse 70 | - cachey 71 | - python-graphviz 72 | - python-cityhash 73 | - python-xxhash 74 | - mmh3 75 | - jinja2 76 | - pip 77 | - pip: 78 | - git+https://github.com/dask/distributed 79 | -------------------------------------------------------------------------------- /dask/dataframe/numeric.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | from pandas.api.types import is_scalar as pd_is_scalar 5 | 6 | from dask.array import Array 7 | from dask.dataframe.core import Series 8 | from dask.delayed import delayed 9 | from dask.utils import derived_from 10 | 11 | __all__ = ("to_numeric",) 12 | 13 | 14 | @derived_from(pd, ua_args=["downcast"]) 15 | def to_numeric(arg, errors="raise", meta=None): 16 | """ 17 | Return type depends on input. Delayed if scalar, otherwise same as input. 18 | For errors, only "raise" and "coerce" are allowed. 19 | """ 20 | if errors not in ("raise", "coerce"): 21 | raise ValueError("invalid error value specified") 22 | 23 | is_series = isinstance(arg, Series) 24 | is_array = isinstance(arg, Array) 25 | is_scalar = pd_is_scalar(arg) 26 | 27 | if not any([is_series, is_array, is_scalar]): 28 | raise TypeError( 29 | "arg must be a list, tuple, dask.array.Array, or dask.dataframe.Series" 30 | ) 31 | 32 | if meta is not None: 33 | if is_scalar: 34 | raise KeyError("``meta`` is not allowed when input is a scalar.") 35 | else: 36 | if is_series or is_array: 37 | meta = pd.to_numeric(arg._meta) 38 | 39 | if is_series: 40 | return arg.map_partitions( 41 | pd.to_numeric, 42 | token=arg._name + "-to_numeric", 43 | meta=meta, 44 | enforce_metadata=False, 45 | errors=errors, 46 | ) 47 | if is_array: 48 | return arg.map_blocks( 49 | pd.to_numeric, 50 | name=arg._name + "-to_numeric", 51 | meta=meta, 52 | errors=errors, 53 | ) 54 | if is_scalar: 55 | return delayed(pd.to_numeric, pure=True)(arg, errors=errors) 56 | -------------------------------------------------------------------------------- /docs/source/deploying-ssh.rst: -------------------------------------------------------------------------------- 1 | SSH 2 | === 3 | 4 | It is easy to set up Dask on informally managed networks of machines using SSH. 5 | This can be done manually using SSH and the 6 | Dask :doc:`command line interface `, 7 | or automatically using either the :class:`dask.distributed.SSHCluster` Python *cluster manager* or the 8 | ``dask-ssh`` command line tool. 
This document describes both of these options. 9 | 10 | .. note:: 11 | Before instantiating an ``SSHCluster``, it is recommended to configure keyless SSH 12 | for your local machine and other machines. For example, on a Mac, to SSH into 13 | localhost (your local machine) you need to ensure that the Remote Login option is enabled in 14 | System Preferences -> Sharing. In addition, ``id_rsa.pub`` should be in 15 | ``authorized_keys`` for keyless login. 16 | 17 | Python Interface 18 | ---------------- 19 | 20 | .. currentmodule:: dask.distributed 21 | 22 | .. autofunction:: SSHCluster 23 | 24 | Command Line 25 | ------------ 26 | 27 | The convenience script ``dask-ssh`` opens several SSH connections to your 28 | target computers and initializes the network accordingly. You can 29 | give it a list of hostnames or IP addresses:: 30 | 31 | $ dask-ssh 192.168.0.1 192.168.0.2 192.168.0.3 192.168.0.4 32 | 33 | Or you can use normal UNIX grouping:: 34 | 35 | $ dask-ssh 192.168.0.{1,2,3,4} 36 | 37 | Or you can specify a hostfile that includes a list of hosts:: 38 | 39 | $ cat hostfile.txt 40 | 192.168.0.1 41 | 192.168.0.2 42 | 192.168.0.3 43 | 192.168.0.4 44 | 45 | $ dask-ssh --hostfile hostfile.txt 46 | 47 | .. note:: 48 | 49 | The command line documentation here may differ depending on your installed 50 | version. We recommend referring to the output of ``dask-ssh --help``. 51 | 52 | .. click:: distributed.cli.dask_ssh:main 53 | :prog: dask-ssh 54 | :show-nested: 55 | -------------------------------------------------------------------------------- /docs/source/how-to/extend-sizeof.rst: -------------------------------------------------------------------------------- 1 | Extend `sizeof` 2 | =============== 3 | 4 | When Dask needs to compute the size of an object in bytes, e.g. to determine which objects to spill to disk, it uses the ``dask.sizeof.sizeof`` registration mechanism. Users who need to define a ``sizeof`` implementation for their own objects can use ``sizeof.register``: 5 | 6 | .. code-block:: python 7 | 8 | >>> import numpy as np 9 | >>> from dask.sizeof import sizeof 10 | >>> @sizeof.register(np.ndarray) 11 | ... def sizeof_numpy_like(array): 12 | ... return array.nbytes 13 | 14 | This code can be executed in order to register the implementation with Dask by placing it in one of the library's modules, e.g. ``__init__.py``. However, this introduces a maintenance burden on the developers of these libraries, and it must be manually imported on all workers in the event that these libraries do not accept the patch. 15 | 16 | Therefore, Dask also exposes an `entrypoint `_ under the group ``dask.sizeof`` to enable third-party libraries to develop and maintain these ``sizeof`` implementations. 17 | 18 | For a fictitious library ``numpy_sizeof_dask.py``, the necessary ``setup.cfg`` configuration would be as follows: 19 | 20 | .. code-block:: ini 21 | 22 | [options.entry_points] 23 | dask.sizeof = 24 | numpy = numpy_sizeof_dask:sizeof_plugin 25 | 26 | whilst ``numpy_sizeof_dask.py`` would contain 27 | 28 | .. code-block:: python 29 | 30 | >>> import numpy as np 31 | >>> def sizeof_plugin(sizeof): 32 | ... @sizeof.register(np.ndarray) 33 | ... def sizeof_numpy_like(array): 34 | ... return array.nbytes 35 | 36 | Upon the first import of ``dask.sizeof``, Dask calls the entrypoint (``sizeof_plugin``) with the ``dask.sizeof.sizeof`` object, which can then be used to register a ``sizeof`` implementation. 
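Whichever route is used, the end result is the same dispatch registration. As a quick sanity check (using a hypothetical ``PackedBuffer`` container rather than a real library type), the registered handler is what ``sizeof`` reports once it is in place:

.. code-block:: python

    >>> from dask.sizeof import sizeof

    >>> class PackedBuffer:             # hypothetical user-defined container
    ...     def __init__(self, payload: bytes):
    ...         self.payload = payload

    >>> @sizeof.register(PackedBuffer)
    ... def sizeof_packed_buffer(buf):
    ...     return len(buf.payload)     # count only the payload bytes

    >>> sizeof(PackedBuffer(b"x" * 1024))
    1024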
37 | -------------------------------------------------------------------------------- /continuous_integration/gpuci/build.sh: -------------------------------------------------------------------------------- 1 | ############################################## 2 | # Dask GPU build and test script for CI # 3 | ############################################## 4 | set -e 5 | NUMARGS=$# 6 | ARGS=$* 7 | 8 | # Arg parsing function 9 | function hasArg { 10 | (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") 11 | } 12 | 13 | # Set path and build parallel level 14 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH 15 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} 16 | 17 | # Set home to the job's workspace 18 | export HOME="$WORKSPACE" 19 | 20 | # Switch to project root; also root of repo checkout 21 | cd "$WORKSPACE" 22 | 23 | # Determine CUDA release version 24 | export CUDA_REL=${CUDA_VERSION%.*} 25 | 26 | ################################################################################ 27 | # SETUP - Check environment 28 | ################################################################################ 29 | 30 | gpuci_logger "Check environment variables" 31 | env 32 | 33 | gpuci_logger "Check GPU usage" 34 | nvidia-smi 35 | 36 | gpuci_logger "Activate conda env" 37 | . /opt/conda/etc/profile.d/conda.sh 38 | conda activate dask 39 | 40 | gpuci_logger "Install distributed" 41 | python -m pip install git+https://github.com/dask/distributed 42 | 43 | gpuci_logger "Install dask" 44 | python -m pip install --no-deps -e . 45 | 46 | gpuci_logger "Install pytest-timeout" 47 | python -m pip install pytest-timeout 48 | 49 | gpuci_logger "Check Python version" 50 | python --version 51 | 52 | gpuci_logger "Check conda environment" 53 | conda info 54 | conda config --show-sources 55 | conda list --show-channel-urls 56 | 57 | gpuci_logger "Python py.test for dask" 58 | py.test $WORKSPACE -n 3 -v -m gpu --junitxml="$WORKSPACE/junit-dask.xml" --cov-config="$WORKSPACE/pyproject.toml" --cov=dask --cov-report=xml:"$WORKSPACE/dask-coverage.xml" --cov-report term 59 | -------------------------------------------------------------------------------- /docs/source/delayed-api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | The ``dask.delayed`` interface consists of one function, ``delayed``: 5 | 6 | - ``delayed`` wraps functions 7 | 8 | Wraps functions. Can be used as a decorator, or around function calls 9 | directly (i.e. ``delayed(foo)(a, b, c)``). Outputs from functions wrapped in 10 | ``delayed`` are proxy objects of type ``Delayed`` that contain a graph of 11 | all operations done to get to this result. 12 | 13 | - ``delayed`` wraps objects 14 | 15 | Wraps objects. Used to create ``Delayed`` proxies directly. 16 | 17 | ``Delayed`` objects can be thought of as representing a key in the dask task 18 | graph. A ``Delayed`` supports *most* python operations, each of which creates 19 | another ``Delayed`` representing the result: 20 | 21 | - Most operators (``*``, ``-``, and so on) 22 | - Item access and slicing (``a[0]``) 23 | - Attribute access (``a.size``) 24 | - Method calls (``a.index(0)``) 25 | 26 | Operations that aren't supported include: 27 | 28 | - Mutating operators (``a += 1``) 29 | - Mutating magics such as ``__setitem__``/``__setattr__`` (``a[0] = 1``, ``a.foo = 1``) 30 | - Iteration. 
(``for i in a: ...``) 31 | - Use as a predicate (``if a: ...``) 32 | 33 | The last two points in particular mean that ``Delayed`` objects cannot be used for 34 | control flow, meaning that no ``Delayed`` can appear in a loop or if statement. 35 | In other words you can't iterate over a ``Delayed`` object, or use it as part of 36 | a condition in an if statement, but ``Delayed`` object can be used in a body of a loop 37 | or if statement (i.e. the example above is fine, but if ``data`` was a ``Delayed`` 38 | object it wouldn't be). 39 | Even with this limitation, many workflows can easily be parallelized. 40 | 41 | .. currentmodule:: dask.delayed 42 | 43 | .. autosummary:: 44 | delayed 45 | Delayed 46 | 47 | .. autofunction:: delayed 48 | .. autoclass:: Delayed 49 | -------------------------------------------------------------------------------- /.github/workflows/additional.yml: -------------------------------------------------------------------------------- 1 | name: Additional 2 | 3 | on: [push, pull_request] 4 | 5 | # Required shell entrypoint to have properly activated conda environments 6 | defaults: 7 | run: 8 | shell: bash -l {0} 9 | 10 | jobs: 11 | doctest: 12 | runs-on: "ubuntu-latest" 13 | timeout-minutes: 90 14 | steps: 15 | - name: Checkout source 16 | uses: actions/checkout@v3.5.3 17 | 18 | - name: Setup Conda Environment 19 | uses: conda-incubator/setup-miniconda@v2.2.0 20 | with: 21 | miniforge-variant: Mambaforge 22 | miniforge-version: latest 23 | use-mamba: true 24 | channel-priority: strict 25 | python-version: "3.10" 26 | environment-file: continuous_integration/environment-3.10.yaml 27 | activate-environment: test-environment 28 | auto-activate-base: false 29 | 30 | - name: Install 31 | run: source continuous_integration/scripts/install.sh 32 | 33 | - name: Run tests 34 | run: pytest -v --doctest-modules --ignore-glob='*/test_*.py' dask 35 | 36 | imports: 37 | runs-on: "ubuntu-latest" 38 | timeout-minutes: 90 39 | strategy: 40 | fail-fast: false 41 | matrix: 42 | python-version: ["3.9", "3.10", "3.11"] 43 | steps: 44 | - name: Checkout source 45 | uses: actions/checkout@v3.5.3 46 | 47 | - name: Setup Conda 48 | uses: conda-incubator/setup-miniconda@v2.2.0 49 | with: 50 | miniforge-variant: Mambaforge 51 | miniforge-version: latest 52 | use-mamba: true 53 | channel-priority: strict 54 | python-version: "3.9" 55 | activate-environment: test-environment 56 | auto-activate-base: false 57 | 58 | - name: Run import tests 59 | env: 60 | PYTHON_VERSION: ${{ matrix.python-version }} 61 | run: source continuous_integration/scripts/test_imports.sh 62 | -------------------------------------------------------------------------------- /dask/tests/test_cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from operator import add 4 | from time import sleep 5 | 6 | import pytest 7 | 8 | from dask.cache import Cache 9 | from dask.callbacks import Callback 10 | from dask.local import get_sync 11 | from dask.threaded import get 12 | 13 | cachey = pytest.importorskip("cachey") 14 | 15 | 16 | flag = [] 17 | 18 | 19 | def inc(x): 20 | flag.append(x) 21 | return x + 1 22 | 23 | 24 | def test_cache(): 25 | c = cachey.Cache(10000) 26 | cc = Cache(c) 27 | 28 | with cc: 29 | assert get({"x": (inc, 1)}, "x") == 2 30 | 31 | assert flag == [1] 32 | assert c.data["x"] == 2 33 | 34 | assert not cc.starttimes 35 | assert not cc.durations 36 | 37 | while flag: 38 | flag.pop() 39 | dsk = {"x": (inc, 1), "y": (inc, 2), 
"z": (add, "x", "y")} 40 | with cc: 41 | assert get(dsk, "z") == 5 42 | 43 | assert flag == [2] # no x present 44 | 45 | assert not Callback.active 46 | 47 | 48 | def test_cache_with_number(): 49 | c = Cache(10000, limit=1) 50 | assert isinstance(c.cache, cachey.Cache) 51 | assert c.cache.available_bytes == 10000 52 | assert c.cache.limit == 1 53 | 54 | 55 | def test_cache_correctness(): 56 | # https://github.com/dask/dask/issues/3631 57 | c = Cache(10000) 58 | da = pytest.importorskip("dask.array") 59 | from numpy import ones, zeros 60 | 61 | z = da.from_array(zeros(1), chunks=10) 62 | o = da.from_array(ones(1), chunks=10) 63 | with c: 64 | assert (z.compute() == 0).all() 65 | assert (o.compute() == 1).all() 66 | 67 | 68 | def f(duration, size, *args): 69 | sleep(duration) 70 | return [0] * size 71 | 72 | 73 | def test_prefer_cheap_dependent(): 74 | dsk = {"x": (f, 0.01, 10), "y": (f, 0.000001, 1, "x")} 75 | c = Cache(10000) 76 | with c: 77 | get_sync(dsk, "y") 78 | 79 | assert c.cache.scorer.cost["x"] < c.cache.scorer.cost["y"] 80 | -------------------------------------------------------------------------------- /docs/source/understanding-performance.rst: -------------------------------------------------------------------------------- 1 | Understanding Performance 2 | ========================= 3 | 4 | The first step in making computations run quickly is to understand the costs involved. 5 | In Python we often rely on tools like 6 | the `CProfile module `_, 7 | `%%prun IPython magic `_, 8 | `VMProf `_, or 9 | `snakeviz `_ 10 | to understand the costs associated with our code. 11 | However, few of these tools work well on multi-threaded or multi-process code, 12 | and fewer still on computations distributed among many machines. 13 | We also have new costs like data transfer, serialization, task scheduling overhead, and more 14 | that we may not be accustomed to tracking. 15 | 16 | Fortunately, the Dask schedulers come with diagnostics 17 | to help you understand the performance characteristics of your computations. 18 | By using these diagnostics and with some thought, 19 | we can often identify the slow parts of troublesome computations. 20 | 21 | The :doc:`single-machine and distributed schedulers ` come with *different* diagnostic tools. 22 | These tools are deeply integrated into each scheduler, 23 | so a tool designed for one will not transfer over to the other. 24 | 25 | These pages provide four options for profiling parallel code: 26 | 27 | 1. :doc:`Visualize task graphs ` 28 | 2. :ref:`Single threaded scheduler and a normal Python profiler ` 29 | 3. :doc:`Diagnostics for the single-machine scheduler ` 30 | 4. 
:doc:`Diagnostics for the distributed scheduler and dashboard ` 31 | 32 | Additionally, if you are interested in understanding the various phases where 33 | slowdown can occur, you may wish to read the following: 34 | 35 | - :doc:`Phases of computation ` 36 | -------------------------------------------------------------------------------- /.github/workflows/conda.yml: -------------------------------------------------------------------------------- 1 | name: Conda build 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | paths: 8 | - setup.py 9 | - continuous_integration/recipe/** 10 | - .github/workflows/conda.yml 11 | 12 | # When this workflow is queued, automatically cancel any previous running 13 | # or pending jobs from the same branch 14 | concurrency: 15 | group: conda-${{ github.head_ref }} 16 | cancel-in-progress: true 17 | 18 | # Required shell entrypoint to have properly activated conda environments 19 | defaults: 20 | run: 21 | shell: bash -l {0} 22 | 23 | jobs: 24 | conda: 25 | name: Build (and upload) 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v3.5.3 29 | with: 30 | fetch-depth: 0 31 | - name: Set up Python 32 | uses: conda-incubator/setup-miniconda@v2.2.0 33 | with: 34 | miniforge-variant: Mambaforge 35 | use-mamba: true 36 | python-version: 3.9 37 | channel-priority: strict 38 | - name: Install dependencies 39 | run: | 40 | mamba install -c conda-forge boa conda-verify 41 | 42 | which python 43 | pip list 44 | mamba list 45 | - name: Build conda package 46 | run: | 47 | # suffix for nightly package versions 48 | export VERSION_SUFFIX=a`date +%y%m%d` 49 | 50 | conda mambabuild continuous_integration/recipe \ 51 | --no-anaconda-upload \ 52 | --output-folder . 53 | - name: Upload conda package 54 | if: | 55 | github.event_name == 'push' 56 | && github.ref == 'refs/heads/main' 57 | && github.repository == 'dask/dask' 58 | env: 59 | ANACONDA_API_TOKEN: ${{ secrets.DASK_CONDA_TOKEN }} 60 | run: | 61 | # install anaconda for upload 62 | mamba install -c conda-forge anaconda-client 63 | 64 | anaconda upload --label dev noarch/*.tar.bz2 65 | -------------------------------------------------------------------------------- /dask/context.py: -------------------------------------------------------------------------------- 1 | """ 2 | Control global computation context 3 | """ 4 | from __future__ import annotations 5 | 6 | import threading 7 | from functools import partial 8 | 9 | from dask import config 10 | 11 | _globals = config.config 12 | 13 | 14 | thread_state = threading.local() 15 | 16 | 17 | def globalmethod(default=None, key=None, falsey=None): 18 | """Allow function to be taken over by globals 19 | 20 | This modifies a method so that occurrences of it may be taken over by 21 | functions registered in the global options. Can be used as a decorator or a 22 | function. 23 | 24 | Parameters 25 | ---------- 26 | default : callable 27 | The default callable to use. 28 | key : str 29 | Key under which we register this function in the global parameters 30 | falsey : callable, None, optional 31 | A function to use if the option is falsey. If not provided, the default 32 | is used instead. 33 | 34 | Examples 35 | -------- 36 | >>> import dask 37 | >>> class Foo: 38 | ... @globalmethod(key='bar', falsey=lambda: 3) 39 | ... def bar(): 40 | ... return 1 41 | >>> f = Foo() 42 | >>> f.bar() 43 | 1 44 | >>> with dask.config.set(bar=lambda: 2): 45 | ... print(f.bar()) 46 | 2 47 | >>> with dask.config.set(bar=False): 48 | ... 
print(f.bar()) 49 | 3 50 | """ 51 | if default is None: 52 | return partial(globalmethod, key=key, falsey=falsey) 53 | return GlobalMethod(default=default, key=key, falsey=falsey) 54 | 55 | 56 | class GlobalMethod: 57 | def __init__(self, default, key, falsey=None): 58 | self._default = default 59 | self._key = key 60 | self._falsey = falsey 61 | 62 | def __get__(self, instance, owner=None): 63 | if self._key in _globals: 64 | if _globals[self._key]: 65 | return _globals[self._key] 66 | elif self._falsey is not None: 67 | return self._falsey 68 | return self._default 69 | -------------------------------------------------------------------------------- /dask/dataframe/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | import dask.dataframe._pyarrow_compat 5 | from dask.base import compute 6 | from dask.dataframe import backends, dispatch, rolling 7 | from dask.dataframe.core import ( 8 | DataFrame, 9 | Index, 10 | Series, 11 | _Frame, 12 | map_partitions, 13 | repartition, 14 | to_datetime, 15 | to_timedelta, 16 | ) 17 | from dask.dataframe.groupby import Aggregation 18 | from dask.dataframe.io import ( 19 | demo, 20 | from_array, 21 | from_dask_array, 22 | from_delayed, 23 | from_dict, 24 | from_map, 25 | from_pandas, 26 | read_csv, 27 | read_fwf, 28 | read_hdf, 29 | read_json, 30 | read_sql, 31 | read_sql_query, 32 | read_sql_table, 33 | read_table, 34 | to_bag, 35 | to_csv, 36 | to_hdf, 37 | to_json, 38 | to_records, 39 | to_sql, 40 | ) 41 | from dask.dataframe.multi import concat, merge, merge_asof 42 | from dask.dataframe.numeric import to_numeric 43 | from dask.dataframe.optimize import optimize 44 | from dask.dataframe.reshape import get_dummies, melt, pivot_table 45 | from dask.dataframe.utils import assert_eq 46 | 47 | try: 48 | from dask.dataframe.io import read_parquet, to_parquet 49 | except ImportError: 50 | pass 51 | try: 52 | from dask.dataframe.io import read_orc, to_orc 53 | except ImportError: 54 | pass 55 | try: 56 | from dask.dataframe.core import isna 57 | except ImportError: 58 | pass 59 | except ImportError as e: 60 | msg = ( 61 | "Dask dataframe requirements are not installed.\n\n" 62 | "Please either conda or pip install as follows:\n\n" 63 | " conda install dask # either conda install\n" 64 | ' python -m pip install "dask[dataframe]" --upgrade # or python -m pip install' 65 | ) 66 | raise ImportError(msg) from e 67 | -------------------------------------------------------------------------------- /dask/dataframe/_pyarrow_compat.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copyreg 4 | 5 | import pandas as pd 6 | 7 | try: 8 | import pyarrow as pa 9 | except ImportError: 10 | pa = None 11 | 12 | from dask.dataframe._compat import PANDAS_GE_150, PANDAS_GE_200 13 | 14 | # Pickling of pyarrow arrays is effectively broken - pickling a slice of an 15 | # array ends up pickling the entire backing array. 16 | # 17 | # See https://issues.apache.org/jira/browse/ARROW-10739 18 | # 19 | # This comes up when using pandas `string[pyarrow]` dtypes, which are backed by 20 | # a `pyarrow.StringArray`. To fix this, we register a *global* override for 21 | # pickling `ArrowStringArray` or `ArrowExtensionArray` types (where available). 
22 | # We do this at the pandas level rather than the pyarrow level for efficiency reasons 23 | # (a pandas ArrowStringArray may contain many small pyarrow StringArray objects). 24 | # 25 | # The implementation here is based on https://github.com/pandas-dev/pandas/pull/49078 26 | # which is included in pandas=2+. We can remove all this once Dask's minimum 27 | # supported pandas version is at least 2.0.0. 28 | 29 | 30 | def rebuild_arrowextensionarray(type_, chunks): 31 | array = pa.chunked_array(chunks) 32 | return type_(array) 33 | 34 | 35 | def reduce_arrowextensionarray(x): 36 | return (rebuild_arrowextensionarray, (type(x), x._data.combine_chunks())) 37 | 38 | 39 | # `pandas=2` includes efficient serialization of `pyarrow`-backed extension arrays. 40 | # See https://github.com/pandas-dev/pandas/pull/49078 for details. 41 | # We only need to backport efficient serialization for `pandas<2`. 42 | if pa is not None and not PANDAS_GE_200: 43 | if PANDAS_GE_150: 44 | # Applies to all `pyarrow`-backed extension arrays (e.g. `string[pyarrow]`, `int64[pyarrow]`) 45 | for type_ in [pd.arrays.ArrowExtensionArray, pd.arrays.ArrowStringArray]: 46 | copyreg.dispatch_table[type_] = reduce_arrowextensionarray 47 | else: 48 | # Only `string[pyarrow]` is implemented, so just patch that 49 | copyreg.dispatch_table[pd.arrays.ArrowStringArray] = reduce_arrowextensionarray 50 | -------------------------------------------------------------------------------- /dask/widgets/templates/highlevelgraph_layer.html.j2: -------------------------------------------------------------------------------- 1 |
2 | 3 | {% if materialized %} 4 | 5 | {% else %} 6 | 7 | {% endif %} 8 | 9 | 10 |
11 | 12 |

Layer{{ layer_index }}: {{ shortname }}

13 |
14 |

15 | {{ highlevelgraph_key }} 16 |

17 | 18 | 19 | 20 | 43 | 46 | 47 |
21 | 22 | {% for key, val in info.items() %} 23 | 24 | 25 | 26 | 27 | {% endfor %} 28 | {% for dep in dependencies %} 29 | {% if loop.index > 1 %} 30 | 31 | 32 | 33 | 34 | {% else %} 35 | 36 | 37 | 38 | 39 | {% endif %} 40 | {% endfor %} 41 |
{{ key }}{{ val }}
{{ dep }}
depends on {{ dep }}
42 |
44 | {{ svg_repr }} 45 |
48 | 49 |
50 |
51 | -------------------------------------------------------------------------------- /docs/source/bag-api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: dask.bag 5 | 6 | Create Bags 7 | ----------- 8 | 9 | .. autosummary:: 10 | :toctree: generated/ 11 | 12 | from_sequence 13 | from_delayed 14 | from_url 15 | range 16 | read_text 17 | read_avro 18 | 19 | From dataframe 20 | ~~~~~~~~~~~~~~ 21 | 22 | .. currentmodule:: dask.dataframe 23 | 24 | .. autosummary:: 25 | :toctree: generated/ 26 | 27 | DataFrame.to_bag 28 | Series.to_bag 29 | 30 | Top-level functions 31 | ------------------- 32 | 33 | .. currentmodule:: dask.bag 34 | 35 | .. autosummary:: 36 | :toctree: generated/ 37 | 38 | concat 39 | map 40 | map_partitions 41 | to_textfiles 42 | zip 43 | 44 | Random Sampling 45 | --------------- 46 | 47 | .. autosummary:: 48 | :toctree: generated/ 49 | 50 | random.choices 51 | random.sample 52 | 53 | 54 | Turn Bags into other things 55 | --------------------------- 56 | 57 | .. autosummary:: 58 | :toctree: generated/ 59 | 60 | Bag.to_textfiles 61 | Bag.to_dataframe 62 | Bag.to_delayed 63 | Bag.to_avro 64 | 65 | 66 | Bag Methods 67 | ----------- 68 | 69 | .. autosummary:: 70 | :toctree: generated/ 71 | 72 | Bag 73 | Bag.accumulate 74 | Bag.all 75 | Bag.any 76 | Bag.compute 77 | Bag.count 78 | Bag.distinct 79 | Bag.filter 80 | Bag.flatten 81 | Bag.fold 82 | Bag.foldby 83 | Bag.frequencies 84 | Bag.groupby 85 | Bag.join 86 | Bag.map 87 | Bag.map_partitions 88 | Bag.max 89 | Bag.mean 90 | Bag.min 91 | Bag.persist 92 | Bag.pluck 93 | Bag.product 94 | Bag.reduction 95 | Bag.random_sample 96 | Bag.remove 97 | Bag.repartition 98 | Bag.starmap 99 | Bag.std 100 | Bag.sum 101 | Bag.take 102 | Bag.to_avro 103 | Bag.to_dataframe 104 | Bag.to_delayed 105 | Bag.to_textfiles 106 | Bag.topk 107 | Bag.var 108 | Bag.visualize 109 | 110 | 111 | Item Methods 112 | ------------ 113 | 114 | .. autosummary:: 115 | :toctree: generated/ 116 | 117 | Item 118 | Item.apply 119 | Item.compute 120 | Item.from_delayed 121 | Item.persist 122 | Item.to_delayed 123 | Item.visualize 124 | -------------------------------------------------------------------------------- /dask/cache.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import sys 4 | from numbers import Number 5 | from timeit import default_timer 6 | 7 | from dask.callbacks import Callback 8 | 9 | overhead = sys.getsizeof(1.23) * 4 + sys.getsizeof(()) * 4 10 | 11 | 12 | class Cache(Callback): 13 | """Use cache for computation 14 | 15 | Examples 16 | -------- 17 | 18 | >>> cache = Cache(1e9) # doctest: +SKIP 19 | 20 | The cache can be used locally as a context manager around ``compute`` or 21 | ``get`` calls: 22 | 23 | >>> with cache: # doctest: +SKIP 24 | ... 
result = x.compute() 25 | 26 | You can also register a cache globally, so that it works for all 27 | computations: 28 | 29 | >>> cache.register() # doctest: +SKIP 30 | >>> cache.unregister() # doctest: +SKIP 31 | """ 32 | 33 | def __init__(self, cache, *args, **kwargs): 34 | try: 35 | import cachey 36 | except ImportError as ex: 37 | raise ImportError( 38 | 'Cache requires cachey, "{ex}" problem ' "importing".format(ex=str(ex)) 39 | ) from ex 40 | self._nbytes = cachey.nbytes 41 | if isinstance(cache, Number): 42 | cache = cachey.Cache(cache, *args, **kwargs) 43 | else: 44 | assert not args and not kwargs 45 | self.cache = cache 46 | self.starttimes = dict() 47 | 48 | def _start(self, dsk): 49 | self.durations = dict() 50 | overlap = set(dsk) & set(self.cache.data) 51 | for key in overlap: 52 | dsk[key] = self.cache.data[key] 53 | 54 | def _pretask(self, key, dsk, state): 55 | self.starttimes[key] = default_timer() 56 | 57 | def _posttask(self, key, value, dsk, state, id): 58 | duration = default_timer() - self.starttimes[key] 59 | deps = state["dependencies"][key] 60 | if deps: 61 | duration += max(self.durations.get(k, 0) for k in deps) 62 | self.durations[key] = duration 63 | nb = self._nbytes(value) + overhead + sys.getsizeof(key) * 4 64 | self.cache.put(key, value, cost=duration / nb / 1e9, nbytes=nb) 65 | 66 | def _finish(self, dsk, state, errored): 67 | self.starttimes.clear() 68 | self.durations.clear() 69 | -------------------------------------------------------------------------------- /dask/array/image.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | from glob import glob 5 | 6 | try: 7 | from skimage.io import imread as sk_imread 8 | except (AttributeError, ImportError): 9 | pass 10 | 11 | from dask.array.core import Array 12 | from dask.base import tokenize 13 | 14 | 15 | def add_leading_dimension(x): 16 | return x[None, ...] 17 | 18 | 19 | def imread(filename, imread=None, preprocess=None): 20 | """Read a stack of images into a dask array 21 | 22 | Parameters 23 | ---------- 24 | 25 | filename: string 26 | A globstring like 'myfile.*.png' 27 | imread: function (optional) 28 | Optionally provide custom imread function. 29 | Function should expect a filename and produce a numpy array. 30 | Defaults to ``skimage.io.imread``. 31 | preprocess: function (optional) 32 | Optionally provide custom function to preprocess the image. 33 | Function should expect a numpy array for a single image. 34 | 35 | Examples 36 | -------- 37 | 38 | >>> from dask.array.image import imread 39 | >>> im = imread('2015-*-*.png') # doctest: +SKIP 40 | >>> im.shape # doctest: +SKIP 41 | (365, 1000, 1000, 3) 42 | 43 | Returns 44 | ------- 45 | 46 | Dask array of all images stacked along the first dimension. 47 | Each separate image file will be treated as an individual chunk. 
48 | """ 49 | imread = imread or sk_imread 50 | filenames = sorted(glob(filename)) 51 | if not filenames: 52 | raise ValueError("No files found under name %s" % filename) 53 | 54 | name = "imread-%s" % tokenize(filenames, map(os.path.getmtime, filenames)) 55 | 56 | sample = imread(filenames[0]) 57 | if preprocess: 58 | sample = preprocess(sample) 59 | 60 | keys = [(name, i) + (0,) * len(sample.shape) for i in range(len(filenames))] 61 | if preprocess: 62 | values = [ 63 | (add_leading_dimension, (preprocess, (imread, fn))) for fn in filenames 64 | ] 65 | else: 66 | values = [(add_leading_dimension, (imread, fn)) for fn in filenames] 67 | dsk = dict(zip(keys, values)) 68 | 69 | chunks = ((1,) * len(filenames),) + tuple((d,) for d in sample.shape) 70 | 71 | return Array(dsk, name, chunks, sample.dtype) 72 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: debug-statements 7 | - repo: https://github.com/MarcoGorelli/absolufy-imports 8 | rev: v0.3.1 9 | hooks: 10 | - id: absolufy-imports 11 | name: absolufy-imports 12 | - repo: https://github.com/pycqa/isort 13 | rev: 5.12.0 14 | hooks: 15 | - id: isort 16 | language_version: python3 17 | - repo: https://github.com/asottile/pyupgrade 18 | rev: v3.4.0 19 | hooks: 20 | - id: pyupgrade 21 | args: 22 | - --py39-plus 23 | - repo: https://github.com/psf/black 24 | rev: 23.3.0 25 | hooks: 26 | - id: black 27 | language_version: python3 28 | args: 29 | - --target-version=py39 30 | - repo: https://github.com/pycqa/flake8 31 | rev: 6.0.0 32 | hooks: 33 | - id: flake8 34 | language_version: python3 35 | additional_dependencies: 36 | # NOTE: autoupdate does not pick up flake8-bugbear since it is a transitive 37 | # dependency. Make sure to update flake8-bugbear manually on a regular basis. 
38 | - flake8-bugbear==23.2.13 39 | - repo: https://github.com/codespell-project/codespell 40 | rev: v2.2.4 41 | hooks: 42 | - id: codespell 43 | types_or: [rst, markdown] 44 | files: docs 45 | additional_dependencies: 46 | - tomli 47 | - repo: https://github.com/pre-commit/mirrors-mypy 48 | # pinned due to 49 | # https://github.com/python/typeshed/pull/9771 and 50 | # https://github.com/python/mypy/issues/15257 for DaskCollection.__dask_scheduler__ 51 | rev: v1.1.1 52 | hooks: 53 | - id: mypy 54 | # Override default --ignore-missing-imports 55 | # Use pyproject.toml if possible instead of adding command line parameters here 56 | args: [--warn-unused-configs] 57 | additional_dependencies: 58 | # Type stubs 59 | # - pandas-stubs # TODO 60 | - types-docutils 61 | - types-PyYAML 62 | - types-psutil 63 | - types-requests 64 | - types-setuptools 65 | # Typed libraries 66 | - numpy 67 | - pytest 68 | -------------------------------------------------------------------------------- /dask/dask.yaml: -------------------------------------------------------------------------------- 1 | temporary-directory: null # Directory for local disk like /tmp, /scratch, or /local 2 | 3 | visualization: 4 | engine: null # Default visualization engine to use when calling `.visualize()` on a collection 5 | 6 | tokenize: 7 | ensure-deterministic: false # If true, tokenize will error instead of falling back to uuids 8 | 9 | dataframe: 10 | backend: "pandas" # Backend dataframe library for input IO and data creation 11 | shuffle: 12 | method: null 13 | compression: null # compression for on disk-shuffling. Partd supports ZLib, BZ2, SNAPPY 14 | parquet: 15 | metadata-task-size-local: 512 # Number of files per local metadata-processing task 16 | metadata-task-size-remote: 1 # Number of files per remote metadata-processing task 17 | convert-string: null # Whether to convert string-like data to pyarrow strings 18 | 19 | array: 20 | backend: "numpy" # Backend array library for input IO and data creation 21 | chunk-size: "128MiB" 22 | rechunk: 23 | method: "tasks" # Rechunking method to use 24 | threshold: 4 25 | svg: 26 | size: 120 # pixels 27 | slicing: 28 | split-large-chunks: null # How to handle large output chunks in slicing. Warns by default. 
29 | 30 | optimization: 31 | annotations: 32 | fuse: true # Automatically fuse compatible annotations on layers 33 | fuse: 34 | active: null # Treat as false for dask.dataframe, true for everything else 35 | ave-width: 1 36 | max-width: null # 1.5 + ave_width * log(ave_width + 1) 37 | max-height: .inf 38 | max-depth-new-edges: null # ave_width * 1.5 39 | subgraphs: null # true for dask.dataframe, false for everything else 40 | rename-keys: true 41 | 42 | admin: 43 | traceback: 44 | shorten: 45 | when: 46 | - dask[\\\/]base.py 47 | - distributed[\\\/]client.py 48 | what: 49 | - dask[\\\/]base.py 50 | - dask[\\\/]core.py 51 | - dask[\\\/]array[\\\/]core.py 52 | - dask[\\\/]optimization.py 53 | - dask[\\\/]dataframe[\\\/]core.py 54 | - dask[\\\/]dataframe[\\\/]methods.py 55 | - dask[\\\/]utils.py 56 | - distributed[\\\/]worker.py 57 | - distributed[\\\/]scheduler.py 58 | - distributed[\\\/]client.py 59 | - distributed[\\\/]utils.py 60 | - tornado[\\\/]gen.py 61 | - pandas[\\\/]core[\\\/] 62 | -------------------------------------------------------------------------------- /docs/source/array-stack.rst: -------------------------------------------------------------------------------- 1 | Stack, Concatenate, and Block 2 | ============================= 3 | 4 | Often we have many arrays stored on disk that we want to stack together and 5 | think of as one large array. This is common with geospatial data in which we 6 | might have many HDF5/NetCDF files on disk, one for every day, but we want to do 7 | operations that span multiple days. 8 | 9 | To solve this problem, we use the functions ``da.stack``, ``da.concatenate``, 10 | and ``da.block``. 11 | 12 | Stack 13 | ----- 14 | 15 | We stack many existing Dask arrays into a new array, creating a new dimension 16 | as we go. 17 | 18 | .. code-block:: python 19 | 20 | >>> import dask.array as da 21 | 22 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 23 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 24 | 25 | >>> data = [arr0, arr1] 26 | 27 | >>> x = da.stack(data, axis=0) 28 | >>> x.shape 29 | (2, 3, 4) 30 | 31 | >>> da.stack(data, axis=1).shape 32 | (3, 2, 4) 33 | 34 | >>> da.stack(data, axis=-1).shape 35 | (3, 4, 2) 36 | 37 | This creates a new dimension with length equal to the number of slices 38 | 39 | Concatenate 40 | ----------- 41 | 42 | We concatenate existing arrays into a new array, extending them along an 43 | existing dimension 44 | 45 | .. code-block:: python 46 | 47 | >>> import dask.array as da 48 | >>> import numpy as np 49 | 50 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 51 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 52 | 53 | >>> data = [arr0, arr1] 54 | 55 | >>> x = da.concatenate(data, axis=0) 56 | >>> x.shape 57 | (6, 4) 58 | 59 | >>> da.concatenate(data, axis=1).shape 60 | (3, 8) 61 | 62 | Block 63 | ----- 64 | 65 | We can handle a larger variety of cases with ``da.block`` as it allows 66 | concatenation to be applied over multiple dimensions at once. This is useful if 67 | your chunks tile a space, for example if small squares tile a larger 2-D plane. 68 | 69 | .. code-block:: python 70 | 71 | >>> import dask.array as da 72 | >>> import numpy as np 73 | 74 | >>> arr0 = da.from_array(np.zeros((3, 4)), chunks=(1, 2)) 75 | >>> arr1 = da.from_array(np.ones((3, 4)), chunks=(1, 2)) 76 | 77 | >>> data = [ 78 | ... [arr0, arr1], 79 | ... [arr1, arr0] 80 | ... 
] 81 | 82 | >>> x = da.block(data) 83 | >>> x.shape 84 | (6, 8) 85 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_numeric.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from dask.array import Array, from_array 8 | from dask.dataframe import Series, from_pandas, to_numeric 9 | from dask.dataframe.utils import pyarrow_strings_enabled 10 | from dask.delayed import Delayed 11 | 12 | 13 | @pytest.mark.parametrize("arg", ["5", 5, "5 "]) 14 | def test_to_numeric_on_scalars(arg): 15 | output = to_numeric(arg) 16 | assert isinstance(output, Delayed) 17 | assert output.compute() == 5 18 | 19 | 20 | def test_to_numeric_on_dask_array(): 21 | arg = from_array(["1.0", "2", "-3", "5.1"]) 22 | expected = np.array([1.0, 2.0, -3.0, 5.1]) 23 | output = to_numeric(arg) 24 | assert isinstance(output, Array) 25 | assert list(output.compute()) == list(expected) 26 | 27 | 28 | def test_to_numeric_on_dask_dataframe_series(): 29 | s = pd.Series(["1.0", "2", -3, -5.1]) 30 | arg = from_pandas(s, npartitions=2) 31 | expected = pd.to_numeric(s) 32 | output = to_numeric(arg) 33 | expected_dtype = "int64" 34 | if pyarrow_strings_enabled(): 35 | # `to_numeric` output depends on input dtype 36 | expected_dtype = "Int64" 37 | assert output.dtype == expected_dtype 38 | assert isinstance(output, Series) 39 | assert list(output.compute()) == list(expected) 40 | 41 | 42 | def test_to_numeric_on_dask_dataframe_series_with_meta(): 43 | s = pd.Series(["1.0", "2", -3, -5.1]) 44 | arg = from_pandas(s, npartitions=2) 45 | expected = pd.to_numeric(s) 46 | output = to_numeric(arg, meta=pd.Series([], dtype="float64")) 47 | assert output.dtype == "float64" 48 | assert isinstance(output, Series) 49 | assert list(output.compute()) == list(expected) 50 | 51 | 52 | def test_to_numeric_on_dask_dataframe_dataframe_raises_error(): 53 | s = pd.Series(["1.0", "2", -3, -5.1]) 54 | df = pd.DataFrame({"a": s, "b": s}) 55 | arg = from_pandas(df, npartitions=2) 56 | with pytest.raises(TypeError, match="arg must be a list, tuple, dask."): 57 | to_numeric(arg) 58 | 59 | 60 | def test_to_numeric_raises(): 61 | with pytest.raises(ValueError, match="invalid error value"): 62 | to_numeric("10", errors="invalid") 63 | with pytest.raises(KeyError, match="``meta`` is not allowed"): 64 | to_numeric("10", meta=pd.Series([], dtype="float64")) 65 | -------------------------------------------------------------------------------- /dask/array/cupy_entry_point.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dask.array as da 4 | from dask import config 5 | from dask.array.backends import ArrayBackendEntrypoint, register_cupy 6 | from dask.array.core import Array 7 | from dask.array.dispatch import to_cupy_dispatch 8 | 9 | 10 | def _cupy(strict=True): 11 | try: 12 | import cupy 13 | except ImportError: 14 | if strict: 15 | raise ImportError("Please install `cupy` to use `CupyBackendEntrypoint`") 16 | return None 17 | return cupy 18 | 19 | 20 | def _da_with_cupy_meta(attr, *args, meta=None, **kwargs): 21 | # Call the dask.array api with cupy-based meta 22 | meta = _cupy().empty(()) if meta is None else meta 23 | with config.set({"array.backend": "numpy"}): 24 | return getattr(da, attr)(*args, meta=meta, **kwargs) 25 | 26 | 27 | class 
CupyBackendEntrypoint(ArrayBackendEntrypoint): 28 | def __init__(self): 29 | """Register data-directed dispatch functions""" 30 | if _cupy(strict=False): 31 | register_cupy() 32 | 33 | @classmethod 34 | def to_backend_dispatch(cls): 35 | return to_cupy_dispatch 36 | 37 | @classmethod 38 | def to_backend(cls, data: Array, **kwargs): 39 | if isinstance(data._meta, _cupy().ndarray): 40 | # Already a cupy-backed collection 41 | return data 42 | return data.map_blocks(cls.to_backend_dispatch(), **kwargs) 43 | 44 | @property 45 | def RandomState(self): 46 | return _cupy().random.RandomState 47 | 48 | @property 49 | def default_bit_generator(self): 50 | return _cupy().random.XORWOW 51 | 52 | @staticmethod 53 | def ones(*args, **kwargs): 54 | return _da_with_cupy_meta("ones", *args, **kwargs) 55 | 56 | @staticmethod 57 | def zeros(*args, **kwargs): 58 | return _da_with_cupy_meta("zeros", *args, **kwargs) 59 | 60 | @staticmethod 61 | def empty(*args, **kwargs): 62 | return _da_with_cupy_meta("empty", *args, **kwargs) 63 | 64 | @staticmethod 65 | def full(*args, **kwargs): 66 | return _da_with_cupy_meta("full", *args, **kwargs) 67 | 68 | @staticmethod 69 | def arange(*args, like=None, **kwargs): 70 | like = _cupy().empty(()) if like is None else like 71 | with config.set({"array.backend": "numpy"}): 72 | return da.arange(*args, like=like, **kwargs) 73 | -------------------------------------------------------------------------------- /dask/dataframe/_dtypes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from datetime import date, time 4 | from decimal import Decimal 5 | 6 | import pandas as pd 7 | 8 | from dask.dataframe._compat import PANDAS_GE_150 9 | from dask.dataframe.extensions import make_array_nonempty, make_scalar 10 | 11 | 12 | @make_array_nonempty.register(pd.DatetimeTZDtype) 13 | def _(dtype): 14 | return pd.array([pd.Timestamp(1), pd.NaT], dtype=dtype) 15 | 16 | 17 | @make_scalar.register(pd.DatetimeTZDtype) 18 | def _(x): 19 | return pd.Timestamp(1, tz=x.tz, unit=x.unit) 20 | 21 | 22 | @make_array_nonempty.register(pd.StringDtype) 23 | def _(dtype): 24 | return pd.array(["a", pd.NA], dtype=dtype) 25 | 26 | 27 | if PANDAS_GE_150: 28 | 29 | @make_array_nonempty.register(pd.ArrowDtype) 30 | def _make_array_nonempty_pyarrow_dtype(dtype): 31 | import pyarrow as pa 32 | 33 | if pa.types.is_integer(dtype.pyarrow_dtype): 34 | data = [1, 2] 35 | elif pa.types.is_floating(dtype.pyarrow_dtype): 36 | data = [1.5, 2.5] 37 | elif pa.types.is_boolean(dtype.pyarrow_dtype): 38 | data = [True, False] 39 | elif pa.types.is_string(dtype.pyarrow_dtype) or pa.types.is_large_string( 40 | dtype.pyarrow_dtype 41 | ): 42 | data = ["a", "b"] 43 | elif pa.types.is_timestamp(dtype.pyarrow_dtype): 44 | data = [pd.Timestamp("1970-01-01"), pd.Timestamp("1970-01-02")] 45 | elif pa.types.is_date(dtype.pyarrow_dtype): 46 | data = [date(1970, 1, 1), date(1970, 1, 2)] 47 | elif pa.types.is_binary(dtype.pyarrow_dtype) or pa.types.is_large_binary( 48 | dtype.pyarrow_dtype 49 | ): 50 | data = [b"a", b"b"] 51 | elif pa.types.is_decimal(dtype.pyarrow_dtype): 52 | data = [Decimal("1"), Decimal("0.0")] 53 | elif pa.types.is_duration(dtype.pyarrow_dtype): 54 | data = [pd.Timedelta("1 day"), pd.Timedelta("2 days")] 55 | elif pa.types.is_time(dtype.pyarrow_dtype): 56 | data = [time(12, 0), time(0, 12)] 57 | else: 58 | data = dtype.empty(2) 59 | return pd.array(data, dtype=dtype) 60 | 61 | 62 | @make_scalar.register(str) 63 | def _(x): 64 | return 
"s" 65 | 66 | 67 | @make_array_nonempty.register(pd.BooleanDtype) 68 | def _(dtype): 69 | return pd.array([True, pd.NA], dtype=dtype) 70 | 71 | 72 | @make_scalar.register(bool) 73 | def _(x): 74 | return True 75 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_reductions.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import warnings 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | pytestmark = pytest.mark.gpu 9 | 10 | import dask 11 | import dask.array as da 12 | from dask.array.utils import assert_eq 13 | 14 | cupy = pytest.importorskip("cupy") 15 | 16 | 17 | @pytest.mark.parametrize( 18 | ["dfunc", "func"], 19 | [ 20 | (da.argmin, np.argmin), 21 | (da.argmax, np.argmax), 22 | (da.nanargmin, np.nanargmin), 23 | (da.nanargmax, np.nanargmax), 24 | ], 25 | ) 26 | def test_arg_reductions(dfunc, func): 27 | x = cupy.random.default_rng().random((10, 10, 10)) 28 | a = da.from_array(x, chunks=(3, 4, 5)) 29 | 30 | assert_eq(dfunc(a), func(x)) 31 | assert_eq(dfunc(a, 0), func(x, 0)) 32 | assert_eq(dfunc(a, 1), func(x, 1)) 33 | assert_eq(dfunc(a, 2), func(x, 2)) 34 | with dask.config.set(split_every=2): 35 | assert_eq(dfunc(a), func(x)) 36 | assert_eq(dfunc(a, 0), func(x, 0)) 37 | assert_eq(dfunc(a, 1), func(x, 1)) 38 | assert_eq(dfunc(a, 2), func(x, 2)) 39 | 40 | pytest.raises(ValueError, lambda: dfunc(a, 3)) 41 | pytest.raises(TypeError, lambda: dfunc(a, (0, 1))) 42 | 43 | x2 = cupy.arange(10) 44 | a2 = da.from_array(x2, chunks=3) 45 | assert_eq(dfunc(a2), func(x2)) 46 | assert_eq(dfunc(a2, 0), func(x2, 0)) 47 | assert_eq(dfunc(a2, 0, split_every=2), func(x2, 0)) 48 | 49 | 50 | @pytest.mark.parametrize( 51 | ["dfunc", "func"], [(da.nanargmin, np.nanargmin), (da.nanargmax, np.nanargmax)] 52 | ) 53 | def test_nanarg_reductions(dfunc, func): 54 | x = cupy.random.default_rng().random((10, 10, 10)) 55 | x[5] = cupy.nan 56 | a = da.from_array(x, chunks=(3, 4, 5)) 57 | assert_eq(dfunc(a), func(x)) 58 | assert_eq(dfunc(a, 0), func(x, 0)) 59 | 60 | with warnings.catch_warnings(): 61 | warnings.simplefilter("ignore", RuntimeWarning) # All-NaN slice encountered 62 | with pytest.raises(ValueError): 63 | dfunc(a, 1).compute() 64 | 65 | with pytest.raises(ValueError): 66 | dfunc(a, 2).compute() 67 | 68 | x[:] = cupy.nan 69 | a = da.from_array(x, chunks=(3, 4, 5)) 70 | with pytest.raises(ValueError): 71 | dfunc(a).compute() 72 | 73 | 74 | @pytest.mark.parametrize("func", [np.cumsum, np.cumprod]) 75 | def test_cumreduction_with_cupy(func): 76 | a = cupy.ones((10, 10)) 77 | b = da.from_array(a, chunks=(4, 4)) 78 | result = func(b, axis=0) 79 | assert_eq(result, func(a, axis=0)) 80 | -------------------------------------------------------------------------------- /dask/widgets/templates/highlevelgraph.html.j2: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 |

HighLevelGraph

17 |

18 | {{ type }} with {{ layers | length }} layers and {{ n_outputs }} keys from all layers. 19 |

20 | {% for layer in toposort %} 21 | {{ layers[layer]._repr_html_(layer_index=loop.index, highlevelgraph_key=layer, dependencies=layer_dependencies[layer])}} 22 | {% endfor %} 23 |
24 |
25 |
26 | -------------------------------------------------------------------------------- /.github/workflows/upstream.yml: -------------------------------------------------------------------------------- 1 | name: Upstream 2 | 3 | on: 4 | schedule: 5 | - cron: "0 1 * * *" 6 | push: 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | # Required shell entrypoint to have properly activated conda environments 11 | defaults: 12 | run: 13 | shell: bash -l {0} 14 | 15 | jobs: 16 | 17 | check: 18 | runs-on: ubuntu-latest 19 | if: github.event_name == 'push' || github.event_name == 'pull_request' 20 | outputs: 21 | test-upstream: ${{ steps.detect-trigger.outputs.trigger-found }} 22 | steps: 23 | - uses: actions/checkout@v3.5.3 24 | with: 25 | fetch-depth: 2 26 | - uses: xarray-contrib/ci-trigger@v1 27 | id: detect-trigger 28 | with: 29 | keyword: "test-upstream" 30 | 31 | build: 32 | needs: check 33 | runs-on: ubuntu-latest 34 | if: | 35 | always() 36 | && ( 37 | needs.check.outputs.test-upstream == 'true' 38 | || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'upstream')) 39 | || (github.repository == 'dask/dask' && github.event_name != 'pull_request') 40 | ) 41 | timeout-minutes: 90 42 | 43 | env: 44 | COVERAGE: "true" 45 | PARALLEL: "true" 46 | UPSTREAM_DEV: 1 47 | 48 | steps: 49 | - name: Checkout source 50 | uses: actions/checkout@v3.5.3 51 | 52 | - name: Setup Conda Environment 53 | uses: conda-incubator/setup-miniconda@v2.2.0 54 | with: 55 | miniforge-variant: Mambaforge 56 | miniforge-version: latest 57 | use-mamba: true 58 | channel-priority: strict 59 | python-version: "3.10" 60 | environment-file: continuous_integration/environment-3.10.yaml 61 | activate-environment: test-environment 62 | auto-activate-base: false 63 | 64 | - name: Install 65 | run: source continuous_integration/scripts/install.sh 66 | 67 | - name: Run tests 68 | id: run_tests 69 | env: 70 | XTRATESTARGS: "--report-log output-log.jsonl" 71 | run: source continuous_integration/scripts/run_tests.sh 72 | 73 | - name: Open or update issue on failure 74 | if: | 75 | failure() 76 | && github.event_name != 'pull_request' 77 | && github.repository == 'dask/dask' 78 | && steps.run_tests.outcome == 'failure' 79 | uses: xarray-contrib/issue-from-pytest-log@v1.2.6 80 | with: 81 | log-path: output-log.jsonl 82 | issue-title: ⚠️ Upstream CI failed ⚠️ 83 | issue-label: upstream 84 | 85 | - name: Coverage 86 | uses: codecov/codecov-action@v3 87 | -------------------------------------------------------------------------------- /continuous_integration/scripts/install.sh: -------------------------------------------------------------------------------- 1 | set -xe 2 | 3 | if [[ ${UPSTREAM_DEV} ]]; then 4 | 5 | # NOTE: `dask/tests/test_ci.py::test_upstream_packages_installed` should up be 6 | # updated when pacakges here are updated. 7 | 8 | # FIXME https://github.com/mamba-org/mamba/issues/412 9 | # mamba uninstall --force ... 10 | conda uninstall --force bokeh 11 | mamba install -y -c bokeh/label/dev bokeh 12 | 13 | # FIXME https://github.com/mamba-org/mamba/issues/412 14 | # mamba uninstall --force ... 15 | conda uninstall --force pyarrow 16 | python -m pip install --no-deps \ 17 | --extra-index-url https://pypi.fury.io/arrow-nightlies/ \ 18 | --prefer-binary --pre pyarrow 19 | 20 | # FIXME https://github.com/mamba-org/mamba/issues/412 21 | # mamba uninstall --force ... 
22 | conda uninstall --force fastparquet 23 | python -m pip install \ 24 | --upgrade \ 25 | locket \ 26 | git+https://github.com/pydata/sparse \ 27 | git+https://github.com/dask/s3fs \ 28 | git+https://github.com/intake/filesystem_spec \ 29 | git+https://github.com/dask/partd \ 30 | git+https://github.com/dask/zict \ 31 | git+https://github.com/dask/distributed \ 32 | git+https://github.com/dask/fastparquet \ 33 | git+https://github.com/zarr-developers/zarr-python 34 | 35 | # FIXME https://github.com/mamba-org/mamba/issues/412 36 | # mamba uninstall --force ... 37 | conda uninstall --force numpy pandas scipy 38 | python -m pip install --no-deps --pre --retries 10 \ 39 | -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \ 40 | numpy \ 41 | pandas \ 42 | scipy 43 | 44 | # Used when automatically opening an issue when the `upstream` CI build fails 45 | mamba install pytest-reportlog 46 | 47 | # Crick doesn't work with latest nightly `numpy`. Temporarily remove 48 | # `crick` from the upstream CI environment as a workaround. 49 | # Can restore `crick` once https://github.com/dask/crick/issues/25 is closed. 50 | 51 | # Tiledb is causing segfaults. Temporarily remove `tiledb` and `tiledb-py` 52 | # as a workaround. 53 | 54 | # FIXME https://github.com/mamba-org/mamba/issues/412 55 | # mamba uninstall --force ... 56 | conda uninstall --force crick tiledb tiledb-py 57 | 58 | 59 | fi 60 | 61 | # Install dask 62 | python -m pip install --quiet --no-deps -e .[complete] 63 | echo mamba list 64 | mamba list 65 | 66 | # For debugging 67 | echo -e "--\n--Conda Environment (re-create this with \`conda env create --name -f \`)\n--" 68 | mamba env export | grep -E -v '^prefix:.*$' > env.yaml 69 | cat env.yaml 70 | 71 | set +xe 72 | -------------------------------------------------------------------------------- /docs/source/images/unoverlapping-neighbors.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 20 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | 67 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /.github/workflows/update-gpuci.yml: -------------------------------------------------------------------------------- 1 | name: Check for gpuCI updates 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 * * *" # Daily “At 00:00” UTC 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update-gpuci: 10 | runs-on: ubuntu-latest 11 | if: github.repository == 'dask/dask' 12 | 13 | steps: 14 | - uses: actions/checkout@v3.5.3 15 | 16 | - name: Parse current axis YAML 17 | id: rapids_current 18 | uses: the-coding-turtle/ga-yaml-parser@v0.1.2 19 | with: 20 | file: continuous_integration/gpuci/axis.yaml 21 | 22 | - name: Get latest cuDF nightly version 23 | id: cudf_latest 24 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3 25 | with: 26 | org: "rapidsai-nightly" 27 | package: "cudf" 28 | version_system: "CalVer" 29 | 30 | - name: Get latest UCX-Py nightly version 31 | id: ucx_py_latest 32 | uses: jacobtomlinson/gha-anaconda-package-version@0.1.3 33 | with: 34 | org: "rapidsai-nightly" 35 | package: "ucx-py" 36 | version_system: "CalVer" 37 | 38 | - name: Get old RAPIDS / UCX-Py versions 39 | env: 40 | FULL_RAPIDS_VER: ${{ steps.cudf_latest.outputs.version }} 41 | FULL_UCX_PY_VER: ${{ steps.ucx_py_latest.outputs.version }} 42 | run: | 43 | echo RAPIDS_VER=${{ steps.rapids_current.outputs.RAPIDS_VER_0 }} >> $GITHUB_ENV 44 | echo UCX_PY_VER=$(curl -sL 
https://version.gpuci.io/rapids/${{ steps.rapids_current.outputs.RAPIDS_VER_0 }}) >> $GITHUB_ENV 45 | echo NEW_RAPIDS_VER=${FULL_RAPIDS_VER::-4} >> $GITHUB_ENV 46 | echo NEW_UCX_PY_VER=${FULL_UCX_PY_VER::-4} >> $GITHUB_ENV 47 | 48 | - name: Update RAPIDS version 49 | uses: jacobtomlinson/gha-find-replace@v3 50 | with: 51 | include: 'continuous_integration\/gpuci\/axis\.yaml' 52 | find: "${{ env.RAPIDS_VER }}" 53 | replace: "${{ env.NEW_RAPIDS_VER }}" 54 | regex: false 55 | 56 | - name: Create Pull Request 57 | uses: peter-evans/create-pull-request@v5 58 | if: ${{ env.UCX_PY_VER != env.NEW_UCX_PY_VER }} # make sure new ucx-py nightlies are available 59 | with: 60 | token: ${{ secrets.GITHUB_TOKEN }} 61 | draft: true 62 | commit-message: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`" 63 | title: "Update gpuCI `RAPIDS_VER` to `${{ env.NEW_RAPIDS_VER }}`" 64 | author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 65 | branch: "upgrade-gpuci-rapids" 66 | body: | 67 | New cuDF and ucx-py nightly versions have been detected. 68 | 69 | Updated `axis.yaml` to use `${{ env.NEW_RAPIDS_VER }}`. 70 | -------------------------------------------------------------------------------- /dask/dataframe/hyperloglog.py: -------------------------------------------------------------------------------- 1 | """Implementation of HyperLogLog 2 | 3 | This implements the HyperLogLog algorithm for cardinality estimation, found 4 | in 5 | 6 | Philippe Flajolet, Éric Fusy, Olivier Gandouet and Frédéric Meunier. 7 | "HyperLogLog: the analysis of a near-optimal cardinality estimation 8 | algorithm". 2007 Conference on Analysis of Algorithms. Nice, France 9 | (2007) 10 | 11 | """ 12 | from __future__ import annotations 13 | 14 | import numpy as np 15 | import pandas as pd 16 | from pandas.util import hash_pandas_object 17 | 18 | 19 | def compute_first_bit(a): 20 | "Compute the position of the first nonzero bit for each int in an array." 21 | # TODO: consider making this less memory-hungry 22 | bits = np.bitwise_and.outer(a, 1 << np.arange(32)) 23 | bits = bits.cumsum(axis=1).astype(bool) 24 | return 33 - bits.sum(axis=1) 25 | 26 | 27 | def compute_hll_array(obj, b): 28 | # b is the number of bits 29 | 30 | if not 8 <= b <= 16: 31 | raise ValueError("b should be between 8 and 16") 32 | num_bits_discarded = 32 - b 33 | m = 1 << b 34 | 35 | # Get an array of the hashes 36 | hashes = hash_pandas_object(obj, index=False) 37 | if isinstance(hashes, pd.Series): 38 | hashes = hashes._values 39 | hashes = hashes.astype(np.uint32) 40 | 41 | # Of the first b bits, which is the first nonzero? 
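    # (In detail: shifting right by num_bits_discarded keeps the top b bits of each
    # 32-bit hash, giving the register index j; compute_first_bit then records the
    # position of the lowest set bit of each hash, and the groupby below keeps the
    # per-register maximum of that position, as HyperLogLog requires.)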
42 | j = hashes >> num_bits_discarded 43 | first_bit = compute_first_bit(hashes) 44 | 45 | # Pandas can do the max aggregation 46 | df = pd.DataFrame({"j": j, "first_bit": first_bit}) 47 | series = df.groupby("j").max()["first_bit"] 48 | 49 | # Return a dense array so we can concat them and get a result 50 | # that is easy to deal with 51 | return series.reindex(np.arange(m), fill_value=0).values.astype(np.uint8) 52 | 53 | 54 | def reduce_state(Ms, b): 55 | m = 1 << b 56 | 57 | # We concatenated all of the states, now we need to get the max 58 | # value for each j in both 59 | Ms = Ms.reshape((len(Ms) // m), m) 60 | return Ms.max(axis=0) 61 | 62 | 63 | def estimate_count(Ms, b): 64 | m = 1 << b 65 | 66 | # Combine one last time 67 | M = reduce_state(Ms, b) 68 | 69 | # Estimate cardinality, no adjustments 70 | alpha = 0.7213 / (1 + 1.079 / m) 71 | E = alpha * m / (2.0 ** -(M.astype("f8"))).sum() * m 72 | # ^^^^ starts as unsigned, need a signed type for 73 | # negation operator to do something useful 74 | 75 | # Apply adjustments for small / big cardinalities, if applicable 76 | if E < 2.5 * m: 77 | V = (M == 0).sum() 78 | if V: 79 | return m * np.log(m / V) 80 | if E > 2**32 / 30.0: 81 | return -(2**32) * np.log1p(-E / 2**32) 82 | return E 83 | -------------------------------------------------------------------------------- /docs/source/graph_manipulation.rst: -------------------------------------------------------------------------------- 1 | .. _graph_manipulation: 2 | 3 | Advanced graph manipulation 4 | =========================== 5 | There are some situations where computations with Dask collections will result in 6 | suboptimal memory usage (e.g. an entire Dask DataFrame is loaded into memory). 7 | This may happen when Dask’s scheduler doesn’t automatically delay the computation of 8 | nodes in a task graph to avoid occupying memory with their output for prolonged periods 9 | of time, or in scenarios where recalculating nodes is much cheaper than holding their 10 | output in memory. 11 | 12 | This page highlights a set of graph manipulation utilities which can be used to help 13 | avoid these scenarios. In particular, the utilities described below rewrite the 14 | underlying Dask graph for Dask collections, producing equivalent collections with 15 | different sets of keys. 16 | 17 | Consider the following example: 18 | 19 | .. code-block:: python 20 | 21 | >>> import dask.array as da 22 | >>> x = da.random.default_rng().normal(size=500_000_000, chunks=100_000) 23 | >>> x_mean = x.mean() 24 | >>> y = (x - x_mean).max().compute() 25 | 26 | The above example computes the largest value of a distribution after removing its bias. 27 | This involves loading the chunks of ``x`` into memory in order to compute ``x_mean``. 28 | However, since the ``x`` array is needed later in the computation to compute ``y``, the 29 | entire ``x`` array is kept in memory. For large Dask Arrays this can be very 30 | problematic. 31 | 32 | To alleviate the need for the entire ``x`` array to be kept in memory, one could rewrite 33 | the last line as follows: 34 | 35 | .. code-block:: python 36 | 37 | >>> from dask.graph_manipulation import bind 38 | >>> xb = bind(x, x_mean) 39 | >>> y = (xb - x_mean).max().compute() 40 | 41 | Here we use :func:`~dask.graph_manipulation.bind` to create a new Dask Array, ``xb``, 42 | which produces exactly the same output as ``x``, but whose underlying Dask graph has 43 | different keys than ``x``, and will only be computed after ``x_mean`` has been 44 | calculated. 
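One way to see that ``bind`` handed back a re-keyed copy rather than the original collection is to compare the collection names (a minimal check; the generated token in each name will differ from run to run):

.. code-block:: python

    >>> x.name == xb.name   # the bound copy has its own set of keys
    False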
45 | 46 | This results in the chunks of ``x`` being computed and immediately individually reduced 47 | by ``mean``; then recomputed and again immediately pipelined into the subtraction 48 | followed by reduction with ``max``. This results in a much smaller peak memory usage as 49 | the full ``x`` array is no longer loaded into memory. However, the tradeoff is that the 50 | compute time increases as ``x`` is computed twice. 51 | 52 | 53 | API 54 | --- 55 | 56 | .. currentmodule:: dask.graph_manipulation 57 | 58 | .. autosummary:: 59 | 60 | checkpoint 61 | wait_on 62 | bind 63 | clone 64 | 65 | 66 | Definitions 67 | ~~~~~~~~~~~ 68 | 69 | .. autofunction:: checkpoint 70 | .. autofunction:: wait_on 71 | .. autofunction:: bind 72 | .. autofunction:: clone 73 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | import dask 6 | 7 | # The doctests in these files fail due to either: 8 | # - Non-required dependencies not being installed 9 | # - Imported doctests due to pulling the docstrings from other packages 10 | # (e.g. `numpy`). No need to run these doctests. 11 | collect_ignore = [ 12 | "dask/bytes/hdfs3.py", 13 | "dask/bytes/pyarrow.py", 14 | "dask/bytes/s3.py", 15 | "dask/array/ghost.py", 16 | "dask/array/fft.py", 17 | "dask/dataframe/io/io.py", 18 | "dask/dataframe/io/parquet/arrow.py", 19 | "dask/dot.py", 20 | "dask/ml.py", 21 | ] 22 | 23 | collect_ignore_glob = [] 24 | try: 25 | import numpy # noqa: F401 26 | except ImportError: 27 | collect_ignore_glob.append("dask/array/*") 28 | 29 | try: 30 | import pandas # noqa: F401 31 | except ImportError: 32 | collect_ignore_glob.append("dask/dataframe/*") 33 | 34 | try: 35 | import scipy # noqa: F401 36 | except ImportError: 37 | collect_ignore.append("dask/array/stats.py") 38 | 39 | try: 40 | import pyarrow # noqa: F401 41 | except ImportError: 42 | collect_ignore.append("dask/dataframe/io/orc/arrow.py") 43 | 44 | try: 45 | import tiledb # noqa: F401 46 | except ImportError: 47 | collect_ignore.append("dask/array/tiledb_io.py") 48 | 49 | try: 50 | import sqlalchemy # noqa: F401 51 | except ImportError: 52 | collect_ignore.append("dask/dataframe/io/sql.py") 53 | 54 | 55 | def pytest_addoption(parser): 56 | parser.addoption("--runslow", action="store_true", help="run slow tests") 57 | 58 | 59 | def pytest_runtest_setup(item): 60 | if "slow" in item.keywords and not item.config.getoption("--runslow"): 61 | pytest.skip("need --runslow option to run") 62 | 63 | 64 | try: 65 | from dask.dataframe.utils import pyarrow_strings_enabled 66 | 67 | convert_string = pyarrow_strings_enabled() 68 | except (ImportError, RuntimeError): 69 | convert_string = False 70 | 71 | skip_with_pyarrow_strings = pytest.mark.skipif( 72 | convert_string, 73 | reason="No need to run with pyarrow strings", 74 | ) 75 | 76 | xfail_with_pyarrow_strings = pytest.mark.xfail( 77 | convert_string, 78 | reason="Known failure with pyarrow strings", 79 | ) 80 | 81 | 82 | def pytest_collection_modifyitems(config, items): 83 | for item in items: 84 | if "skip_with_pyarrow_strings" in item.keywords: 85 | item.add_marker(skip_with_pyarrow_strings) 86 | if "xfail_with_pyarrow_strings" in item.keywords: 87 | item.add_marker(xfail_with_pyarrow_strings) 88 | 89 | 90 | pytest.register_assert_rewrite( 91 | "dask.array.utils", "dask.dataframe.utils", "dask.bag.utils" 92 | ) 93 | 94 | 95 | 
@pytest.fixture(params=["disk", "tasks"]) 96 | def shuffle_method(request): 97 | with dask.config.set({"dataframe.shuffle.method": request.param}): 98 | yield request.param 99 | -------------------------------------------------------------------------------- /docs/source/images/optimize_dask5.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | %3 11 | 12 | 13 | 14 | 1242807449933300231 15 | 16 | count1 17 | 18 | 19 | 20 | 9194842205208052348 21 | 22 | print1 23 | 24 | 25 | 26 | 1242807449933300231->9194842205208052348 27 | 28 | 29 | 30 | 31 | 32 | 6590722590589999451 33 | 34 | count2 35 | 36 | 37 | 38 | 5022637276554243765 39 | 40 | print2 41 | 42 | 43 | 44 | 6590722590589999451->5022637276554243765 45 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /dask/hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import binascii 4 | import hashlib 5 | 6 | hashers = [] # In decreasing performance order 7 | 8 | 9 | # Timings on a largish array: 10 | # - CityHash is 2x faster than MurmurHash 11 | # - xxHash is slightly slower than CityHash 12 | # - MurmurHash is 8x faster than SHA1 13 | # - SHA1 is significantly faster than all other hashlib algorithms 14 | 15 | try: 16 | import cityhash # `python -m pip install cityhash` 17 | except ImportError: 18 | pass 19 | else: 20 | # CityHash disabled unless the reference leak in 21 | # https://github.com/escherba/python-cityhash/pull/16 22 | # is fixed. 23 | if cityhash.__version__ >= "0.2.2": 24 | 25 | def _hash_cityhash(buf): 26 | """ 27 | Produce a 16-bytes hash of *buf* using CityHash. 28 | """ 29 | h = cityhash.CityHash128(buf) 30 | return h.to_bytes(16, "little") 31 | 32 | hashers.append(_hash_cityhash) 33 | 34 | try: 35 | import xxhash # `python -m pip install xxhash` 36 | except ImportError: 37 | pass 38 | else: 39 | 40 | def _hash_xxhash(buf): 41 | """ 42 | Produce a 8-bytes hash of *buf* using xxHash. 43 | """ 44 | return xxhash.xxh64(buf).digest() 45 | 46 | hashers.append(_hash_xxhash) 47 | 48 | try: 49 | import mmh3 # `python -m pip install mmh3` 50 | except ImportError: 51 | pass 52 | else: 53 | 54 | def _hash_murmurhash(buf): 55 | """ 56 | Produce a 16-bytes hash of *buf* using MurmurHash. 57 | """ 58 | return mmh3.hash_bytes(buf) 59 | 60 | hashers.append(_hash_murmurhash) 61 | 62 | 63 | def _hash_sha1(buf): 64 | """ 65 | Produce a 20-bytes hash of *buf* using SHA1. 66 | """ 67 | return hashlib.sha1(buf).digest() 68 | 69 | 70 | hashers.append(_hash_sha1) 71 | 72 | 73 | def hash_buffer(buf, hasher=None): 74 | """ 75 | Hash a bytes-like (buffer-compatible) object. This function returns 76 | a good quality hash but is not cryptographically secure. The fastest 77 | available algorithm is selected. A fixed-length bytes object is returned. 78 | """ 79 | if hasher is not None: 80 | try: 81 | return hasher(buf) 82 | except (TypeError, OverflowError): 83 | # Some hash libraries may have overly-strict type checking, 84 | # not accepting all buffers 85 | pass 86 | for hasher in hashers: 87 | try: 88 | return hasher(buf) 89 | except (TypeError, OverflowError): 90 | pass 91 | raise TypeError(f"unsupported type for hashing: {type(buf)}") 92 | 93 | 94 | def hash_buffer_hex(buf, hasher=None): 95 | """ 96 | Same as hash_buffer, but returns its result in hex-encoded form. 
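
    Examples
    --------
    The exact digest depends on which hash library is available; with the
    SHA1 fallback, for example:

    >>> hash_buffer_hex(b"hello")  # doctest: +SKIP
    'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'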
97 | """ 98 | h = hash_buffer(buf, hasher) 99 | s = binascii.b2a_hex(h) 100 | return s.decode() 101 | -------------------------------------------------------------------------------- /docs/source/images/dask_horizontal.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | from pandas.util import hash_pandas_object 7 | 8 | import dask.dataframe as dd 9 | from dask.dataframe import _compat 10 | from dask.dataframe._compat import tm 11 | from dask.dataframe.utils import assert_eq 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "obj", 16 | [ 17 | pd.Series([1, 2, 3]), 18 | pd.Series([1.0, 1.5, 3.2]), 19 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), 20 | pd.Series(["a", "b", "c"]), 21 | pd.Series([True, False, True]), 22 | pd.Index([1, 2, 3]), 23 | pd.Index([True, False, True]), 24 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), 25 | _compat.makeMissingDataframe(), 26 | _compat.makeMixedDataFrame(), 27 | _compat.makeTimeDataFrame(), 28 | _compat.makeTimeSeries(), 29 | _compat.makeTimedeltaIndex(), 30 | ], 31 | ) 32 | def test_hash_pandas_object(obj): 33 | a = hash_pandas_object(obj) 34 | b = hash_pandas_object(obj) 35 | if isinstance(a, np.ndarray): 36 | np.testing.assert_equal(a, b) 37 | else: 38 | assert_eq(a, b) 39 | 40 | 41 | def test_categorical_consistency(): 42 | # Check that categoricals hash consistent with their values, not codes 43 | # This should work for categoricals of any dtype 44 | for s1 in [ 45 | pd.Series(["a", "b", "c", "d"]), 46 | pd.Series([1000, 2000, 3000, 4000]), 47 | pd.Series(pd.date_range(0, periods=4)), 48 | ]: 49 | s2 = s1.astype("category").cat.set_categories(s1) 50 | s3 = s2.cat.set_categories(list(reversed(s1))) 51 | for categorize in [True, False]: 52 | # These should all hash identically 53 | h1 = hash_pandas_object(s1, categorize=categorize) 54 | h2 = hash_pandas_object(s2, categorize=categorize) 55 | h3 = hash_pandas_object(s3, categorize=categorize) 56 | tm.assert_series_equal(h1, h2) 57 | tm.assert_series_equal(h1, h3) 58 | 59 | 60 | def test_object_missing_values(): 61 | # Check that the presence of missing values doesn't change how object dtype 62 | # is hashed. 
63 | s = pd.Series(["a", "b", "c", None]) 64 | h1 = hash_pandas_object(s).iloc[:3] 65 | h2 = hash_pandas_object(s.iloc[:3]) 66 | tm.assert_series_equal(h1, h2) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | "obj", 71 | [ 72 | pd.Index([1, 2, 3]), 73 | pd.Index([True, False, True]), 74 | pd.Series([1, 2, 3]), 75 | pd.Series([1.0, 1.5, 3.2]), 76 | pd.Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]), 77 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}), 78 | pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}, index=["a", "z", "x"]), 79 | ], 80 | ) 81 | def test_hash_object_dispatch(obj): 82 | result = dd.dispatch.hash_object_dispatch(obj) 83 | expected = pd.util.hash_pandas_object(obj) 84 | assert_eq(result, expected) 85 | -------------------------------------------------------------------------------- /dask/array/tests/test_wrap.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | pytest.importorskip("numpy") 6 | 7 | import numpy as np 8 | 9 | import dask.array as da 10 | from dask.array.utils import assert_eq 11 | from dask.array.wrap import ones 12 | 13 | 14 | def test_ones(): 15 | a = ones((10, 10), dtype="i4", chunks=(4, 4)) 16 | x = np.array(a) 17 | assert (x == np.ones((10, 10), "i4")).all() 18 | 19 | assert a.name.startswith("ones_like-") 20 | 21 | 22 | def test_size_as_list(): 23 | a = ones([10, 10], dtype="i4", chunks=(4, 4)) 24 | x = np.array(a) 25 | assert (x == np.ones((10, 10), dtype="i4")).all() 26 | 27 | 28 | def test_singleton_size(): 29 | a = ones(10, dtype="i4", chunks=(4,)) 30 | x = np.array(a) 31 | assert (x == np.ones(10, dtype="i4")).all() 32 | 33 | 34 | def test_kwargs(): 35 | a = ones(10, dtype="i4", chunks=(4,)) 36 | x = np.array(a) 37 | assert (x == np.ones(10, dtype="i4")).all() 38 | 39 | 40 | def test_full(): 41 | a = da.full((3, 3), 100, chunks=(2, 2), dtype="i8") 42 | 43 | assert (a.compute() == 100).all() 44 | assert a.dtype == a.compute(scheduler="sync").dtype == "i8" 45 | 46 | assert a.name.startswith("full_like-") 47 | 48 | 49 | def test_full_error_nonscalar_fill_value(): 50 | with pytest.raises(ValueError, match="fill_value must be scalar"): 51 | da.full((3, 3), [100, 100], chunks=(2, 2), dtype="i8") 52 | 53 | 54 | def test_full_detects_da_dtype(): 55 | x = da.from_array(100) 56 | with pytest.warns(FutureWarning, match="not implemented by Dask array") as record: 57 | # This shall not raise an NotImplementedError due to dtype detected as object. 
58 | a = da.full(shape=(3, 3), fill_value=x) 59 | assert a.dtype == x.dtype 60 | assert_eq(a, np.full(shape=(3, 3), fill_value=100)) 61 | assert len(record) == 1 62 | 63 | 64 | def test_full_none_dtype(): 65 | a = da.full(shape=(3, 3), fill_value=100, dtype=None) 66 | assert_eq(a, np.full(shape=(3, 3), fill_value=100, dtype=None)) 67 | 68 | 69 | def test_full_like_error_nonscalar_fill_value(): 70 | x = np.full((3, 3), 1, dtype="i8") 71 | with pytest.raises(ValueError, match="fill_value must be scalar"): 72 | da.full_like(x, [100, 100], chunks=(2, 2), dtype="i8") 73 | 74 | 75 | def test_can_make_really_big_array_of_ones(): 76 | ones((1000000, 1000000), chunks=(100000, 100000)) 77 | ones(shape=(1000000, 1000000), chunks=(100000, 100000)) 78 | 79 | 80 | def test_wrap_consistent_names(): 81 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) == sorted( 82 | ones(10, dtype="i4", chunks=(4,)).dask 83 | ) 84 | assert sorted(ones(10, dtype="i4", chunks=(4,)).dask) != sorted( 85 | ones(10, chunks=(4,)).dask 86 | ) 87 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask) == sorted( 88 | da.full((3, 3), 100, chunks=(2, 2), dtype="f8").dask 89 | ) 90 | assert sorted(da.full((3, 3), 100, chunks=(2, 2), dtype="i2").dask) != sorted( 91 | da.full((3, 3), 100, chunks=(2, 2)).dask 92 | ) 93 | -------------------------------------------------------------------------------- /dask/array/tests/test_svg.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import xml.etree.ElementTree 4 | 5 | import pytest 6 | 7 | import dask.array as da 8 | from dask.array.svg import draw_sizes 9 | 10 | 11 | def parses(text): 12 | cleaned = text.replace("→", "") # xml doesn't like righarrow character 13 | assert xml.etree.ElementTree.fromstring(cleaned) is not None # parses cleanly 14 | 15 | 16 | def test_basic(): 17 | parses(da.ones(10).to_svg()) 18 | parses(da.ones((10, 10)).to_svg()) 19 | parses(da.ones((10, 10, 10)).to_svg()) 20 | parses(da.ones((10, 10, 10, 10)).to_svg()) 21 | parses(da.ones((10, 10, 10, 10, 10)).to_svg()) 22 | parses(da.ones((10, 10, 10, 10, 10, 10)).to_svg()) 23 | parses(da.ones((10, 10, 10, 10, 10, 10, 10)).to_svg()) 24 | 25 | 26 | def test_repr_html(): 27 | pytest.importorskip("jinja2") 28 | assert da.ones([])._repr_html_() 29 | assert da.ones(10)[:0]._repr_html_() 30 | assert da.ones(10)._repr_html_() 31 | assert da.ones((10, 10))._repr_html_() 32 | assert da.ones((10, 10, 10))._repr_html_() 33 | assert da.ones((10, 10, 10, 10))._repr_html_() 34 | 35 | 36 | def test_errors(): 37 | # empty arrays 38 | with pytest.raises(NotImplementedError) as excpt: 39 | da.ones([]).to_svg() 40 | assert "0 dimensions" in str(excpt.value) 41 | 42 | # Scalars 43 | with pytest.raises(NotImplementedError) as excpt: 44 | da.asarray(1).to_svg() 45 | assert "0 dimensions" in str(excpt.value) 46 | 47 | # 0-length dims arrays 48 | with pytest.raises(NotImplementedError) as excpt: 49 | da.ones(10)[:0].to_svg() 50 | assert "0-length dimensions" in str(excpt.value) 51 | 52 | # unknown chunk sizes 53 | with pytest.raises(NotImplementedError) as excpt: 54 | x = da.ones(10) 55 | x = x[x > 5] 56 | x.to_svg() 57 | assert "unknown chunk sizes" in str(excpt.value) 58 | 59 | 60 | def test_repr_html_size_units(): 61 | pytest.importorskip("jinja2") 62 | x = da.ones((10000, 5000)) 63 | x = da.ones((3000, 10000), chunks=(1000, 1000)) 64 | text = x._repr_html_() 65 | 66 | assert "MB" in text or "MiB" in text 67 | assert str(x.shape) in text 68 | 
assert str(x.dtype) in text 69 | 70 | parses(text) 71 | 72 | x = da.ones((3000, 10000, 50), chunks=(1000, 1000, 10)) 73 | parses(x._repr_html_()) 74 | 75 | 76 | def test_draw_sizes(): 77 | assert draw_sizes((10, 10), size=100) == (100, 100) # respect symmetry 78 | assert draw_sizes((10, 10), size=200) == (200, 200) # respect size keyword 79 | assert draw_sizes((10, 5), size=100) == (100, 50) # respect small ratios 80 | 81 | a, b, c = draw_sizes((1000, 100, 10)) 82 | assert a > b 83 | assert b > c 84 | assert a < b * 5 85 | assert b < c * 5 86 | 87 | 88 | def test_too_many_lines_fills_sides_darker(): 89 | data = da.ones((16000, 2400, 3600), chunks=(1, 2400, 3600)) 90 | text = data.to_svg() 91 | assert "8B4903" in text 92 | assert text.count("\n") < 300 93 | 94 | 95 | def test_3d(): 96 | text = da.ones((10, 10, 10, 10, 10)).to_svg() 97 | assert text.count("`_ 18 | and so it is suitable for use both normally as a Jupyter server, and also as 19 | part of a JupyterHub deployment. It also includes a matching Dask software 20 | environment described above. This image is about 2GB in size. 21 | 22 | Example 23 | ------- 24 | 25 | Here is a simple example on a dedicated virtual network 26 | 27 | .. code-block:: bash 28 | 29 | docker network create dask 30 | 31 | docker run --network dask -p 8787:8787 --name scheduler ghcr.io/dask/dask dask-scheduler # start scheduler 32 | 33 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 34 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 35 | docker run --network dask ghcr.io/dask/dask dask-worker scheduler:8786 # start worker 36 | 37 | docker run --network dask -p 8888:8888 ghcr.io/dask/dask-notebook # start Jupyter server 38 | 39 | Then from within the notebook environment you can connect to the Dask cluster like this: 40 | 41 | .. code-block:: python 42 | 43 | from dask.distributed import Client 44 | client = Client("scheduler:8786") 45 | client 46 | 47 | Extensibility 48 | ------------- 49 | 50 | Users can mildly customize the software environment by populating the 51 | environment variables ``EXTRA_APT_PACKAGES``, ``EXTRA_CONDA_PACKAGES``, and 52 | ``EXTRA_PIP_PACKAGES``. If these environment variables are set in the container, 53 | they will trigger calls to the following respectively:: 54 | 55 | apt-get install $EXTRA_APT_PACKAGES 56 | conda install $EXTRA_CONDA_PACKAGES 57 | python -m pip install $EXTRA_PIP_PACKAGES 58 | 59 | For example, the following ``conda`` installs the ``joblib`` package into 60 | the Dask worker software environment: 61 | 62 | .. code-block:: bash 63 | 64 | docker run --network dask -e EXTRA_CONDA_PACKAGES="joblib" ghcr.io/dask/dask dask-worker scheduler:8786 65 | 66 | Note that using these can significantly delay the container from starting, 67 | especially when using ``apt``, or ``conda`` (``pip`` is relatively fast). 68 | 69 | Remember that it is important for software versions to match between Dask 70 | workers and Dask clients. As a result, it is often useful to include the same 71 | extra packages in both Jupyter and Worker images. 72 | 73 | Source 74 | ------ 75 | 76 | Docker files are maintained at https://github.com/dask/dask-docker. 77 | This repository also includes a docker-compose configuration. 
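As a rough sketch only (the service names, image tags, ports, and environment variables below are illustrative assumptions, not the contents of that repository's actual file), a Compose file equivalent to the ``docker network`` example above might look like this:

.. code-block:: yaml

    # Illustrative sketch, not the actual docker-compose.yml from the dask-docker repository
    services:
      scheduler:
        image: ghcr.io/dask/dask
        command: dask-scheduler
        ports:
          - "8787:8787"          # diagnostic dashboard
      worker:
        image: ghcr.io/dask/dask
        command: dask-worker scheduler:8786
        environment:
          - EXTRA_PIP_PACKAGES=joblib   # optional extras, as described above
      notebook:
        image: ghcr.io/dask/dask-notebook
        ports:
          - "8888:8888"
        environment:
          - DASK_SCHEDULER_ADDRESS=tcp://scheduler:8786

Running ``docker compose up`` next to such a file would start the scheduler, a worker, and the Jupyter server on a shared network, publishing the same ports as the manual ``docker run`` example above.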
78 | -------------------------------------------------------------------------------- /dask/array/tests/test_cupy_percentile.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | pytestmark = pytest.mark.gpu 7 | 8 | import dask.array as da 9 | from dask.array.utils import assert_eq, same_keys 10 | 11 | cupy = pytest.importorskip("cupy") 12 | 13 | 14 | def test_percentile(): 15 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 16 | qs = np.array([0, 50, 100]) 17 | 18 | result = da.percentile(d, qs, method="midpoint") 19 | assert_eq(result, np.array([1, 1, 1], dtype=d.dtype), check_type=False) 20 | 21 | x = cupy.array([0, 0, 5, 5, 5, 5, 20, 20]) 22 | d = da.from_array(x, chunks=(3,)) 23 | 24 | result = da.percentile(d, qs, method="midpoint") 25 | assert_eq(result, np.array([0, 5, 20], dtype=result.dtype), check_type=False) 26 | 27 | assert not same_keys( 28 | da.percentile(d, qs, "midpoint"), 29 | da.percentile(d, [0, 50], "midpoint"), 30 | ) 31 | 32 | 33 | @pytest.mark.xfail( 34 | reason="Non-deterministic tokenize(cupy.array(...)), " 35 | "see https://github.com/dask/dask/issues/6718" 36 | ) 37 | def test_percentile_tokenize(): 38 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 39 | qs = np.array([0, 50, 100]) 40 | assert same_keys(da.percentile(d, qs), da.percentile(d, qs)) 41 | 42 | 43 | def test_percentiles_with_empty_arrays(): 44 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),)) 45 | result = da.percentile(x, [10, 50, 90], method="midpoint") 46 | assert type(result._meta) == cupy.ndarray 47 | assert_eq(result, result) # Check that _meta and computed arrays match types 48 | assert_eq(result, np.array([1, 1, 1], dtype=x.dtype), check_type=False) 49 | 50 | 51 | def test_percentiles_with_empty_q(): 52 | x = da.from_array(cupy.ones(10), chunks=((5, 0, 5),)) 53 | result = da.percentile(x, [], method="midpoint") 54 | assert type(result._meta) == cupy.ndarray 55 | assert_eq(result, result) # Check that _meta and computed arrays match types 56 | assert_eq(result, np.array([], dtype=x.dtype), check_type=False) 57 | 58 | 59 | @pytest.mark.parametrize("q", [5, 5.0, np.int64(5), np.float64(5)]) 60 | def test_percentiles_with_scaler_percentile(q): 61 | # Regression test to ensure da.percentile works with scalar percentiles 62 | # See #3020 63 | d = da.from_array(cupy.ones((16,)), chunks=(4,)) 64 | result = da.percentile(d, q, method="midpoint") 65 | assert type(result._meta) == cupy.ndarray 66 | assert_eq(result, result) # Check that _meta and computed arrays match types 67 | assert_eq(result, np.array([1], dtype=d.dtype), check_type=False) 68 | 69 | 70 | def test_percentiles_with_unknown_chunk_sizes(): 71 | rng = da.random.default_rng(cupy.random.default_rng()) 72 | x = rng.random(1000, chunks=(100,)) 73 | x._chunks = ((np.nan,) * 10,) 74 | 75 | result = da.percentile(x, 50, method="midpoint").compute() 76 | assert type(result) == cupy.ndarray 77 | assert 0.1 < result < 0.9 78 | 79 | a, b = da.percentile(x, [40, 60], method="midpoint").compute() 80 | assert type(a) == cupy.ndarray 81 | assert type(b) == cupy.ndarray 82 | assert 0.1 < a < 0.9 83 | assert 0.1 < b < 0.9 84 | assert a < b 85 | -------------------------------------------------------------------------------- /docs/source/_static/main-page.css: -------------------------------------------------------------------------------- 1 | /* GLOBAL STYLES 2 | -------------------------------------------------- */ 3 | /* 
Padding below the footer and lighter body text */ 4 | 5 | body { 6 | padding-bottom: 3rem; 7 | color: #5a5a5a; 8 | } 9 | 10 | /* navbar 11 | * ----------------------------------------*/ 12 | 13 | .navbar { 14 | background-color: #000000; 15 | } 16 | .navbar li { 17 | transition: .3s background-color; 18 | text-align: center; 19 | background-color: transparent; 20 | padding: 0rem 1rem; 21 | text-decoration: none; 22 | border-radius: 0.3rem; 23 | } 24 | .navbar li:hover { 25 | background-color: #FDA061; 26 | } 27 | .navbar li .nav-link{ 28 | color: #FDA061; 29 | } 30 | .navbar li:hover .nav-link{ 31 | color: #212529; 32 | } 33 | 34 | .dropdown-menu { 35 | background-color: #000000d0; 36 | } 37 | 38 | .dropdown-item { 39 | color: #FDA061; 40 | } 41 | 42 | .dropdown-item:hover { 43 | background-color: #FDA061D0; 44 | } 45 | 46 | .hero { 47 | background-color: rgba(0,0,0,0.92); 48 | text-color: white; 49 | } 50 | 51 | 52 | .top-image { 53 | height: 10rem; 54 | max-width: 20rem; 55 | } 56 | 57 | 58 | .outline-dask { 59 | color: #FDA061; 60 | background-color: transparent; 61 | border-color: #FDA061; 62 | } 63 | 64 | 65 | .outline-dask:hover { 66 | color: #212529; 67 | background-color: #FDA061; 68 | border-color: #FDA061; 69 | } 70 | 71 | .solid-dask { 72 | color: #212529; 73 | background-color: #FDA061; 74 | } 75 | 76 | .solid-dask:hover { 77 | color: #212529; 78 | background-color: #EC9050; 79 | } 80 | 81 | 82 | /* MARKETING CONTENT 83 | -------------------------------------------------- */ 84 | 85 | /* Center align the text within the three columns below the carousel */ 86 | .marketing .col-lg-4 { 87 | margin-bottom: 1.5rem; 88 | text-align: center; 89 | } 90 | .marketing .col-lg-4 p { 91 | margin-right: .75rem; 92 | margin-left: .75rem; 93 | } 94 | 95 | 96 | /* Featurettes 97 | ------------------------- */ 98 | 99 | .featurette-divider { 100 | margin: 3rem 0; /* Space out the Bootstrap
more */ 101 | } 102 | 103 | /* Thin out the marketing headings */ 104 | .featurette-heading { 105 | font-weight: 300; 106 | line-height: 1; 107 | letter-spacing: -.05rem; 108 | } 109 | 110 | .featurette-subheading { 111 | text-transform: uppercase; 112 | font-size: 1.2rem; 113 | display: block; 114 | font-weight: 600; 115 | margin: 1.2rem 0; 116 | } 117 | 118 | /* Supporters 119 | * ----------------------------*/ 120 | 121 | .supporters { 122 | text-align: center; 123 | } 124 | 125 | .supporter { 126 | margin: 0.5rem 0; 127 | width: 100%; 128 | } 129 | 130 | .supporter img{ 131 | max-height: 100%; 132 | max-width: 85%; 133 | position: relative; 134 | top: 50%; 135 | transform: translateY(-50%); 136 | 137 | } 138 | 139 | 140 | /* RESPONSIVE CSS 141 | -------------------------------------------------- */ 142 | 143 | @media (min-width: 40em) { 144 | .featurette-heading { 145 | font-size: 50px; 146 | } 147 | } 148 | 149 | @media (min-width: 62em) { 150 | .featurette-heading { 151 | margin-top: 3rem; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /docs/source/array-gufunc.rst: -------------------------------------------------------------------------------- 1 | Generalized Ufuncs 2 | ================== 3 | 4 | `NumPy `_ provides the concept of `generalized ufuncs `_. Generalized ufuncs are functions 5 | that distinguish the various dimensions of passed arrays in the two classes loop dimensions 6 | and core dimensions. To accomplish this, a `signature `_ is specified for NumPy generalized ufuncs. 7 | 8 | `Dask `_ integrates interoperability with NumPy's generalized ufuncs 9 | by adhering to respective `ufunc protocol `_, and provides a wrapper to make a Python function a generalized ufunc. 10 | 11 | 12 | Usage 13 | ----- 14 | 15 | NumPy Generalized UFuncs 16 | ~~~~~~~~~~~~~~~~~~~~~~~~ 17 | .. note:: 18 | 19 | `NumPy `_ generalized ufuncs are currently (v1.14.3 and below) stored in 20 | inside ``np.linalg._umath_linalg`` and might change in the future. 21 | 22 | 23 | .. code-block:: python 24 | 25 | import dask.array as da 26 | import numpy as np 27 | 28 | x = da.random.default_rng().normal(size=(3, 10, 10), chunks=(2, 10, 10)) 29 | 30 | w, v = np.linalg._umath_linalg.eig(x, output_dtypes=(float, float)) 31 | 32 | 33 | Create Generalized UFuncs 34 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | 36 | It can be difficult to create your own GUFuncs without going into the CPython API. 37 | However, the `Numba `_ project does provide a 38 | nice implementation with their ``numba.guvectorize`` decorator. See `Numba's 39 | documentation 40 | `_ 41 | for more information. 42 | 43 | Wrap your own Python function 44 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 45 | ``gufunc`` can be used to make a Python function behave like a generalized ufunc: 46 | 47 | 48 | .. code-block:: python 49 | 50 | x = da.random.default_rng().normal(size=(10, 5), chunks=(2, 5)) 51 | 52 | def foo(x): 53 | return np.mean(x, axis=-1) 54 | 55 | gufoo = da.gufunc(foo, signature="(i)->()", output_dtypes=float, vectorize=True) 56 | 57 | y = gufoo(x) 58 | 59 | 60 | Instead of ``gufunc``, also the ``as_gufunc`` decorator can be used for convenience: 61 | 62 | 63 | .. 
code-block:: python 64 | 65 | x = da.random.normal(size=(10, 5), chunks=(2, 5)) 66 | 67 | @da.as_gufunc(signature="(i)->()", output_dtypes=float, vectorize=True) 68 | def gufoo(x): 69 | return np.mean(x, axis=-1) 70 | 71 | y = gufoo(x) 72 | 73 | 74 | Disclaimer 75 | ---------- 76 | This experimental generalized ufunc integration is not complete: 77 | 78 | * ``gufunc`` does not create a true generalized ufunc to be used with other input arrays besides Dask. 79 | I.e., at the moment, ``gufunc`` casts all input arguments to ``dask.array.Array`` 80 | 81 | * Inferring ``output_dtypes`` automatically is not implemented yet 82 | 83 | 84 | API 85 | --- 86 | 87 | .. currentmodule:: dask.array.gufunc 88 | 89 | .. autosummary:: 90 | apply_gufunc 91 | as_gufunc 92 | gufunc 93 | -------------------------------------------------------------------------------- /dask/dataframe/tests/test_hyperloglog.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | import dask.dataframe as dd 8 | 9 | rs = np.random.RandomState(96) 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "df", 14 | [ 15 | pd.DataFrame( 16 | { 17 | "x": [1, 2, 3] * 3, 18 | "y": [1.2, 3.4, 5.6] * 3, 19 | "z": -(np.arange(9, dtype=np.int8)), 20 | } 21 | ), 22 | pd.DataFrame( 23 | { 24 | "x": rs.randint(0, 1000000, (10000,)), 25 | "y": rs.randn(10000), 26 | "z": rs.uniform(0, 9999999, (10000,)), 27 | } 28 | ), 29 | pd.DataFrame( 30 | { 31 | "x": np.repeat(rs.randint(0, 1000000, (1000,)), 3), 32 | "y": np.repeat(rs.randn(1000), 3), 33 | "z": np.repeat(rs.uniform(0, 9999999, (1000,)), 3), 34 | } 35 | ), 36 | pd.DataFrame({"x": rs.randint(0, 1000000, (10000,))}), 37 | pd.DataFrame( 38 | { 39 | "x": rs.randint(0, 1000000, (7,)), 40 | "y": ["a", "bet", "is", "a", "tax", "on", "bs"], 41 | } 42 | ), 43 | pd.DataFrame( 44 | { 45 | "w": np.zeros((20000,)), 46 | "x": np.zeros((20000,)), 47 | "y": np.zeros((20000,)) + 4803592, 48 | "z": np.zeros((20000,)), 49 | } 50 | ), 51 | pd.DataFrame({"x": [1, 2, 3] * 1000}), 52 | pd.DataFrame({"x": np.random.random(1000)}), 53 | pd.DataFrame( 54 | { 55 | "a": [1, 2, 3] * 3, 56 | "b": [1.2, 3.4, 5.6] * 3, 57 | "c": [1 + 2j, 3 + 4j, 5 + 6j] * 3, 58 | "d": -(np.arange(9, dtype=np.int8)), 59 | } 60 | ), 61 | pd.Series([1, 2, 3] * 1000), 62 | pd.Series(np.random.random(1000)), 63 | pd.Series(np.random.random(1000), index=np.ones(1000)), 64 | pd.Series(np.random.random(1000), index=np.random.random(1000)), 65 | ], 66 | ) 67 | @pytest.mark.parametrize("npartitions", [2, 20]) 68 | def test_basic(df, npartitions): 69 | ddf = dd.from_pandas(df, npartitions=npartitions) 70 | 71 | approx = ddf.nunique_approx().compute(scheduler="sync") 72 | exact = len(df.drop_duplicates()) 73 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05 74 | 75 | 76 | @pytest.mark.parametrize("split_every", [None, 2, 10]) 77 | @pytest.mark.parametrize("npartitions", [2, 20]) 78 | def test_split_every(split_every, npartitions): 79 | df = pd.Series([1, 2, 3] * 1000) 80 | ddf = dd.from_pandas(df, npartitions=npartitions) 81 | 82 | approx = ddf.nunique_approx(split_every=split_every).compute(scheduler="sync") 83 | exact = len(df.drop_duplicates()) 84 | assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05 85 | 86 | 87 | def test_larger_data(): 88 | df = dd.demo.make_timeseries( 89 | "2000-01-01", 90 | "2000-04-01", 91 | {"value": float, "id": int}, 92 | freq="10s", 93 | 
partition_freq="1D", 94 | seed=1, 95 | ) 96 | assert df.nunique_approx().compute() > 1000 97 | -------------------------------------------------------------------------------- /docs/source/deploying-cloud.rst: -------------------------------------------------------------------------------- 1 | Cloud 2 | ===== 3 | 4 | There are a variety of ways to deploy Dask on cloud providers. 5 | Cloud providers provide managed services, 6 | like VMs, Kubernetes, Yarn, or custom APIs with which Dask can connect easily. 7 | You may want to consider the following options: 8 | 9 | 1. A managed Kubernetes service and Dask's 10 | :doc:`Kubernetes integration `. 11 | 2. A managed Yarn service, 12 | like `Amazon EMR `_ 13 | or `Google Cloud DataProc `_ 14 | and `Dask-Yarn `_. 15 | 16 | Specific documentation for the popular Amazon EMR service can be found 17 | `here `_. 18 | 3. Directly launching cloud resources such as VMs or containers via a cluster manager with 19 | `Dask Cloud Provider `_. 20 | 4. A commercial Dask deployment option like `Coiled `_ to handle the creation and management of Dask clusters on a cloud computing environment (AWS and GCP). 21 | 22 | Cloud Deployment Example 23 | ------------------------ 24 | 25 | Using `Dask Cloud Provider `_ to launch a cluster of 26 | VMs on a platform like `DigitalOcean `_ can be as convenient as 27 | launching a local cluster. 28 | 29 | .. code-block:: python 30 | 31 | >>> import dask.config 32 | 33 | >>> dask.config.set({"cloudprovider.digitalocean.token": "yourAPItoken"}) 34 | 35 | >>> from dask_cloudprovider.digitalocean import DropletCluster 36 | 37 | >>> cluster = DropletCluster(n_workers=1) 38 | Creating scheduler instance 39 | Created droplet dask-38b817c1-scheduler 40 | Waiting for scheduler to run 41 | Scheduler is running 42 | Creating worker instance 43 | Created droplet dask-38b817c1-worker-dc95260d 44 | 45 | Many of the cluster managers in Dask Cloud Provider work by launching VMs with a startup script 46 | that pulls down the :doc:`Dask Docker image ` and runs Dask components within that container. 47 | As with all cluster managers the VM resources, Docker image, etc are all configurable. 48 | 49 | You can then connect a client and work with the cluster as if it were on your local machine. 50 | 51 | .. code-block:: python 52 | 53 | >>> from dask.distributed import Client 54 | 55 | >>> client = Client(cluster) 56 | 57 | Data Access 58 | ----------- 59 | 60 | You may want to install additional libraries in your Jupyter and worker images 61 | to access the object stores of each cloud (see :doc:`how-to/connect-to-remote-data`): 62 | 63 | - `s3fs `_ for Amazon's S3 64 | - `gcsfs `_ for Google's GCS 65 | - `adlfs `_ for Microsoft's ADL 66 | 67 | Historical Libraries 68 | -------------------- 69 | 70 | Dask previously maintained libraries for deploying Dask on 71 | Amazon's EC2 and Google GKE. 72 | Due to sporadic interest, 73 | and churn both within the Dask library and EC2 itself, 74 | these were not well maintained. 75 | They have since been deprecated in favor of the 76 | :doc:`Kubernetes ` solutions. 77 | -------------------------------------------------------------------------------- /docs/source/deploying-python.rst: -------------------------------------------------------------------------------- 1 | Python API 2 | ========== 3 | 4 | You can create a ``dask.distributed`` scheduler by importing and creating a 5 | ``Client`` with no arguments. This overrides whatever default was previously 6 | set. 7 | 8 | .. 
code-block:: python 9 | 10 | from dask.distributed import Client 11 | client = Client() 12 | 13 | You can navigate to ``http://localhost:8787/status`` to see the diagnostic 14 | dashboard if you have Bokeh installed. 15 | 16 | Client 17 | ------ 18 | 19 | You can trivially set up a local cluster on your machine by instantiating a Dask 20 | Client with no arguments 21 | 22 | .. code-block:: python 23 | 24 | from dask.distributed import Client 25 | client = Client() 26 | 27 | This sets up a scheduler in your local process along with a number of workers and 28 | threads per worker related to the number of cores in your machine. 29 | 30 | If you want to run workers in your same process, you can pass the 31 | ``processes=False`` keyword argument. 32 | 33 | .. code-block:: python 34 | 35 | client = Client(processes=False) 36 | 37 | This is sometimes preferable if you want to avoid inter-worker communication 38 | and your computations release the GIL. This is common when primarily using 39 | NumPy or Dask Array. 40 | 41 | 42 | LocalCluster 43 | ------------ 44 | 45 | The ``Client()`` call described above is shorthand for creating a LocalCluster 46 | and then passing that to your client. 47 | 48 | .. code-block:: python 49 | 50 | from dask.distributed import Client, LocalCluster 51 | cluster = LocalCluster() 52 | client = Client(cluster) 53 | 54 | This is equivalent, but somewhat more explicit. 55 | 56 | You may want to look at the 57 | keyword arguments available on ``LocalCluster`` to understand the options available 58 | to you on handling the mixture of threads and processes, like specifying explicit 59 | ports, and so on. 60 | 61 | To create a local cluster with all workers running in dedicated subprocesses, 62 | ``dask.distributed`` also offers the experimental ``SubprocessCluster``. 63 | 64 | Cluster manager features 65 | ------------------------ 66 | 67 | Instantiating a cluster manager class like ``LocalCluster`` and then passing it to the 68 | ``Client`` is a common pattern. Cluster managers also provide useful utilities to help 69 | you understand what is going on. 70 | 71 | For example you can retrieve the Dashboard URL. 72 | 73 | .. code-block:: python 74 | 75 | >>> cluster.dashboard_link 76 | 'http://127.0.0.1:8787/status' 77 | 78 | You can retrieve logs from cluster components. 79 | 80 | .. code-block:: python 81 | 82 | >>> cluster.get_logs() 83 | {'Cluster': '', 84 | 'Scheduler': "distributed.scheduler - INFO - Clear task state\ndistributed.scheduler - INFO - S... 85 | 86 | If you are using a cluster manager that supports scaling you can modify the number of workers manually 87 | or automatically based on workload. 88 | 89 | .. code-block:: python 90 | 91 | >>> cluster.scale(10) # Sets the number of workers to 10 92 | 93 | >>> cluster.adapt(minimum=1, maximum=10) # Allows the cluster to auto scale to 10 when tasks are computed 94 | 95 | Reference 96 | --------- 97 | 98 | .. currentmodule:: distributed.deploy.local 99 | 100 | .. autoclass:: LocalCluster 101 | :members: 102 | -------------------------------------------------------------------------------- /docs/source/array-assignment.rst: -------------------------------------------------------------------------------- 1 | .. _array.assignment: 2 | 3 | Assignment 4 | ========== 5 | 6 | Dask Array supports most of the NumPy assignment indexing syntax. 
In 7 | particular, it supports combinations of the following: 8 | 9 | * Indexing by integers: ``x[1] = y`` 10 | * Indexing by slices: ``x[2::-1] = y`` 11 | * Indexing by a list of integers: ``x[[0, -1, 1]] = y`` 12 | * Indexing by a 1-d :class:`numpy` array of integers: ``x[np.arange(3)] = y`` 13 | * Indexing by a 1-d :class:`~dask.array.Array` of integers: ``x[da.arange(3)] = y``, ``x[da.from_array([0, -1, 1])] = y``, ``x[da.where(np.array([1, 2, 3]) < 3)[0]] = y`` 14 | * Indexing by a list of booleans: ``x[[False, True, True]] = y`` 15 | * Indexing by a 1-d :class:`numpy` array of booleans: ``x[np.arange(3) > 0] = y`` 16 | 17 | It also supports: 18 | 19 | * Indexing by one broadcastable :class:`~dask.array.Array` of 20 | booleans: ``x[x > 0] = y``. 21 | 22 | However, it does not currently support the following: 23 | 24 | * Indexing with lists in multiple axes: ``x[[1, 2, 3], [3, 1, 2]] = y`` 25 | 26 | 27 | .. _array.assignment.broadcasting: 28 | 29 | Broadcasting 30 | ------------ 31 | 32 | The normal NumPy broadcasting rules apply: 33 | 34 | .. code-block:: python 35 | 36 | >>> x = da.zeros((2, 6)) 37 | >>> x[0] = 1 38 | >>> x[..., 1] = 2.0 39 | >>> x[:, 2] = [3, 4] 40 | >>> x[:, 5:2:-2] = [[6, 5]] 41 | >>> x.compute() 42 | array([[1., 2., 3., 5., 1., 6.], 43 | [0., 2., 4., 5., 0., 6.]]) 44 | >>> x[1] = -x[0] 45 | >>> x.compute() 46 | array([[ 1., 2., 3., 5., 1., 6.], 47 | [-1., -2., -3., -5., -1., -6.]]) 48 | 49 | .. _array.assignment.masking: 50 | 51 | Masking 52 | ------- 53 | 54 | Elements may be masked by assigning to the NumPy masked value, or to an 55 | array with masked values: 56 | 57 | .. code-block:: python 58 | 59 | >>> x = da.ones((2, 6)) 60 | >>> x[0, [1, -2]] = np.ma.masked 61 | >>> x[1] = np.ma.array([0, 1, 2, 3, 4, 5], mask=[0, 1, 1, 0, 0, 0]) 62 | >>> print(x.compute()) 63 | [[1.0 -- 1.0 1.0 -- 1.0] 64 | [0.0 -- -- 3.0 4.0 5.0]] 65 | >>> x[:, 0] = x[:, 1] 66 | >>> print(x.compute()) 67 | [[-- -- 1.0 1.0 -- 1.0] 68 | [-- -- -- 3.0 4.0 5.0]] 69 | 70 | If, and only if, a single broadcastable :class:`~dask.array.Array` of 71 | booleans is provided then masked array assignment does not yet work as 72 | expected. In this case the data underlying the mask are assigned: 73 | 74 | .. code-block:: python 75 | 76 | >>> x = da.arange(12).reshape(2, 6) 77 | >>> x[x > 7] = np.ma.array(-99, mask=True) 78 | >>> print(x.compute()) 79 | [[ 0 1 2 3 4 5] 80 | [ 6 7 -99 -99 -99 -99]] 81 | 82 | Note that masked assignments do work when a boolean 83 | :class:`~dask.array.Array` index is used in a tuple, or implicit tuple, 84 | of indices: 85 | 86 | .. code-block:: python 87 | 88 | >>> x = da.arange(12).reshape(2, 6) 89 | >>> x[1, x[0] > 3] = np.ma.masked 90 | >>> print(x.compute()) 91 | [[0 1 2 3 4 5] 92 | [6 7 8 9 -- --]] 93 | >>> x = da.arange(12).reshape(2, 6) 94 | >>> print(x.compute()) 95 | [[ 0 1 2 3 4 5] 96 | [ 6 7 8 9 10 11]] 97 | >>> x[(x[:, 2] < 4,)] = np.ma.masked 98 | >>> print(x.compute()) 99 | [[-- -- -- -- -- --] 100 | [6 7 8 9 10 11]] 101 | --------------------------------------------------------------------------------