├── .flake8 ├── .flake8.cython ├── .github ├── CODEOWNERS ├── copy-pr-bot.yaml ├── ops-bot.yaml └── workflows │ ├── build.yaml │ ├── pr.yaml │ ├── test.yaml │ └── trigger-breaking-change-alert.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── VERSION ├── ci ├── build_python.sh ├── build_wheel.sh ├── check_style.sh ├── release │ └── update-version.sh ├── run_benchmark_pytests.sh ├── run_pytests.sh ├── test_python.sh ├── test_wheel.sh └── validate_wheel.sh ├── conda ├── environments │ └── builddocs.yml └── recipes │ └── ucx-py │ ├── conda_build_config.yaml │ └── recipe.yaml ├── debug-tests ├── README.md ├── client.py ├── debug_utils.py ├── multi-node-workers.sh ├── scheduler.sh ├── server.py ├── test_endpoint_error_callback.py ├── test_send_recv_many_workers.py └── utils.py ├── dependencies.yaml ├── docker ├── Dockerfile ├── README.md ├── UCXPy-MOFED.dockerfile ├── UCXPy-rdma-core.dockerfile ├── bench-all.sh ├── build-ucx-py.sh ├── build-ucx.sh ├── run.sh └── ucx-py-cuda11.5.yml ├── docs ├── Makefile └── source │ ├── _static │ └── Architecture.png │ ├── api.rst │ ├── conf.py │ ├── configuration.rst │ ├── deployment.rst │ ├── glossary.rst │ ├── index.rst │ ├── install.rst │ ├── os-limits.rst │ ├── quickstart.rst │ ├── send-recv.rst │ ├── transport-monitoring.rst │ └── ucx-debug.rst ├── examples ├── cudf-example.py └── cupy-example.py ├── pyproject.toml ├── setup.py ├── tests ├── conftest.py ├── test_benchmark_cluster.py ├── test_config.py ├── test_custom_send_recv.py ├── test_disconnect.py ├── test_endpoint.py ├── test_from_worker_address.py ├── test_from_worker_address_error.py ├── test_info.py ├── test_multiple_nodes.py ├── test_probe.py ├── test_reset.py ├── test_rma.py ├── test_send_recv.py ├── test_send_recv_am.py ├── test_send_recv_two_workers.py ├── test_shutdown.py ├── test_tags.py ├── test_ucx_getters.py ├── test_version.py └── utils.py └── ucp ├── VERSION ├── __init__.py ├── _libs ├── 
__init__.pxd ├── __init__.py ├── arr.pxd ├── arr.pyi ├── arr.pyx ├── exceptions.py ├── packed_remote_key.pyx ├── src │ ├── c_util.c │ └── c_util.h ├── tests │ ├── test_address_object.py │ ├── test_arr.py │ ├── test_cancel.py │ ├── test_config.py │ ├── test_endpoint.py │ ├── test_listener.py │ ├── test_mem.py │ ├── test_peer_send_recv.py │ ├── test_probe.py │ ├── test_rma.py │ ├── test_server_client.py │ └── test_server_client_am.py ├── transfer_am.pyx ├── transfer_common.pyx ├── transfer_stream.pyx ├── transfer_tag.pyx ├── typedefs.pyx ├── ucx_address.pyx ├── ucx_api.pyi ├── ucx_api.pyx ├── ucx_api_dep.pxd ├── ucx_context.pyx ├── ucx_endpoint.pyx ├── ucx_listener.pyx ├── ucx_memory_handle.pyx ├── ucx_object.pyx ├── ucx_request.pyx ├── ucx_rkey.pyx ├── ucx_rma.pyx ├── ucx_worker.pyx ├── ucx_worker_cb.pyx ├── ucxio.pyx ├── utils.py ├── utils.pyx └── utils_test.py ├── _version.py ├── benchmarks ├── README.md ├── __init__.py ├── asyncssh.py ├── backends │ ├── __init__.py │ ├── base.py │ ├── tornado.py │ ├── ucp_async.py │ └── ucp_core.py ├── cudf_merge.py ├── send_recv.py └── utils.py ├── comm.py ├── continuous_ucx_progress.py ├── core.py ├── exceptions.py └── utils.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203,E211,E225,E226,E227,E901,E999,W503,W504 3 | # E203: whitespace before ':' (black format differs for slices) 4 | # E211: whitespace before '(' (used in multi-line imports) 5 | # E225: Missing whitespace around operators (breaks cython casting syntax like ) 6 | # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) 7 | # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) 8 | # E999: invalid syntax (works for Python, not Cython) 9 | # W503: line break before binary operator (breaks lines that start with a pointer) 10 | # W504: line break after binary operator (breaks lines that end with a pointer) 11 | 12 | 
exclude = 13 | .eggs, 14 | *.egg, 15 | build, 16 | __init__.py, 17 | 18 | max-line-length = 88 19 | 20 | # Ignore black/flake8-pyi conflicts 21 | per-file-ignores = 22 | *.pyi:E301 E302 E704 23 | -------------------------------------------------------------------------------- /.flake8.cython: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | [flake8] 18 | filename = *.pyx, *.pxd 19 | exclude = *.egg, build, docs, .git 20 | ignore = E999, E225, E226, E227, W503, W504, E211 21 | 22 | max-line-length = 88 23 | 24 | # Rules ignored: 25 | # E999: invalid syntax (works for Python, not Cython) 26 | # E211: whitespace before '(' (used in multi-line imports) 27 | # E225: Missing whitespace around operators (breaks cython casting syntax like ) 28 | # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) 29 | # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) 30 | # W503: line break before binary operator (breaks lines that start with a pointer) 31 | # W504: line break after binary operator (breaks lines that end with a pointer) 32 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | #python code owners 2 | ucp/ @rapidsai/ucxpy-python-codeowners 3 | tests/ @rapidsai/ucxpy-python-codeowners 4 | examples/ @rapidsai/ucxpy-python-codeowners 5 | benchmarks/ @rapidsai/ucxpy-python-codeowners 6 | 7 | #CI code owners 8 | /.github/ @rapidsai/ci-codeowners 9 | /ci/ @rapidsai/ci-codeowners 10 | /.pre-commit-config.yaml @rapidsai/ci-codeowners 11 | 12 | #packaging code owners 13 | /.devcontainer/ @rapidsai/packaging-codeowners 14 | /conda/ @rapidsai/packaging-codeowners 15 | /dependencies.yaml @rapidsai/packaging-codeowners 16 | /build.sh @rapidsai/packaging-codeowners 17 | pyproject.toml @rapidsai/packaging-codeowners 18 | -------------------------------------------------------------------------------- /.github/copy-pr-bot.yaml: -------------------------------------------------------------------------------- 1 | # Configuration file for `copy-pr-bot` GitHub App 2 | # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ 3 | 4 | enabled: true 5 | auto_sync_draft: false 6 | 
-------------------------------------------------------------------------------- /.github/ops-bot.yaml: -------------------------------------------------------------------------------- 1 | # This file controls which features from the `ops-bot` repository below are enabled. 2 | # - https://github.com/rapidsai/ops-bot 3 | 4 | auto_merger: true 5 | branch_checker: false 6 | label_checker: true 7 | release_drafter: false 8 | recently_updated: true 9 | forward_merger: true 10 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - "branch-*" 7 | tags: 8 | - v[0-9].[0-9][0-9].[0-9][0-9] 9 | workflow_dispatch: 10 | inputs: 11 | branch: 12 | required: true 13 | type: string 14 | date: 15 | required: true 16 | type: string 17 | sha: 18 | required: true 19 | type: string 20 | build_type: 21 | type: string 22 | default: nightly 23 | 24 | concurrency: 25 | group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} 26 | cancel-in-progress: true 27 | 28 | jobs: 29 | conda-python-build: 30 | secrets: inherit 31 | uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 32 | with: 33 | build_type: ${{ inputs.build_type || 'branch' }} 34 | branch: ${{ inputs.branch }} 35 | date: ${{ inputs.date }} 36 | script: ci/build_python.sh 37 | sha: ${{ inputs.sha }} 38 | upload-conda: 39 | needs: [conda-python-build] 40 | secrets: inherit 41 | uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.08 42 | with: 43 | build_type: ${{ inputs.build_type || 'branch' }} 44 | branch: ${{ inputs.branch }} 45 | date: ${{ inputs.date }} 46 | sha: ${{ inputs.sha }} 47 | wheel-build: 48 | secrets: inherit 49 | uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 50 | with: 51 | build_type: ${{ 
inputs.build_type || 'branch' }} 52 | branch: ${{ inputs.branch }} 53 | sha: ${{ inputs.sha }} 54 | date: ${{ inputs.date }} 55 | script: ci/build_wheel.sh 56 | package-name: ucx_py 57 | package-type: python 58 | wheel-publish: 59 | needs: wheel-build 60 | secrets: inherit 61 | uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.08 62 | with: 63 | build_type: ${{ inputs.build_type || 'branch' }} 64 | branch: ${{ inputs.branch }} 65 | sha: ${{ inputs.sha }} 66 | date: ${{ inputs.date }} 67 | package-name: ucx_py 68 | package-type: python 69 | -------------------------------------------------------------------------------- /.github/workflows/pr.yaml: -------------------------------------------------------------------------------- 1 | name: pr 2 | 3 | on: 4 | push: 5 | branches: 6 | - "pull-request/[0-9]+" 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | pr-builder: 14 | needs: 15 | - checks 16 | - conda-python-build 17 | - conda-python-tests 18 | - wheel-build 19 | - wheel-tests 20 | - telemetry-setup 21 | secrets: inherit 22 | uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.08 23 | with: 24 | needs: ${{ toJSON(needs) }} 25 | telemetry-setup: 26 | runs-on: ubuntu-latest 27 | continue-on-error: true 28 | env: 29 | OTEL_SERVICE_NAME: "pr-ucx-py" 30 | steps: 31 | - name: Telemetry setup 32 | # This gate is here and not at the job level because we need the job to not be skipped, 33 | # since other jobs depend on it. 
34 | if: ${{ vars.TELEMETRY_ENABLED == 'true' }} 35 | uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main 36 | checks: 37 | secrets: inherit 38 | needs: telemetry-setup 39 | uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.08 40 | with: 41 | ignored_pr_jobs: telemetry-summarize 42 | conda-python-build: 43 | needs: checks 44 | secrets: inherit 45 | uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.08 46 | with: 47 | build_type: pull-request 48 | script: ci/build_python.sh 49 | conda-python-tests: 50 | needs: conda-python-build 51 | secrets: inherit 52 | uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 53 | with: 54 | build_type: pull-request 55 | container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" 56 | script: ci/test_python.sh 57 | wheel-build: 58 | needs: checks 59 | secrets: inherit 60 | uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.08 61 | with: 62 | build_type: pull-request 63 | script: ci/build_wheel.sh 64 | package-name: ucx_py 65 | package-type: python 66 | wheel-tests: 67 | needs: wheel-build 68 | secrets: inherit 69 | uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 70 | with: 71 | build_type: pull-request 72 | script: ci/test_wheel.sh 73 | container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" 74 | telemetry-summarize: 75 | # This job must use a self-hosted runner to record telemetry traces. 
76 | runs-on: linux-amd64-cpu4 77 | needs: pr-builder 78 | if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }} 79 | continue-on-error: true 80 | steps: 81 | - name: Telemetry summarize 82 | uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main 83 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | branch: 7 | required: true 8 | type: string 9 | date: 10 | required: true 11 | type: string 12 | sha: 13 | required: true 14 | type: string 15 | build_type: 16 | type: string 17 | default: nightly 18 | 19 | jobs: 20 | conda-python-tests: 21 | secrets: inherit 22 | uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.08 23 | with: 24 | build_type: ${{ inputs.build_type }} 25 | branch: ${{ inputs.branch }} 26 | date: ${{ inputs.date }} 27 | script: ci/test_python.sh 28 | sha: ${{ inputs.sha }} 29 | container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" 30 | wheel-tests: 31 | secrets: inherit 32 | uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.08 33 | with: 34 | build_type: ${{ inputs.build_type }} 35 | branch: ${{ inputs.branch }} 36 | date: ${{ inputs.date }} 37 | sha: ${{ inputs.sha }} 38 | script: ci/test_wheel.sh 39 | container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" 40 | -------------------------------------------------------------------------------- /.github/workflows/trigger-breaking-change-alert.yaml: -------------------------------------------------------------------------------- 1 | name: Trigger Breaking Change Notifications 2 | 3 | on: 4 | pull_request_target: 5 | types: 6 | - closed 7 | - reopened 8 | - labeled 9 | - unlabeled 10 | 11 | jobs: 12 | trigger-notifier: 13 | if: 
contains(github.event.pull_request.labels.*.name, 'breaking') 14 | secrets: inherit 15 | uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.08 16 | with: 17 | sender_login: ${{ github.event.sender.login }} 18 | sender_avatar: ${{ github.event.sender.avatar_url }} 19 | repo: ${{ github.repository }} 20 | pr_number: ${{ github.event.pull_request.number }} 21 | pr_title: "${{ github.event.pull_request.title }}" 22 | pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}" 23 | pr_base_ref: ${{ github.event.pull_request.base.ref }} 24 | pr_author: ${{ github.event.pull_request.user.login }} 25 | event_action: ${{ github.event.action }} 26 | pr_merged: ${{ github.event.pull_request.merged }} 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.*~ 2 | build 3 | *.so 4 | ucp/_libs/*.a 5 | ucp/_libs/*.o 6 | ucp/_libs/*.c 7 | _build 8 | 9 | dask-worker-space 10 | __pytestcache__ 11 | __pycache__ 12 | *.egg-info/ 13 | final_dist/ 14 | dist/ 15 | .vscode 16 | 17 | *.sw[po] 18 | *.whl 19 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | args: ["--settings-path=pyproject.toml"] 7 | exclude: __init__.py$ 8 | types: [text] 9 | types_or: [python, cython, pyi] 10 | - repo: https://github.com/ambv/black 11 | rev: 22.3.0 12 | hooks: 13 | - id: black 14 | - repo: https://github.com/PyCQA/flake8 15 | rev: 7.1.1 16 | hooks: 17 | - id: flake8 18 | args: ["--config=.flake8"] 19 | types: [file] 20 | types_or: [python, cython] 21 | additional_dependencies: ["flake8-force"] 22 | - repo: https://github.com/rapidsai/pre-commit-hooks 23 | rev: v0.4.0 24 | hooks: 25 | - id: 
verify-copyright 26 | - id: verify-alpha-spec 27 | args: 28 | - --fix 29 | - --rapids-version=25.08 30 | - repo: https://github.com/rapidsai/dependency-file-generator 31 | rev: v1.17.0 32 | hooks: 33 | - id: rapids-dependency-file-generator 34 | args: ["--clean"] 35 | default_language_version: 36 | python: python3 37 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "mambaforge-22.9" 7 | jobs: 8 | post_create_environment: 9 | # explicitly passing matrix-entry so that 'libucx' (with appropriate CUDA suffix) 10 | # is pulled in, and therefore tested in this no-CUDA environment 11 | - | 12 | pip install \ 13 | -C rapidsai.matrix-entry="cuda=12.x;cuda_suffixed=true" \ 14 | . 15 | 16 | conda: 17 | environment: conda/environments/builddocs.yml 18 | 19 | sphinx: 20 | configuration: docs/source/conf.py 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 3. Neither the name of the copyright holder nor the names of its 13 | contributors may be used to endorse or promote products derived from 14 | this software without specific prior written permission. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 22 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Python type stubs 2 | recursive-include ucp *.pyi 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![https://ucx-py.readthedocs.io/en/latest/](https://readthedocs.org/projects/ucx-py/badge/ "ReadTheDocs")]( https://ucx-py.readthedocs.io/en/latest/ ) 2 | 3 | # Python Bindings for UCX 4 | 5 | ## Installing 6 | 7 | Users can either [install with Conda]( https://ucx-py.readthedocs.io/en/latest/install.html#conda ) or [build from source]( https://ucx-py.readthedocs.io/en/latest/install.html#source ). 8 | 9 | ## Testing 10 | 11 | To run ucx-py's tests, just use ``pytest``: 12 | 13 | ```bash 14 | pytest -v 15 | ``` 16 | 17 | ### TCP Support 18 | 19 | In order to use TCP add `tcp` to `UCX_TLS` and set `UCXPY_IFNAME` to the network interface you want to use. 
Some setup examples: 20 | 21 | ```bash 22 | # TCP using "eth0" and CUDA support 23 | export UCX_TLS=tcp,cuda_copy,cuda_ipc 24 | export UCXPY_IFNAME="eth0" 25 | 26 | # InfiniBand using "ib0" and CUDA support 27 | export UCX_TLS=rc,cuda_copy,cuda_ipc 28 | export UCXPY_IFNAME="ib0" 29 | 30 | # TCP using "eno0" and no CUDA support 31 | export UCX_TLS=tcp 32 | export UCXPY_IFNAME="eno0" 33 | ``` 34 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.45.0 2 | -------------------------------------------------------------------------------- /ci/build_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. 3 | 4 | set -euo pipefail 5 | 6 | source rapids-date-string 7 | 8 | rapids-print-env 9 | 10 | rapids-generate-version > ./VERSION 11 | RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) 12 | export RAPIDS_PACKAGE_VERSION 13 | 14 | # populates `RATTLER_CHANNELS` array and `RATTLER_ARGS` array 15 | source rapids-rattler-channel-string 16 | 17 | rapids-logger "Building ucx-py" 18 | 19 | # Need `--experimental` flag to use `load_from_file` and `git.head_rev` 20 | rattler-build build --recipe conda/recipes/ucx-py \ 21 | "${RATTLER_ARGS[@]}" \ 22 | "${RATTLER_CHANNELS[@]}" 23 | 24 | # remove build_cache directory to avoid uploading the entire source tree 25 | # tracked in https://github.com/prefix-dev/rattler-build/issues/1424 26 | rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache 27 | -------------------------------------------------------------------------------- /ci/build_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. 
3 | 4 | set -euo pipefail 5 | 6 | source rapids-date-string 7 | source rapids-init-pip 8 | 9 | rapids-generate-version > ./VERSION 10 | 11 | rapids-pip-retry wheel \ 12 | -v \ 13 | -w dist \ 14 | --no-deps \ 15 | --disable-pip-version-check \ 16 | --config-settings rapidsai.disable-cuda=false \ 17 | . 18 | 19 | python -m auditwheel repair \ 20 | -w "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" \ 21 | --exclude "libucm.so.0" \ 22 | --exclude "libucp.so.0" \ 23 | --exclude "libucs.so.0" \ 24 | --exclude "libucs_signal.so.0" \ 25 | --exclude "libuct.so.0" \ 26 | dist/* 27 | 28 | ./ci/validate_wheel.sh "${RAPIDS_WHEEL_BLD_OUTPUT_DIR}" 29 | -------------------------------------------------------------------------------- /ci/check_style.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023, NVIDIA CORPORATION. 3 | 4 | set -euo pipefail 5 | 6 | rapids-logger "Create checks conda environment" 7 | . /opt/conda/etc/profile.d/conda.sh 8 | 9 | rapids-dependency-file-generator \ 10 | --output conda \ 11 | --file-key checks \ 12 | --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml 13 | 14 | rapids-mamba-retry env create --yes -f env.yaml -n checks 15 | conda activate checks 16 | 17 | # Run pre-commit checks 18 | pre-commit run --hook-stage manual --all-files --show-diff-on-failure 19 | -------------------------------------------------------------------------------- /ci/release/update-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ######################## 3 | # ucx-py Version Updater # 4 | ######################## 5 | 6 | ## Usage 7 | # bash update-version.sh 8 | 9 | 10 | # Format is Major.Minor.Patch - no leading 'v' or trailing 'a' 11 | # Example: 0.30.00 12 | NEXT_FULL_TAG=$1 13 | 14 | # Get current version 15 | CURRENT_TAG=$(git tag | grep -xE 'v[0-9\.]+' | sort --version-sort | tail -n 1 | tr -d 'v') 16 | 
CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}') 17 | CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') 18 | CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') 19 | CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} 20 | 21 | #Get . for next version 22 | NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') 23 | NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') 24 | NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} 25 | 26 | # Get RAPIDS version associated w/ ucx-py version 27 | NEXT_RAPIDS_SHORT_TAG="$(curl -sL https://version.gpuci.io/ucx-py/${NEXT_SHORT_TAG})" 28 | 29 | # Need to distutils-normalize the versions for some use cases 30 | NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))") 31 | NEXT_FULL_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_FULL_TAG}'))") 32 | NEXT_RAPIDS_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_RAPIDS_SHORT_TAG}'))") 33 | 34 | echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" 35 | 36 | # Inplace sed replace; workaround for Linux and Mac 37 | function sed_runner() { 38 | sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak 39 | } 40 | 41 | DEPENDENCIES=( 42 | cudf 43 | rapids-dask-dependency 44 | ) 45 | UCX_PY_DEPENDENCIES=( 46 | ucx-py 47 | ) 48 | for FILE in dependencies.yaml conda/environments/*.yml; do 49 | for DEP in "${DEPENDENCIES[@]}"; do 50 | sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_RAPIDS_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" 51 | done 52 | for DEP in "${UCX_PY_DEPENDENCIES[@]}"; do 53 | sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" 54 | done 55 | done 56 | 57 | for DEP in "${DEPENDENCIES[@]}"; do 58 | sed_runner "/\"${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ 
s/==.*\"/==${NEXT_RAPIDS_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" pyproject.toml 59 | done 60 | 61 | for FILE in .github/workflows/*.yaml; do 62 | sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_RAPIDS_SHORT_TAG}/g" "${FILE}" 63 | done 64 | 65 | echo "${NEXT_FULL_TAG_PEP440}" > VERSION 66 | 67 | sed_runner "s/--rapids-version=[[:digit:]]\{2\}.[[:digit:]]\{2\}/--rapids-version=${NEXT_RAPIDS_SHORT_TAG}/g" .pre-commit-config.yaml 68 | -------------------------------------------------------------------------------- /ci/run_benchmark_pytests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | 4 | set -euo pipefail 5 | 6 | # cd to root directory to prevent repo's `ucp` directory from being used 7 | # in subsequent commands 8 | pushd / 9 | timeout 1m python -m ucp.benchmarks.send_recv -o cupy --server-dev 0 --client-dev 0 --reuse-alloc --backend ucp-async 10 | timeout 1m python -m ucp.benchmarks.send_recv -o cupy --server-dev 0 --client-dev 0 --reuse-alloc --backend ucp-core 11 | timeout 1m python -m ucp.benchmarks.cudf_merge --chunks-per-dev 4 --chunk-size 10000 --rmm-init-pool-size 2097152 12 | popd 13 | -------------------------------------------------------------------------------- /ci/run_pytests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 3 | 4 | set -euo pipefail 5 | 6 | # Support invoking run_pytests.sh outside the script directory 7 | cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ 8 | 9 | timeout 10m pytest --cache-clear -vs "$@" tests 10 | timeout 2m pytest --cache-clear -vs "$@" ucp/_libs/tests 11 | -------------------------------------------------------------------------------- /ci/test_python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024-2025, NVIDIA CORPORATION. 
3 | 4 | set -euo pipefail 5 | 6 | # Support invoking test_python.sh outside the script directory 7 | cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ 8 | 9 | rapids-logger "Create test conda environment using artifacts from previous job" 10 | . /opt/conda/etc/profile.d/conda.sh 11 | 12 | UCX_PY_VERSION="$(head -1 ./VERSION)" 13 | PYTHON_CHANNEL=$(rapids-download-conda-from-github python) 14 | 15 | rapids-dependency-file-generator \ 16 | --output conda \ 17 | --file-key test_python \ 18 | --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ 19 | --prepend-channel "${PYTHON_CHANNEL}" \ 20 | | tee env.yaml 21 | 22 | rapids-mamba-retry env create -yq -f env.yaml -n test 23 | conda activate test 24 | 25 | rapids-print-env 26 | 27 | rapids-logger "Check GPU usage" 28 | nvidia-smi 29 | 30 | rapids-logger "Check NICs" 31 | awk 'END{print $1}' /etc/hosts 32 | cat /etc/hosts 33 | 34 | run_tests() { 35 | rapids-logger "UCX Version and Build Configuration" 36 | ucx_info -v 37 | 38 | rapids-logger "Python pytest for ucx-py" 39 | 40 | # list test directory 41 | ls tests/ 42 | 43 | # Test with TCP/Sockets 44 | rapids-logger "TEST WITH TCP ONLY" 45 | ./ci/run_pytests.sh 46 | 47 | rapids-logger "Run local benchmark" 48 | # cd to root directory to prevent repo's `ucp` directory from being used 49 | # in subsequent commands 50 | ./ci/run_benchmark_pytests.sh 51 | } 52 | 53 | rapids-logger "Run tests with conda package" 54 | run_tests 55 | 56 | 57 | # The following block is untested in GH Actions 58 | TEST_UCX_MASTER=0 59 | if [[ "${TEST_UCX_MASTER}" == 1 ]]; then 60 | rapids-logger "Build UCX master" 61 | git clone https://github.com/openucx/ucx ucx-master 62 | pushd ucx-master 63 | ./autogen.sh 64 | mkdir build 65 | pushd build 66 | ../contrib/configure-release --prefix="${CONDA_PREFIX}" --with-cuda="${CUDA_HOME}" --enable-mt 67 | make -j install 68 | 69 | rapids-logger "Build UCX-Py" 70 | popd; popd 71 | git clean -ffdx 72 | python setup.py 
build_ext --inplace 73 | rapids-pip-retry install -e . 74 | 75 | rapids-logger "Run tests with pip package against ucx master" 76 | run_tests 77 | fi 78 | -------------------------------------------------------------------------------- /ci/test_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2023-2025, NVIDIA CORPORATION. 3 | 4 | set -eoxu pipefail 5 | 6 | source rapids-init-pip 7 | 8 | RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" 9 | PYTHON_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="ucx_py_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github python) 10 | 11 | # echo to expand wildcard before adding `[extra]` requires for pip 12 | rapids-pip-retry install $(echo "${PYTHON_WHEELHOUSE}"/ucx_py*.whl)[test] 13 | 14 | cd tests 15 | timeout 10m python -m pytest --cache-clear -vs . 16 | cd ../ucp 17 | timeout 2m python -m pytest --cache-clear -vs ./_libs/tests/ 18 | -------------------------------------------------------------------------------- /ci/validate_wheel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2024, NVIDIA CORPORATION. 
3 | 4 | set -euo pipefail 5 | 6 | wheel_dir_relative_path=$1 7 | 8 | rapids-logger "validate packages with 'pydistcheck'" 9 | 10 | pydistcheck \ 11 | --inspect \ 12 | "$(echo ${wheel_dir_relative_path}/*.whl)" 13 | 14 | rapids-logger "validate packages with 'twine'" 15 | 16 | twine check \ 17 | --strict \ 18 | "$(echo ${wheel_dir_relative_path}/*.whl)" 19 | -------------------------------------------------------------------------------- /conda/environments/builddocs.yml: -------------------------------------------------------------------------------- 1 | name: ucx_dev 2 | channels: 3 | - rapidsai 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | # the ceiling on sphinx can be removed when https://github.com/spatialaudio/nbsphinx/issues/825 is resolved 8 | - sphinx>=8.0,<8.2.0 9 | - sphinx-markdown-tables 10 | - sphinx_rtd_theme 11 | - sphinxcontrib-websupport 12 | - nbsphinx 13 | - numpydoc 14 | - recommonmark 15 | - pandoc<=2.0.0 16 | - pip 17 | - cython 18 | -------------------------------------------------------------------------------- /conda/recipes/ucx-py/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | c_compiler_version: 2 | - 13 3 | 4 | cxx_compiler_version: 5 | - 13 6 | 7 | ucx: 8 | - "==1.15.*" 9 | -------------------------------------------------------------------------------- /conda/recipes/ucx-py/recipe.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. 2 | schema_version: 1 3 | 4 | context: 5 | version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} 6 | date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' 7 | py_version: ${{ env.get("RAPIDS_PY_VERSION") }} 8 | py_buildstring: ${{ py_version | version_to_buildstring }} 9 | head_rev: '${{ git.head_rev(".")[:8] }}' 10 | 11 | package: 12 | name: ucx-py 13 | version: ${{ version }} 14 | 15 | source: 16 | path: ../../..
17 | 18 | build: 19 | string: py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} 20 | dynamic_linking: 21 | overlinking_behavior: "error" 22 | script: 23 | content: | 24 | python -m pip install --config-settings rapidsai.disable-cuda=true . -vv 25 | requirements: 26 | build: 27 | - ${{ compiler("c") }} 28 | - ${{ compiler("cxx") }} 29 | host: 30 | - cython>=3.0.0 31 | - pip 32 | - python =${{ py_version }} 33 | - rapids-build-backend>=0.3.1,<0.4.0dev0 34 | - setuptools>=64.0.0 35 | - ucx 36 | run: 37 | - numpy>=1.23,<3.0a0 38 | - pynvml>=12.0.0,<13.0.0a0 39 | - python 40 | - ucx >=1.15.0,<1.19.0 41 | ignore_run_exports: 42 | from_package: 43 | - ${{ compiler("c") }} 44 | - ${{ compiler("cxx") }} 45 | by_name: 46 | - ucx 47 | 48 | 49 | tests: 50 | - python: 51 | imports: 52 | - ucp 53 | pip_check: false 54 | 55 | 56 | about: 57 | homepage: ${{ load_from_file("pyproject.toml").project.urls.Homepage }} 58 | license: ${{ load_from_file("pyproject.toml").project.license.text }} 59 | summary: ${{ load_from_file("pyproject.toml").project.description }} 60 | -------------------------------------------------------------------------------- /debug-tests/README.md: -------------------------------------------------------------------------------- 1 | ## Debug Tests 2 | 3 | Files in this directory are useful for debugging purposes and often require being executed in two separate sessions (tmux/ssh/etc). 4 | 5 | NOTE: This was moved outside of the tests directory to prevent users running potentially unstable tests by accident. 
6 | 7 | 8 | ## Send/Recv 9 | 10 | `send.py` and `recv.py` are used to debug/confirm nvlink message passing over 1000 iterations of either CuPy or cudf objects: 11 | 12 | ### Process 1 13 | 14 | > UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python tests/debug-testssend.py 15 | 16 | ### Process 2 17 | 18 | > UCXPY_LOG_LEVEL=DEBUG UCX_LOG_LEVEL=DEBUG UCXPY_IFNAME=enp1s0f0 CUDA_VISIBLE_DEVICES=0,1 UCX_MEMTYPE_CACHE=n UCX_TLS=tcp,cuda_copy,cuda_ipc /usr/local/cuda/bin/nvprof python tests/recv.py 19 | 20 | `nvprof` is used to verify NVLINK usage and we are looking at two things primarily: 21 | - existence of [CUDA memcpy PtoP] 22 | - balanced cudaMalloc/cudaFree 23 | 24 | ### Multi-worker Setup 25 | This setup is particularly useful for IB testing when `multi-node-workers.sh` 26 | is placed in a NFS mount and can be executed independently on each machine 27 | 28 | - bash scheduler.sh 29 | - bash multi-node-workers.sh 30 | -------------------------------------------------------------------------------- /debug-tests/client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import time 4 | 5 | import pynvml 6 | import pytest 7 | from debug_utils import ( 8 | ITERATIONS, 9 | parse_args, 10 | set_rmm, 11 | start_process, 12 | total_nvlink_transfer, 13 | ) 14 | from utils import recv, send 15 | 16 | import ucp 17 | from ucp.utils import get_event_loop 18 | 19 | pynvml.nvmlInit() 20 | 21 | 22 | cmd = "nvidia-smi nvlink --setcontrol 0bz" # Get output in bytes 23 | # subprocess.check_call(cmd, shell=True) 24 | 25 | pynvml = pytest.importorskip("pynvml", reason="PYNVML not installed") 26 | 27 | 28 | async def get_ep(name, port): 29 | addr = ucp.get_address() 30 | ep = await ucp.create_endpoint(addr, port) 31 | return ep 32 | 33 | 34 | def client(env, port, func, verbose): 35 | # wait for server to come up 36 | # receive cudf object 
37 | # deserialize 38 | # assert deserialized msg is cdf 39 | # send receipt 40 | 41 | os.environ.update(env) 42 | before_rx, before_tx = total_nvlink_transfer() 43 | 44 | async def read(): 45 | await asyncio.sleep(1) 46 | ep = await get_ep("client", port) 47 | 48 | for i in range(ITERATIONS): 49 | bytes_used = pynvml.nvmlDeviceGetMemoryInfo( 50 | pynvml.nvmlDeviceGetHandleByIndex(0) 51 | ).used 52 | bytes_used 53 | # print("Bytes Used:", bytes_used, i) 54 | 55 | frames, msg = await recv(ep) 56 | 57 | # Send meta data 58 | await send(ep, frames) 59 | 60 | print("Shutting Down Client...") 61 | await ep.close() 62 | 63 | set_rmm() 64 | for i in range(ITERATIONS): 65 | print("ITER: ", i) 66 | t = time.time() 67 | get_event_loop().run_until_complete(read()) 68 | if verbose: 69 | print("Time take for interation %d: %ss" % (i, time.time() - t)) 70 | 71 | print("FINISHED") 72 | # num_bytes = nbytes(rx_cuda_obj) 73 | # print(f"TOTAL DATA RECEIVED: {num_bytes}") 74 | # nvlink only measures in KBs 75 | # if num_bytes > 90000: 76 | # rx, tx = total_nvlink_transfer() 77 | # msg = f"RX BEFORE SEND: {before_rx} -- RX AFTER SEND: {rx} \ 78 | # -- TOTAL DATA: {num_bytes}" 79 | # print(msg) 80 | # assert rx > before_rx 81 | 82 | # import cloudpickle 83 | # cuda_obj_generator = cloudpickle.loads(func) 84 | # pure_cuda_obj = cuda_obj_generator() 85 | 86 | # from cudf.testing import assert_eq 87 | # import cupy as cp 88 | 89 | # if isinstance(rx_cuda_obj, cp.ndarray): 90 | # cp.testing.assert_allclose(rx_cuda_obj, pure_cuda_obj) 91 | # else: 92 | # assert_eq(rx_cuda_obj, pure_cuda_obj) 93 | 94 | 95 | def main(): 96 | args = parse_args(server_address=True) 97 | 98 | start_process(args, client) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /debug-tests/debug_utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | 
import cloudpickle 5 | import cupy 6 | from utils import get_num_gpus 7 | 8 | from dask.utils import parse_bytes 9 | 10 | import rmm 11 | from rmm.allocators.cupy import rmm_cupy_allocator 12 | 13 | ITERATIONS = 100 14 | 15 | 16 | def set_rmm(): 17 | rmm.reinitialize( 18 | pool_allocator=True, managed_memory=False, initial_pool_size=parse_bytes("6GB") 19 | ) 20 | cupy.cuda.set_allocator(rmm_cupy_allocator) 21 | 22 | 23 | def parse_args(server_address=False): 24 | parser = argparse.ArgumentParser(description="Tester client process") 25 | if server_address is True: 26 | parser.add_argument( 27 | "-s", 28 | "--server", 29 | default=None, 30 | help="Server address, ucp.get_address() if not specified", 31 | ) 32 | parser.add_argument("-p", "--port", default=13337, help="Server port", type=int) 33 | parser.add_argument( 34 | "-o", 35 | "--object_type", 36 | default="numpy", 37 | choices=["numpy", "cupy", "cudf"], 38 | help="In-memory array type.", 39 | ) 40 | parser.add_argument( 41 | "-c", 42 | "--cpu-affinity", 43 | metavar="N", 44 | default=-1, 45 | type=int, 46 | help="CPU affinity (default -1: unset).", 47 | ) 48 | parser.add_argument( 49 | "-v", 50 | "--verbose", 51 | default=False, 52 | action="store_true", 53 | help="Print timings per iteration.", 54 | ) 55 | 56 | return parser.parse_args() 57 | 58 | 59 | def get_cuda_devices(): 60 | if "CUDA_VISIBLE_DEVICES" in os.environ: 61 | return os.environ["CUDA_VISIBLE_DEVICES"].split(",") 62 | else: 63 | ngpus = get_num_gpus() 64 | return list(range(ngpus)) 65 | 66 | 67 | def total_nvlink_transfer(): 68 | import pynvml 69 | 70 | pynvml.nvmlShutdown() 71 | 72 | pynvml.nvmlInit() 73 | 74 | try: 75 | cuda_dev_id = int(os.environ["CUDA_VISIBLE_DEVICES"].split(",")[0]) 76 | except Exception as e: 77 | print(e) 78 | cuda_dev_id = 0 79 | nlinks = pynvml.NVML_NVLINK_MAX_LINKS 80 | handle = pynvml.nvmlDeviceGetHandleByIndex(cuda_dev_id) 81 | rx = 0 82 | tx = 0 83 | for i in range(nlinks): 84 | transfer = 
pynvml.nvmlDeviceGetNvLinkUtilizationCounter(handle, i, 0) 85 | rx += transfer["rx"] 86 | tx += transfer["tx"] 87 | return rx, tx 88 | 89 | 90 | def start_process(args, process_function): 91 | if args.cpu_affinity >= 0: 92 | os.sched_setaffinity(0, [args.cpu_affinity]) 93 | 94 | base_env = os.environ 95 | env = base_env.copy() 96 | 97 | port = 15339 98 | 99 | # serialize function and send to the client and server 100 | # server will use the return value of the contents, 101 | # serialize the values, then send serialized values to client. 102 | # client will compare return values of the deserialized 103 | # data sent from the server 104 | 105 | obj = get_object(args.object_type) 106 | obj_func = cloudpickle.dumps(obj) 107 | 108 | process_function(env, port, obj_func, args.verbose) 109 | 110 | 111 | def cudf_obj(): 112 | import numpy as np 113 | 114 | import cudf 115 | 116 | size = 2**26 117 | return cudf.DataFrame( 118 | {"a": np.random.random(size), "b": np.random.random(size), "c": ["a"] * size} 119 | ) 120 | 121 | 122 | def cudf_from_cupy_obj(): 123 | import cupy 124 | import numpy as np 125 | 126 | import cudf 127 | 128 | size = 9**5 129 | obj = cupy.arange(size) 130 | data = [obj for i in range(10)] 131 | data.extend([np.arange(10) for i in range(10)]) 132 | data.append(cudf.Series([1, 2, 3, 4])) 133 | data.append({"key": "value"}) 134 | data.append({"key": cudf.Series([0.45, 0.134])}) 135 | return data 136 | 137 | 138 | def cupy_obj(): 139 | import cupy as cp 140 | 141 | size = 10**9 142 | return cp.arange(size) 143 | 144 | 145 | def numpy_obj(): 146 | import numpy as np 147 | 148 | size = 2**20 149 | obj = np.arange(size) 150 | return obj 151 | 152 | 153 | def get_object(object_type): 154 | if object_type == "numpy": 155 | return numpy_obj 156 | elif object_type == "cupy": 157 | return cupy_obj 158 | elif object_type == "cudf": 159 | return cudf_obj 160 | else: 161 | raise TypeError("Object type %s unknown" % (object_type)) 162 | 
-------------------------------------------------------------------------------- /debug-tests/multi-node-workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | #export UCX_LOG_LEVEL=DEBUG 4 | #export UCXPY_LOG_LEVEL=DEBUG 5 | export UCX_MEMTYPE_CACHE=n 6 | export UCX_TLS=tcp,cuda_copy,rc 7 | 8 | UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python recv.py 2>&1 | tee /tmp/recv-log-0.txt & 9 | UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=1 python recv.py 2>&1 | tee /tmp/recv-log-1.txt & 10 | UCX_NET_DEVICES=mlx5_1:1 CUDA_VISIBLE_DEVICES=2 python recv.py 2>&1 | tee /tmp/recv-log-2.txt & 11 | UCX_NET_DEVICES=mlx5_1:1 CUDA_VISIBLE_DEVICES=3 python recv.py 2>&1 | tee /tmp/recv-log-3.txt & 12 | UCX_NET_DEVICES=mlx5_2:1 CUDA_VISIBLE_DEVICES=4 python recv.py 2>&1 | tee /tmp/recv-log-4.txt & 13 | UCX_NET_DEVICES=mlx5_2:1 CUDA_VISIBLE_DEVICES=5 python recv.py 2>&1 | tee /tmp/recv-log-5.txt & 14 | UCX_NET_DEVICES=mlx5_3:1 CUDA_VISIBLE_DEVICES=6 python recv.py 2>&1 | tee /tmp/recv-log-6.txt & 15 | UCX_NET_DEVICES=mlx5_3:1 CUDA_VISIBLE_DEVICES=7 python recv.py 2>&1 | tee /tmp/recv-log-7.txt & 16 | 17 | sleep 3600 18 | -------------------------------------------------------------------------------- /debug-tests/scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #export UCX_LOG_LEVEL=TRACE 5 | # export UCXPY_LOG_LEVEL=DEBUG 6 | export UCX_MEMTYPE_CACHE=n 7 | export UCX_TLS=tcp,cuda_copy,rc,cuda_ipc 8 | 9 | UCX_NET_DEVICES=mlx5_0:1 CUDA_VISIBLE_DEVICES=0 python send.py 2>&1 | tee /tmp/send-log.txt & 10 | -------------------------------------------------------------------------------- /debug-tests/server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | import cloudpickle 5 | import pytest 6 | from debug_utils import ITERATIONS, parse_args, set_rmm, 
start_process 7 | from utils import recv, send 8 | 9 | from distributed.comm.utils import to_frames 10 | from distributed.protocol import to_serialize 11 | 12 | import ucp 13 | from ucp.utils import get_event_loop 14 | 15 | cmd = "nvidia-smi nvlink --setcontrol 0bz" # Get output in bytes 16 | # subprocess.check_call(cmd, shell=True) 17 | 18 | pynvml = pytest.importorskip("pynvml", reason="PYNVML not installed") 19 | 20 | 21 | async def get_ep(name, port): 22 | addr = ucp.get_address() 23 | ep = await ucp.create_endpoint(addr, port) 24 | return ep 25 | 26 | 27 | def server(env, port, func, verbose): 28 | # create listener receiver 29 | # write cudf object 30 | # confirm message is sent correctly 31 | 32 | os.environ.update(env) 33 | 34 | async def f(listener_port): 35 | # coroutine shows up when the client asks 36 | # to connect 37 | set_rmm() 38 | 39 | async def write(ep): 40 | 41 | print("CREATING CUDA OBJECT IN SERVER...") 42 | cuda_obj_generator = cloudpickle.loads(func) 43 | cuda_obj = cuda_obj_generator() 44 | msg = {"data": to_serialize(cuda_obj)} 45 | frames = await to_frames(msg, serializers=("cuda", "dask", "pickle")) 46 | while True: 47 | for i in range(ITERATIONS): 48 | print("ITER: ", i) 49 | # Send meta data 50 | await send(ep, frames) 51 | 52 | frames, msg = await recv(ep) 53 | 54 | print("CONFIRM RECEIPT") 55 | await ep.close() 56 | break 57 | # lf.close() 58 | del msg 59 | del frames 60 | 61 | lf = ucp.create_listener(write, port=listener_port) 62 | try: 63 | while not lf.closed(): 64 | await asyncio.sleep(0.1) 65 | except ucp.UCXCloseError: 66 | pass 67 | 68 | loop = get_event_loop() 69 | while True: 70 | loop.run_until_complete(f(port)) 71 | 72 | 73 | def main(): 74 | args = parse_args(server_address=False) 75 | 76 | start_process(args, server) 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /debug-tests/utils.py: 
-------------------------------------------------------------------------------- 1 | ../tests/utils.py -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | RUN apt-get update && \ 4 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata && \ 5 | apt-get install -y \ 6 | automake \ 7 | dh-make \ 8 | g++ \ 9 | git \ 10 | libcap2 \ 11 | libtool \ 12 | make \ 13 | udev \ 14 | wget \ 15 | && apt-get remove -y openjdk-11-* || apt-get autoremove -y \ 16 | && apt-get clean && rm -rf /var/lib/apt/lists/* 17 | 18 | COPY run.sh /root 19 | 20 | WORKDIR /root 21 | 22 | CMD [ "/root/run.sh" ] 23 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | # Docker container 2 | 3 | ## Summary 4 | 5 | Contains reference dockerfile and build script to run UCX-Py tests and benchmarks. This is a minimal setup, without support for CUDA, MOFED, or rdma-core. 6 | 7 | ## Building Docker image 8 | 9 | To begin, it's necessary to build the image, this is done as follows: 10 | 11 | ```bash 12 | cd docker 13 | docker build -t ucx-py -f Dockerfile . 14 | ``` 15 | 16 | ## Running 17 | 18 | Once building the Docker image is complete, the container can be started with the following command: 19 | 20 | ```bash 21 | docker run ucx-py 22 | ``` 23 | 24 | The container above will run UCX-Py tests and benchmarks. 25 | 26 | ## Infiniband/NVLink-enabled docker file 27 | 28 | In addition to the reference Docker image, there are two further docker 29 | files which have support for CUDA devices and 30 | InfiniBand/NVLink-enabled communications using either 31 | [rdma-core](https://github.com/linux-rdma/rdma-core) or 32 | [MOFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/). 
33 | In both cases, the default base image is 34 | [nvidia/cuda:11.5.2-devel-ubuntu20.04](https://hub.docker.com/r/nvidia/cuda/tags?page=1&name=11.5.2-devel-ubuntu20.04). 35 | 36 | The rdma-core image should work as long as the host system has MOFED >= 5.0. 37 | If you use the MOFED image, then the host version (reported by `ofed_info 38 | -s`) should match that used when building the container. 39 | 40 | To use one of these images, first build it 41 | ```bash 42 | docker build -t ucx-py-mofed -f UCXPy-MOFED.dockerfile . 43 | # or 44 | docker build -t ucx-py-rdma -f UCXPy-rdma-core.dockerfile . 45 | ``` 46 | 47 | ### Controlling build-args 48 | 49 | You can control some of the behaviour of the docker file with docker `--build-arg` flags: 50 | 51 | - `UCX_VERSION_TAG`: git committish for the version of UCX to build (default `v1.13.0`); 52 | - `CONDA_HOME`: Where to install conda in the image (default `/opt/conda`); 53 | - `CONDA_ENV`: What to name the conda environment (default `ucx`); 54 | - `CONDA_ENV_SPEC`: yaml file used when initially creating the conda environment (default `ucx-py-cuda11.5.yml`); 55 | - `CUDA_VERSION`: version of cuda toolkit in the base image (default `11.5.2`), must exist in the [nvidia/cuda](https://hub.docker.com/layers/cuda/nvidia/cuda) docker hub image list; 56 | - `DISTRIBUTION_VERSION`: version of distribution in the base image (default `ubuntu20.04`), must exist in the [nvidia/cuda](https://hub.docker.com/layers/cuda/nvidia/cuda) docker hub image list. Note that rdma-core provides forward-compatibility with version 28.0 (shipped with ubuntu20.04) supporting MOFED 5.0 and later. 
Other distributions may provide a different version of rdma-core for which MOFED compatibility may vary; 57 | - `MOFED_VERSION`: (MOFED image only) version of MOFED to download (default `5.3-1.0.5.0`), must match version on host system 58 | 59 | ### Running 60 | 61 | Running the container requires a number of additional flags to expose 62 | high-performance transports from the host. `docker run --privileged` is a 63 | catch-all that will definitely provide enough permissions (`ulimit -l unlimited` 64 | is then needed in the container). Alternately, provide `--ulimit memlock=-1` and 65 | expose devices with `--device /dev/infiniband`, see [the UCX 66 | documentation](https://openucx.readthedocs.io/en/master/running.html#running-in-docker-containers) 67 | for more details. To expose the infiniband devices using IPoIB, we need to in 68 | addition map the relevant host network interfaces, a catchall is just to use `--network host`. 69 | 70 | For example, a run command that exposes all devices available in 71 | `/dev/infiniband` along with the network interfaces on the host is (assuming 72 | that the `ucx-py-rdma` image tag has been built as above): 73 | 74 | ```bash 75 | docker run --ulimit memlock=-1 --device /dev/infiniband --network host -ti ucx-py-rdma /bin/bash 76 | ``` 77 | 78 | UCX-Py is installed via 79 | [mamba](https://mamba.readthedocs.io/en/latest/index.html) in the `ucx` 80 | environment; so 81 | ```bash 82 | source /opt/conda/etc/profile.d/conda.sh 83 | source /opt/conda/etc/profile.d/mamba.sh 84 | mamba activate ucx 85 | ``` 86 | in the container will provide a Python with UCX-Py available. 
87 | -------------------------------------------------------------------------------- /docker/UCXPy-MOFED.dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=11.5.2 2 | ARG DISTRIBUTION_VERSION=ubuntu20.04 3 | FROM nvidia/cuda:${CUDA_VERSION}-devel-${DISTRIBUTION_VERSION} 4 | 5 | # Make available to later build stages 6 | ARG DISTRIBUTION_VERSION 7 | # Should match host OS OFED version (as reported by ofed_info -s) 8 | ARG MOFED_VERSION=5.3-1.0.5.0 9 | # Tag to checkout from UCX repository 10 | ARG UCX_VERSION_TAG=v1.13.0 11 | # Where to install conda, and what to name the created environment 12 | ARG CONDA_HOME=/opt/conda 13 | ARG CONDA_ENV=ucx 14 | # Name of conda spec file in the current working directory that 15 | # will be used to build the conda environment. 16 | ARG CONDA_ENV_SPEC=ucx-py-cuda11.5.yml 17 | 18 | ENV CONDA_ENV="${CONDA_ENV}" 19 | ENV CONDA_HOME="${CONDA_HOME}" 20 | 21 | # Where cuda is installed 22 | ENV CUDA_HOME="/usr/local/cuda" 23 | 24 | SHELL ["/bin/bash", "-c"] 25 | 26 | RUN apt-get update -y \ 27 | && apt-get --fix-missing upgrade -y \ 28 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata \ 29 | && apt-get install -y \ 30 | automake \ 31 | dh-make \ 32 | git \ 33 | libcap2 \ 34 | libtool \ 35 | make \ 36 | pkg-config \ 37 | udev \ 38 | curl \ 39 | && apt-get autoremove -y \ 40 | && apt-get clean 41 | 42 | RUN curl -fsSL https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ 43 | -o /minimamba.sh \ 44 | && bash /minimamba.sh -b -p ${CONDA_HOME} \ 45 | && rm /minimamba.sh 46 | 47 | ENV PATH="${CONDA_HOME}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${CUDA_HOME}/bin" 48 | 49 | RUN curl -fsSL https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-${DISTRIBUTION_VERSION}-x86_64.tgz | tar xz \ 50 | && (cd 
MLNX_OFED_LINUX-${MOFED_VERSION}-${DISTRIBUTION_VERSION}-x86_64 \ 51 | && yes | ./mlnxofedinstall --user-space-only --without-fw-update \ 52 | --without-neohost-backend) \ 53 | && rm -rf /var/lib/apt/lists/* \ 54 | && rm -rf /MLNX_OFED_LINUX-${MOFED_VERSION}-${DISTRIBUTION_VERSION}-x86_64 55 | 56 | WORKDIR /root 57 | COPY ${CONDA_ENV_SPEC} /root/conda-env.yml 58 | COPY build-ucx.sh /root/build-ucx.sh 59 | COPY build-ucx-py.sh /root/build-ucx-py.sh 60 | COPY bench-all.sh /root/bench-all.sh 61 | 62 | RUN mamba env create -n ${CONDA_ENV} --file /root/conda-env.yml 63 | RUN bash ./build-ucx.sh ${UCX_VERSION_TAG} ${CONDA_HOME} ${CONDA_ENV} ${CUDA_HOME} 64 | RUN bash ./build-ucx-py.sh ${CONDA_HOME} ${CONDA_ENV} 65 | CMD ["/root/bench-all.sh", "tcp,cuda_copy,cuda_ipc", "rc,cuda_copy", "all"] 66 | -------------------------------------------------------------------------------- /docker/UCXPy-rdma-core.dockerfile: -------------------------------------------------------------------------------- 1 | ARG CUDA_VERSION=11.5.2 2 | ARG DISTRIBUTION_VERSION=ubuntu20.04 3 | FROM nvidia/cuda:${CUDA_VERSION}-devel-${DISTRIBUTION_VERSION} 4 | 5 | # Tag to checkout from UCX repository 6 | ARG UCX_VERSION_TAG=v1.13.0 7 | # Where to install conda, and what to name the created environment 8 | ARG CONDA_HOME=/opt/conda 9 | ARG CONDA_ENV=ucx 10 | # Name of conda spec file in the current working directory that 11 | # will be used to build the conda environment. 
12 | ARG CONDA_ENV_SPEC=ucx-py-cuda11.5.yml 13 | 14 | ENV CONDA_ENV="${CONDA_ENV}" 15 | ENV CONDA_HOME="${CONDA_HOME}" 16 | 17 | # Where cuda is installed 18 | ENV CUDA_HOME="/usr/local/cuda" 19 | 20 | SHELL ["/bin/bash", "-c"] 21 | 22 | RUN apt-get update -y \ 23 | && apt-get --fix-missing upgrade -y \ 24 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata \ 25 | && apt-get install -y \ 26 | automake \ 27 | dh-make \ 28 | git \ 29 | libcap2 \ 30 | libtool \ 31 | make \ 32 | pkg-config \ 33 | udev \ 34 | curl \ 35 | librdmacm-dev \ 36 | rdma-core \ 37 | && apt-get autoremove -y \ 38 | && apt-get clean 39 | 40 | RUN curl -fsSL https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ 41 | -o /minimamba.sh \ 42 | && bash /minimamba.sh -b -p ${CONDA_HOME} \ 43 | && rm /minimamba.sh 44 | 45 | ENV PATH="${CONDA_HOME}/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${CUDA_HOME}/bin" 46 | 47 | WORKDIR /root 48 | COPY ${CONDA_ENV_SPEC} /root/conda-env.yml 49 | COPY build-ucx.sh /root/build-ucx.sh 50 | COPY build-ucx-py.sh /root/build-ucx-py.sh 51 | 52 | RUN mamba env create -n ${CONDA_ENV} --file /root/conda-env.yml 53 | RUN bash ./build-ucx.sh ${UCX_VERSION_TAG} ${CONDA_HOME} ${CONDA_ENV} ${CUDA_HOME} 54 | RUN bash ./build-ucx-py.sh ${CONDA_HOME} ${CONDA_ENV} 55 | -------------------------------------------------------------------------------- /docker/bench-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2022, NVIDIA CORPORATION. 
3 | 
4 | set -e
5 | 
6 | function logger {
7 | echo -e "\n$@\n"
8 | }
9 | 
10 | # Requires conda installed at /opt/conda and the ucx environment setup
11 | # See UCXPy-CUDA.dockerfile
12 | source /opt/conda/etc/profile.d/conda.sh
13 | conda activate ucx
14 | 
15 | cd ucx-py/
16 | # Benchmark using command-line provided transports or else default
17 | for tls in ${@:-"tcp" "all"}; do
18 | export UCX_TLS=${tls}
19 | logger "Python pytest for ucx-py"
20 | 
21 | logger "Tests (UCX_TLS=${UCX_TLS})"
22 | pytest --cache-clear -vs ucp/_libs/tests
23 | pytest --cache-clear -vs tests/
24 | 
25 | for array_type in "numpy" "cupy" "rmm"; do
26 | logger "Benchmarks (UCX_TLS=${UCX_TLS}, array_type=${array_type})"
27 | python -m ucp.benchmarks.send_recv -l ucp-async -o ${array_type} \
28 | --server-dev 0 --client-dev 0 --reuse-alloc
29 | python -m ucp.benchmarks.send_recv -l ucp-core -o ${array_type} \
30 | --server-dev 0 --client-dev 0 --reuse-alloc
31 | done
32 | done
33 | -------------------------------------------------------------------------------- /docker/build-ucx-py.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | CONDA_HOME=${1:-"/opt/conda"} 5 | CONDA_ENV=${2:-"ucx"} 6 | 7 | source ${CONDA_HOME}/etc/profile.d/conda.sh 8 | source ${CONDA_HOME}/etc/profile.d/mamba.sh 9 | mamba activate ${CONDA_ENV} 10 | 11 | git clone https://github.com/rapidsai/ucx-py.git 12 | pip install -v ucx-py/ 13 | -------------------------------------------------------------------------------- /docker/build-ucx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | UCX_VERSION_TAG=${1:-"v1.13.0"} 5 | CONDA_HOME=${2:-"/opt/conda"} 6 | CONDA_ENV=${3:-"ucx"} 7 | CUDA_HOME=${4:-"/usr/local/cuda"} 8 | # Send any remaining arguments to configure 9 | CONFIGURE_ARGS=${@:5} 10 | 11 | source ${CONDA_HOME}/etc/profile.d/conda.sh 12 | source ${CONDA_HOME}/etc/profile.d/mamba.sh 13 | 
mamba activate ${CONDA_ENV} 14 | 15 | git clone https://github.com/openucx/ucx.git 16 | 17 | cd ucx 18 | git checkout ${UCX_VERSION_TAG} 19 | ./autogen.sh 20 | mkdir build-linux && cd build-linux 21 | ../contrib/configure-release --prefix=${CONDA_PREFIX} --with-sysroot --enable-cma \ 22 | --enable-mt --with-gnu-ld --with-rdmacm --with-verbs \ 23 | --with-cuda=${CUDA_HOME} \ 24 | ${CONFIGURE_ARGS} 25 | make -j install 26 | -------------------------------------------------------------------------------- /docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2021, NVIDIA CORPORATION. 3 | set -e 4 | 5 | function logger { 6 | echo -e "\n$@\n" 7 | } 8 | 9 | PYTHON_PREFIX=$(python -c "import distutils.sysconfig; print(distutils.sysconfig.PREFIX)") 10 | 11 | ################################################################################ 12 | # SETUP - Install python packages and check environment 13 | ################################################################################ 14 | 15 | pip install \ 16 | "pytest" "pytest-asyncio" \ 17 | "dask" "distributed" \ 18 | "cython" 19 | 20 | logger "Check versions" 21 | python --version 22 | pip list 23 | 24 | ################################################################################ 25 | # BUILD - Build UCX master, UCX-Py and run tests 26 | ################################################################################ 27 | logger "Build UCX master" 28 | cd $HOME 29 | git clone https://github.com/openucx/ucx 30 | cd ucx 31 | ./autogen.sh 32 | ./contrib/configure-devel \ 33 | --prefix=$PYTHON_PREFIX \ 34 | --enable-gtest=no \ 35 | --with-valgrind=no 36 | make -j install 37 | 38 | echo $PYTHON_PREFIX >> /etc/ld.so.conf.d/python.conf 39 | ldconfig 40 | 41 | logger "UCX Version and Build Information" 42 | ucx_info -v 43 | 44 | 45 | ################################################################################ 46 | # TEST - Run pytests for 
ucx-py 47 | ################################################################################ 48 | logger "Clone and Build UCX-Py" 49 | cd $HOME 50 | git clone https://github.com/rapidsai/ucx-py 51 | cd ucx-py 52 | python setup.py build_ext --inplace 53 | python -m pip install -e . 54 | 55 | for tls in "tcp" "all"; do 56 | export UCX_TLS=$tls 57 | 58 | logger "Python pytest for ucx-py" 59 | 60 | # Test with TCP/Sockets 61 | logger "Tests (UCX_TLS=$UCX_TLS)" 62 | pytest --cache-clear -vs ucp/_libs/tests 63 | pytest --cache-clear -vs tests/ 64 | 65 | logger "Benchmarks (UCX_TLS=$UCX_TLS)" 66 | python -m ucp.benchmarks.send_recv -l ucp-async -o numpy \ 67 | --server-dev 0 --client-dev 0 --reuse-alloc 68 | python -m ucp.benchmarks.send_recv -l ucp-core -o numpy \ 69 | --server-dev 0 --client-dev 0 --reuse-alloc 70 | done 71 | -------------------------------------------------------------------------------- /docker/ucx-py-cuda11.5.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - rapidsai 3 | - nvidia 4 | - conda-forge 5 | 6 | dependencies: 7 | - python=3.10 8 | - cudatoolkit=11.5 9 | - setuptools 10 | - cython>=3.0.0 11 | - pytest 12 | - pytest-asyncio 13 | - dask 14 | - distributed 15 | - cupy 16 | - numba>=0.59.1,<0.61.0a0 17 | - rmm 18 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/source/_static/Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/ucx-py/37b1e7e097b83218b2e767f0a05dbbc1cb502c2d/docs/source/_static/Architecture.png -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. currentmodule:: ucp 5 | 6 | **ucp** 7 | 8 | .. autosummary:: 9 | ucp 10 | ucp.create_listener 11 | ucp.create_endpoint 12 | ucp.get_address 13 | ucp.get_config 14 | ucp.get_ucp_worker 15 | ucp.get_ucx_version 16 | ucp.init 17 | ucp.progress 18 | ucp.reset 19 | 20 | **Endpoint** 21 | 22 | .. autosummary:: 23 | Endpoint 24 | Endpoint.abort 25 | Endpoint.close 26 | Endpoint.closed 27 | Endpoint.close_after_n_recv 28 | Endpoint.cuda_support 29 | Endpoint.get_ucp_endpoint 30 | Endpoint.get_ucp_worker 31 | Endpoint.recv 32 | Endpoint.send 33 | Endpoint.ucx_info 34 | Endpoint.uid 35 | 36 | **Listener** 37 | 38 | .. autosummary:: 39 | Listener 40 | Listener.close 41 | Listener.closed 42 | Listener.port 43 | 44 | .. currentmodule:: ucp 45 | 46 | .. autofunction:: create_listener 47 | .. autofunction:: create_endpoint 48 | .. autofunction:: get_address 49 | .. autofunction:: get_config 50 | .. autofunction:: get_ucp_worker 51 | .. autofunction:: get_ucx_version 52 | .. autofunction:: init 53 | .. autofunction:: progress 54 | .. autofunction:: reset 55 | 56 | Endpoint 57 | -------- 58 | 59 | .. currentmodule:: ucp 60 | 61 | .. autoclass:: Endpoint 62 | :members: 63 | 64 | 65 | Listener 66 | -------- 67 | 68 | .. currentmodule:: ucp 69 | 70 | .. 
autoclass:: Listener
71 | :members:
72 | -------------------------------------------------------------------------------- /docs/source/deployment.rst: -------------------------------------------------------------------------------- 1 | NVLink and Docker/Kubernetes
2 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3 | 
4 | In order to use NVLink when running in containers using Docker and/or
5 | Kubernetes the processes must share an IPC namespace for NVLink to work
6 | correctly.
7 | 
8 | Many GPUs in one container
9 | ^^^^^^^^^^^^^^^^^^^^^^^^^^
10 | 
11 | The simplest way to ensure that processes accessing GPUs share an IPC
12 | namespace is to run the processes within the same container. This means
13 | exposing multiple GPUs to a single container.
14 | 
15 | Many containers with a shared IPC namespace
16 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
17 | 
18 | If you wish to isolate your processes into multiple containers and
19 | expose one or more GPUs to each container you need to ensure they are
20 | using a shared IPC namespace.
21 | 
22 | In a Docker configuration you can mark one container as having a
23 | shareable IPC namespace with the flag ``--ipc="shareable"``. Other
24 | containers can then share that namespace with the flag
25 | ``--ipc="container: <_name-or-ID_>"`` and passing the name or ID of the
26 | container that is sharing its namespace.
27 | 
28 | You can also share the host IPC namespace with your container with the
29 | flag ``--ipc="host"``, however this is not recommended on multi-tenant
30 | hosts.
31 | 
32 | Privileged pods in a Kubernetes cluster `can also be configured to share
33 | the host IPC`_.
34 | 
35 | For more information see the `Docker documentation`_.
36 | 
37 | .. _can also be configured to share the host IPC: https://kubernetes.io/docs/concepts/policy/pod-security-policy/#host-namespaces
38 | .. 
_Docker documentation: https://docs.docker.com/engine/reference/run/#ipc-settings---ipc -------------------------------------------------------------------------------- /docs/source/glossary.rst: -------------------------------------------------------------------------------- 1 | Glossary 2 | -------- 3 | 4 | - ACK Acknowledge 5 | - am Active Message 6 | - AMO Atomic Memory Operation 7 | - ANL Argonne National Laboratory 8 | - AZP AZure Pipeline 9 | - bcopy Byte copy 10 | - Bistro Binary Instrumentation 11 | - BTL Byte Transfer Layer 12 | - cm Connection Manager 13 | - CMA Cross Memory Attach 14 | - CQ Completion Queue(Infiniband) 15 | - CQE Completion Queue Entry(Infiniband) 16 | - csmock static analysis tools 17 | - CUDA Compute Unified Device Architecture(NVIDIA) 18 | - DC Dynamically Connected transport(Infiniband) 19 | - ep EndPoint 20 | - FC Flow Control 21 | - fd File Descriptor 22 | - GDR GPUDirect RDMA 23 | - gtest Google Test 24 | - HPC High Performance Computing 25 | - HWTM HardWare Tag Matching 26 | - IB Infiniband 27 | - iface Interaface 28 | - IPC Inter Process Communication 29 | - JUCX Java API over UCP 30 | - KLM A new sophisticated way of creating memory regions.- (Mellanox specific) 31 | - KNEM Kernel Nemesis 32 | - LLNL Lawrence Livermore National Laboratory 33 | - madvise give advice about use of memory. 
See manual - madvise(2) 34 | - md Memory Domain 35 | - MEMH Memory Handle 36 | - MLX Mellanox Technologies 37 | - mlx5 Connect-X5 VPI 38 | - mm Memory Mapper 39 | - MPI Message Passing INterface 40 | - MPICH A MPI Implementation 41 | - MTT The MPI Testing Tool 42 | - NAK Negative Acknowledge 43 | - ODP OnDemand Paging 44 | - OFA OpenFabrics Alliance 45 | - OMPI OpenMPI 46 | - OOB Out of band 47 | - OOO Out of Order 48 | - OPA Omni-Path Architecture 49 | - Open MPI A MPI Implementation 50 | - ORNL Oak Ridge National Laboratory 51 | - PCIe PCI Express 52 | - PGAS Partitioned Global Address Space 53 | - POSIX Portable operating system interface 54 | - ppn processes per node 55 | - PR Pull Request 56 | - PROGRESS64 A C library of scalable functions for - concurrent programs, primarily focused on networking - applications.(https://github.com/ARM-software/- progress64) 57 | - QP Queue Pair(Infiniband) 58 | - RC Reliable Connection (Infiniband) 59 | - rcache Registration Cache 60 | - RDMA Remote Direct Memory Access 61 | - REQ Request 62 | - rkey Remote KEY 63 | - RMA Remote Memory Access 64 | - RNR Receiver Not Ready 65 | - RoCE RDMA over Converged Ethernet 66 | - ROCm Radeon Open Compute platform(AMD) 67 | - RTE Run Time Environment 68 | - RX Receive 69 | - Skb Socket Buffer 70 | - sm Shared Memory 71 | - SM Subnet Manager(Infiniband) 72 | - SockCM Socket Connection Manager 73 | - SRQ Shared Receive Queue 74 | - SYSV UNIX System V 75 | - tl Transport Layer 76 | - TLS Transpot LayerS 77 | - TM Tag Matching 78 | - TX Transmit 79 | - UC Unreliable Connection (Infiniband) 80 | - UCF Unified Communication Framework 81 | - UCM Unified Communication Memory 82 | - UCP Unified Communication Protocols Higher level API 83 | - UCS Unified Communication Service Common utilities. 
84 | - UCT Unified Communication Transport Lower level API 85 | - UCX Unified Communication X 86 | - UD Unreliable Datagram (Infiniband) 87 | - uGNI user level generic network interface(Cray) 88 | - UMR User mode memory registration 89 | - VPI Virtual Protocol Interconnect 90 | - WQ Work Queue(Infiniband) 91 | - WQE Work Queue Elements (pronounce WOOKIE) 92 | - WR Work Request 93 | - XPMEM cross partition memory 94 | - XRC eXtended Reliable Connection(Infiniband) 95 | - Zcopy Zero Copy -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | UCX-Py 2 | ====== 3 | 4 | UCX-Py is the Python interface for `UCX `_, a low-level high-performance networking library. UCX and UCX-Py supports several transport methods including InfiniBand and NVLink while still using traditional networking protocols like TCP. 5 | 6 | 7 | .. image:: _static/Architecture.png 8 | :alt: A simple dask dictionary 9 | :align: center 10 | 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :hidden: 15 | 16 | quickstart 17 | install 18 | configuration 19 | deployment 20 | ucx-debug 21 | 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | :hidden: 26 | :caption: Help & reference 27 | 28 | os-limits 29 | transport-monitoring 30 | send-recv 31 | api 32 | glossary 33 | -------------------------------------------------------------------------------- /docs/source/os-limits.rst: -------------------------------------------------------------------------------- 1 | Operating System Limits 2 | ======================= 3 | 4 | 5 | UCX can be affected by a variety of limits, not just defined by UCX itself but also by the operating system. In this section we describe some of the limits that may be encountered by the user when running UCX-Py or just UCX alone. 
6 | 7 | File Descriptors 8 | ---------------- 9 | 10 | In sockets-based connections, multiple file descriptors may be open to establish connections between endpoints. When UCX is establishing connections between endpoints via protocols such as TCP, an error such as below may occur: 11 | 12 | :: 13 | 14 | ucp.exceptions.UCXError: User-defined limit was reached 15 | 16 | One possible cause for this is that the limit established by the OS or system administrators has been reached by the user. This limit can be checked with: 17 | 18 | :: 19 | 20 | $ ulimit -n 21 | 22 | If the user has permission to do so, the file descriptor limit can be increased by typing the new limit after the command above. For example, to set a new limit of 1 million, the following should be executed: 23 | 24 | :: 25 | 26 | $ ulimit -n 1000000 27 | 28 | Another way the number of open files limit can be increased is by editing the limits.conf file in the operating system. Please consult your system administrator for details. 29 | 30 | Please note that the number of open files required may differ according to the application, further investigation may be required to find optimal values. 31 | 32 | For systems with specialized hardware such as InfiniBand, using RDMACM may also help circumventing that issue, as it doesn't rely heavily on file descriptors. 33 | 34 | 35 | Maximum Connections 36 | ------------------- 37 | 38 | UCX respects the operating system's limit of socket listen() backlog, known in userspace as SOMAXCONN. This limit may cause new endpoints connecting to a listener to hang if too many connections happen to be initiated too quickly. 39 | 40 | To check for the current limit, the user can execute the following command: 41 | 42 | :: 43 | 44 | $ sysctl net.core.somaxconn 45 | 46 | For most Linux distros, the default limit is 128.
To increase that limit to 65535 for example, the user may run the following (requires root or sudo permissions): 47 | 48 | :: 49 | 50 | $ sudo sysctl -w net.core.somaxconn=65535 51 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | 5 | Setup 6 | ----- 7 | 8 | Create a new conda environment with UCX-Py: 9 | 10 | :: 11 | 12 | conda create -n ucx -c conda-forge -c rapidsai \ 13 | cudatoolkit= ucx-py 14 | 15 | For a more detailed guide on installation options please refer to the :doc:`install` page. 16 | 17 | Send/Recv NumPy Arrays 18 | ---------------------- 19 | 20 | Process 1 - Server 21 | ~~~~~~~~~~~~~~~~~~ 22 | 23 | .. code-block:: python 24 | 25 | import asyncio 26 | import time 27 | import ucp 28 | import numpy as np 29 | 30 | n_bytes = 2**30 31 | host = ucp.get_address(ifname='eth0') # ethernet device name 32 | port = 13337 33 | 34 | async def send(ep): 35 | # recv buffer 36 | arr = np.empty(n_bytes, dtype='u1') 37 | await ep.recv(arr) 38 | assert np.count_nonzero(arr) == np.array(0, dtype=np.int64) 39 | print("Received NumPy array") 40 | 41 | # increment array and send back 42 | arr += 1 43 | print("Sending incremented NumPy array") 44 | await ep.send(arr) 45 | 46 | await ep.close() 47 | lf.close() 48 | 49 | async def main(): 50 | global lf 51 | lf = ucp.create_listener(send, port) 52 | 53 | while not lf.closed(): 54 | await asyncio.sleep(0.1) 55 | 56 | if __name__ == '__main__': 57 | asyncio.run(main()) 58 | 59 | Process 2 - Client 60 | ~~~~~~~~~~~~~~~~~~ 61 | 62 | ..
code-block:: python 63 | 64 | import asyncio 65 | import ucp 66 | import numpy as np 67 | 68 | port = 13337 69 | n_bytes = 2**30 70 | 71 | async def main(): 72 | host = ucp.get_address(ifname='eth0') # ethernet device name 73 | ep = await ucp.create_endpoint(host, port) 74 | msg = np.zeros(n_bytes, dtype='u1') # create some data to send 75 | 76 | # send message 77 | print("Send Original NumPy array") 78 | await ep.send(msg) # send the real message 79 | 80 | # recv response 81 | print("Receive Incremented NumPy arrays") 82 | resp = np.empty_like(msg) 83 | await ep.recv(resp) # receive the echo 84 | await ep.close() 85 | np.testing.assert_array_equal(msg + 1, resp) 86 | 87 | if __name__ == '__main__': 88 | asyncio.run(main()) 89 | 90 | 91 | 92 | Send/Recv CuPy Arrays 93 | --------------------- 94 | 95 | .. note:: 96 | If you are passing CuPy arrays between GPUs and want to use `NVLINK `_ ensure you have correctly set ``UCX_TLS`` with ``cuda_ipc``. See the :doc:`configuration` for more details 97 | 98 | Process 1 - Server 99 | ~~~~~~~~~~~~~~~~~~ 100 | 101 | .. code-block:: python 102 | 103 | import asyncio 104 | import time 105 | import ucp 106 | import cupy as cp 107 | 108 | n_bytes = 2**30 109 | host = ucp.get_address(ifname='eth0') # ethernet device name 110 | port = 13337 111 | 112 | async def send(ep): 113 | # recv buffer 114 | arr = cp.empty(n_bytes, dtype='u1') 115 | await ep.recv(arr) 116 | assert cp.count_nonzero(arr) == cp.array(0, dtype=cp.int64) 117 | print("Received CuPy array") 118 | 119 | # increment array and send back 120 | arr += 1 121 | print("Sending incremented CuPy array") 122 | await ep.send(arr) 123 | 124 | await ep.close() 125 | lf.close() 126 | 127 | async def main(): 128 | global lf 129 | lf = ucp.create_listener(send, port) 130 | 131 | while not lf.closed(): 132 | await asyncio.sleep(0.1) 133 | 134 | if __name__ == '__main__': 135 | asyncio.run(main()) 136 | 137 | Process 2 - Client 138 | ~~~~~~~~~~~~~~~~~~ 139 | 140 | .. 
code-block:: python 141 | 142 | import asyncio 143 | import ucp 144 | import cupy as cp 145 | import numpy as np 146 | 147 | port = 13337 148 | n_bytes = 2**30 149 | 150 | async def main(): 151 | host = ucp.get_address(ifname='eth0') # ethernet device name 152 | ep = await ucp.create_endpoint(host, port) 153 | msg = cp.zeros(n_bytes, dtype='u1') # create some data to send 154 | 155 | # send message 156 | print("Send Original CuPy array") 157 | await ep.send(msg) # send the real message 158 | 159 | # recv response 160 | print("Receive Incremented CuPy arrays") 161 | resp = cp.empty_like(msg) 162 | await ep.recv(resp) # receive the echo 163 | await ep.close() 164 | cp.testing.assert_array_equal(msg + 1, resp) 165 | 166 | if __name__ == '__main__': 167 | asyncio.run(main()) 168 | -------------------------------------------------------------------------------- /docs/source/transport-monitoring.rst: -------------------------------------------------------------------------------- 1 | Monitoring Transports 2 | ===================== 3 | 4 | Below is a list of commonly used tools and commands to monitor InfiniBand and CUDA IPC messages: 5 | 6 | 7 | Infiniband 8 | ---------- 9 | 10 | Monitor InfiniBand packet counters -- this number should dramatically increase when there's InfiniBand traffic: 11 | 12 | :: 13 | 14 | watch -n 0.1 'cat /sys/class/infiniband/mlx5_*/ports/1/counters/port_xmit_data' 15 | 16 | 17 | CUDA IPC/NVLink 18 | --------------- 19 | 20 | Monitor traffic over all GPUs 21 | 22 | :: 23 | 24 | nvidia-smi nvlink -gt d 25 | 26 | 27 | Monitor traffic over all GPUs on counter 0 28 | 29 | .. 
note:: 30 | nvidia-smi nvlink -g is now deprecated 31 | 32 | :: 33 | 34 | # set counters 35 | nvidia-smi nvlink -sc 0bz 36 | watch -d 'nvidia-smi nvlink -g 0' 37 | 38 | 39 | Stats Monitoring of GPUs 40 | :: 41 | 42 | dcgmi dmon -e 449 43 | 44 | `nvdashboard `_ 45 | -------------------------------------------------------------------------------- /examples/cudf-example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | from dask_cuda import LocalCUDACluster 5 | from dask_cuda.initialize import initialize 6 | from distributed import Client 7 | 8 | import cudf 9 | import dask_cudf 10 | 11 | enable_tcp_over_ucx = True 12 | enable_infiniband = False 13 | enable_nvlink = False 14 | 15 | 16 | async def run(): 17 | initialize( 18 | create_cuda_context=True, 19 | enable_tcp_over_ucx=enable_tcp_over_ucx, 20 | enable_infiniband=enable_infiniband, 21 | enable_nvlink=enable_nvlink, 22 | ) 23 | 24 | async with LocalCUDACluster( 25 | interface="enp1s0f0", 26 | protocol="ucx", 27 | enable_tcp_over_ucx=enable_tcp_over_ucx, 28 | enable_infiniband=enable_infiniband, 29 | enable_nvlink=enable_nvlink, 30 | asynchronous=True, 31 | ) as cluster: 32 | async with Client(cluster, asynchronous=True) as client: 33 | d = dask_cudf.from_cudf( 34 | cudf.DataFrame({"a": range(2**16)}), npartitions=2 35 | ) 36 | r = d.sum() 37 | 38 | for i in range(100): 39 | print("Running iteration:", i) 40 | start = time.time() 41 | await client.compute(r) 42 | print("Time for iteration", i, ":", time.time() - start) 43 | 44 | 45 | if __name__ == "__main__": 46 | asyncio.run(run()) 47 | -------------------------------------------------------------------------------- /examples/cupy-example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | import cupy 5 | 6 | from dask import array as da 7 | from dask_cuda import LocalCUDACluster 8 | from dask_cuda.initialize 
import initialize 9 | from distributed import Client 10 | 11 | enable_tcp_over_ucx = True 12 | enable_infiniband = False 13 | enable_nvlink = False 14 | 15 | 16 | async def run(): 17 | initialize( 18 | create_cuda_context=True, 19 | enable_tcp_over_ucx=enable_tcp_over_ucx, 20 | enable_infiniband=enable_infiniband, 21 | enable_nvlink=enable_nvlink, 22 | ) 23 | 24 | async with LocalCUDACluster( 25 | interface="enp1s0f0", 26 | protocol="ucx", 27 | enable_tcp_over_ucx=enable_tcp_over_ucx, 28 | enable_infiniband=enable_infiniband, 29 | enable_nvlink=enable_nvlink, 30 | asynchronous=True, 31 | ) as cluster: 32 | async with Client(cluster, asynchronous=True) as client: 33 | rs = da.random.RandomState(RandomState=cupy.random.RandomState) 34 | a = rs.normal(10, 1, (int(4e3), int(4e3)), chunks=(int(1e3), int(1e3))) 35 | x = a + a.T 36 | 37 | for i in range(100): 38 | print("Running iteration:", i) 39 | start = time.time() 40 | await client.compute(x) 41 | print("Time for iteration", i, ":", time.time() - start) 42 | 43 | 44 | if __name__ == "__main__": 45 | asyncio.run(run()) 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021-2025, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | [build-system] 16 | build-backend = "rapids_build_backend.build" 17 | requires = [ 18 | "rapids-build-backend>=0.3.1,<0.4.0dev0", 19 | "setuptools>=64.0.0", 20 | ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 21 | 22 | [project] 23 | name = "ucx-py" 24 | dynamic = ["version"] 25 | description = "Python Bindings for the Unified Communication X library (UCX)" 26 | readme = { file = "README.md", content-type = "text/markdown" } 27 | authors = [ 28 | { name = "NVIDIA Corporation" }, 29 | ] 30 | license = { text = "BSD-3-Clause" } 31 | requires-python = ">=3.10" 32 | dependencies = [ 33 | "numpy>=1.23,<3.0a0", 34 | "pynvml>=12.0.0,<13.0.0a0", 35 | ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 36 | classifiers = [ 37 | "Intended Audience :: Developers", 38 | "Intended Audience :: System Administrators", 39 | "License :: OSI Approved :: BSD License", 40 | "Operating System :: POSIX :: Linux", 41 | "Programming Language :: Python", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | "Topic :: System :: Hardware", 44 | "Topic :: System :: Systems Administration", 45 | "Programming Language :: Python :: 3", 46 | ] 47 | 48 | [project.optional-dependencies] 49 | test = [ 50 | "cloudpickle", 51 | "cudf==25.8.*,>=0.0.0a0", 52 | "cupy-cuda12x>=12.0.0", 53 | "distributed", 54 | "numba>=0.59.1,<0.62.0a0", 55 | "pytest-asyncio", 56 | "pytest-rerunfailures", 57 | "pytest==7.*", 58 | "rapids-dask-dependency==25.8.*,>=0.0.0a0", 59 | ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
60 | 61 | [project.urls] 62 | Homepage = "https://github.com/rapidsai/ucx-py" 63 | Documentation = "https://ucx-py.readthedocs.io/en/stable/" 64 | Source = "https://github.com/rapidsai/ucx-py" 65 | 66 | [tool.isort] 67 | line_length = 79 68 | multi_line_output = 3 69 | include_trailing_comma = true 70 | force_grid_wrap = 0 71 | combine_as_imports = true 72 | order_by_type = true 73 | known_dask = [ 74 | "dask", 75 | "distributed", 76 | "dask_cuda", 77 | ] 78 | known_rapids = [ 79 | "rmm", 80 | "cuml", 81 | "cugraph", 82 | "dask_cudf", 83 | "cudf", 84 | ] 85 | known_first_party = [ 86 | "ucp", 87 | ] 88 | default_section = "THIRDPARTY" 89 | sections = [ 90 | "FUTURE", 91 | "STDLIB", 92 | "THIRDPARTY", 93 | "DASK", 94 | "RAPIDS", 95 | "FIRSTPARTY", 96 | "LOCALFOLDER", 97 | ] 98 | skip = [ 99 | ".eggs", 100 | ".git", 101 | ".hg", 102 | ".mypy_cache", 103 | ".tox", 104 | ".venv", 105 | "build", 106 | "dist", 107 | "__init__.py", 108 | ] 109 | 110 | [tool.pytest.ini_options] 111 | xfail_strict = true 112 | addopts = "--tb=native" 113 | 114 | [tool.rapids-build-backend] 115 | build-backend = "setuptools.build_meta" 116 | commit-files = [ 117 | "ucp/COMMIT_FILE" 118 | ] 119 | # by default, do not rename the package 'ucx-py-cu${ver}' 120 | # (this is overridden in wheel publishing) 121 | disable-cuda=true 122 | dependencies-file = "dependencies.yaml" 123 | matrix-entry = "cuda_suffixed=true" 124 | requires = [ 125 | "cython>=3.0.0", 126 | ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit dependencies.yaml and run `rapids-dependency-file-generator`. 
127 | 128 | [tool.setuptools] 129 | license-files = ["LICENSE"] 130 | zip-safe = false 131 | 132 | [tool.setuptools.packages.find] 133 | exclude=["*tests*"] 134 | 135 | [tool.setuptools.dynamic] 136 | version = {file = "ucp/VERSION"} 137 | 138 | [tool.pydistcheck] 139 | select = [ 140 | "distro-too-large-compressed", 141 | ] 142 | 143 | # PyPI limit is 100 MiB, fail CI before we get too close to that 144 | max_allowed_size_compressed = '75M' 145 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | # This file is a copy of what is available in a Cython demo + some additions 5 | 6 | from __future__ import absolute_import, print_function 7 | 8 | import glob 9 | import os 10 | from distutils.sysconfig import get_config_var, get_python_inc 11 | 12 | from Cython.Distutils.build_ext import new_build_ext 13 | from setuptools import setup 14 | from setuptools.extension import Extension 15 | 16 | 17 | def _find_libucx_libs_and_headers(): 18 | """ 19 | If the 'libucx' wheel is not installed, returns a tuple of empty lists. 20 | In that case, the project will be compiled against system installations 21 | of the UCX libraries. 22 | 23 | If 'libucx' is installed, returns lists of library and header paths to help 24 | the compiler and linker find its contents. In that case, the project will 25 | be compiled against those libucx-wheel-provided versions of the UCX libraries. 
26 | """ 27 | try: 28 | import libucx 29 | except ImportError: 30 | return [], [] 31 | 32 | # find 'libucx' 33 | module_dir = os.path.dirname(libucx.__file__) 34 | 35 | # find where it stores files like 'libucm.so.0' 36 | libs = glob.glob(f"{module_dir}/**/lib*.so*", recursive=True) 37 | 38 | # deduplicate those library paths 39 | lib_dirs = {os.path.dirname(f) for f in libs} 40 | if not lib_dirs: 41 | raise RuntimeError( 42 | f"Did not find shared libraries in 'libucx' install location ({module_dir})" 43 | ) 44 | 45 | # find where it stores headers 46 | headers = glob.glob(f"{module_dir}/**/include", recursive=True) 47 | 48 | # deduplicate those header paths (and ensure the list only includes directories) 49 | header_dirs = {f for f in headers if os.path.isdir(f)} 50 | if not header_dirs: 51 | raise RuntimeError( 52 | f"Did not find UCX headers 'libucx' install location ({module_dir})" 53 | ) 54 | 55 | return list(lib_dirs), list(header_dirs) 56 | 57 | 58 | include_dirs = [os.path.dirname(get_python_inc())] 59 | library_dirs = [get_config_var("LIBDIR")] 60 | libraries = ["ucp", "uct", "ucm", "ucs"] 61 | extra_compile_args = ["-std=c99", "-Werror"] 62 | 63 | # tell the compiler and linker where to find UCX libraries and their headers 64 | # provided by the 'libucx' wheel 65 | libucx_lib_dirs, libucx_header_dirs = _find_libucx_libs_and_headers() 66 | library_dirs.extend(libucx_lib_dirs) 67 | include_dirs.extend(libucx_header_dirs) 68 | 69 | 70 | ext_modules = [ 71 | Extension( 72 | "ucp._libs.ucx_api", 73 | sources=["ucp/_libs/ucx_api.pyx", "ucp/_libs/src/c_util.c"], 74 | depends=["ucp/_libs/src/c_util.h", "ucp/_libs/ucx_api_dep.pxd"], 75 | include_dirs=include_dirs, 76 | library_dirs=library_dirs, 77 | libraries=libraries, 78 | extra_compile_args=extra_compile_args, 79 | ), 80 | Extension( 81 | "ucp._libs.arr", 82 | sources=["ucp/_libs/arr.pyx"], 83 | include_dirs=include_dirs, 84 | library_dirs=library_dirs, 85 | libraries=libraries, 86 | 
extra_compile_args=extra_compile_args, 87 | ), 88 | ] 89 | 90 | setup( 91 | ext_modules=ext_modules, 92 | cmdclass={"build_ext": new_build_ext}, 93 | package_data={"ucp": ["VERSION"]}, 94 | ) 95 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | 4 | import pytest 5 | 6 | import ucp 7 | 8 | # Prevent calls such as `cudf = pytest.importorskip("cudf")` from initializing 9 | # a CUDA context. Such calls may cause tests that must initialize the CUDA 10 | # context on the appropriate device to fail. 11 | # For example, without `RAPIDS_NO_INITIALIZE=True`, `test_benchmark_cluster` 12 | # will succeed if running alone, but fails when all tests are run in batch. 13 | os.environ["RAPIDS_NO_INITIALIZE"] = "True" 14 | 15 | 16 | def pytest_addoption(parser): 17 | parser.addoption( 18 | "--runslow", action="store_true", default=False, help="run slow tests" 19 | ) 20 | 21 | 22 | def pytest_configure(config): 23 | config.addinivalue_line("markers", "slow: mark test as slow to run") 24 | 25 | 26 | def pytest_collection_modifyitems(config, items): 27 | if config.getoption("--runslow"): 28 | # --runslow given in cli: do not skip slow tests 29 | return 30 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 31 | for item in items: 32 | if "slow" in item.keywords: 33 | item.add_marker(skip_slow) 34 | 35 | 36 | def handle_exception(loop, context): 37 | msg = context.get("exception", context["message"]) 38 | print(msg) 39 | 40 | 41 | # Let's make sure that UCX gets time to cancel 42 | # progress tasks before closing the event loop. 
43 | @pytest.fixture() 44 | def event_loop(scope="session"): 45 | loop = asyncio.new_event_loop() 46 | loop.set_exception_handler(handle_exception) 47 | ucp.reset() 48 | yield loop 49 | ucp.reset() 50 | loop.run_until_complete(asyncio.sleep(0)) 51 | loop.close() 52 | -------------------------------------------------------------------------------- /tests/test_benchmark_cluster.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import tempfile 3 | from itertools import chain 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | from ucp.benchmarks.utils import _run_cluster_server, _run_cluster_workers 9 | 10 | 11 | async def _worker(rank, eps, args): 12 | futures = [] 13 | # Send my rank to all others 14 | for ep in eps.values(): 15 | futures.append(ep.send(np.array([rank], dtype="u4"))) 16 | # Recv from all others 17 | result = np.empty(len(eps.values()), dtype="u4") 18 | futures += list(ep.recv(result[i : i + 1]) for i, ep in enumerate(eps.values())) 19 | 20 | # Wait for transfers to complete 21 | await asyncio.gather(*futures) 22 | 23 | # We expect to get the sum of all ranks excluding ours 24 | expect = sum(range(len(eps) + 1)) - rank 25 | assert expect == result.sum() 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_benchmark_cluster(n_chunks=1, n_nodes=2, n_workers=2): 30 | server_file = tempfile.NamedTemporaryFile() 31 | 32 | server, server_ret = _run_cluster_server(server_file.name, n_nodes * n_workers) 33 | 34 | # Wait for server to become available 35 | with open(server_file.name, "r") as f: 36 | while len(f.read()) == 0: 37 | pass 38 | 39 | workers = list( 40 | chain.from_iterable( 41 | _run_cluster_workers(server_file.name, n_chunks, n_workers, i, _worker) 42 | for i in range(n_nodes) 43 | ) 44 | ) 45 | 46 | for worker in workers: 47 | worker.join() 48 | assert not worker.exitcode 49 | 50 | server.join() 51 | assert not server.exitcode 52 | 
-------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import patch 3 | 4 | import pytest 5 | from utils import captured_logger 6 | 7 | import ucp 8 | 9 | 10 | def test_get_config(): 11 | with patch.dict(os.environ): 12 | # Unset to test default value 13 | if os.environ.get("UCX_TLS") is not None: 14 | del os.environ["UCX_TLS"] 15 | ucp.reset() 16 | config = ucp.get_config() 17 | assert isinstance(config, dict) 18 | assert config["TLS"] == "all" 19 | 20 | 21 | @patch.dict(os.environ, {"UCX_SEG_SIZE": "2M"}) 22 | def test_set_env(): 23 | ucp.reset() 24 | config = ucp.get_config() 25 | assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"] 26 | 27 | 28 | @patch.dict(os.environ, {"UCX_SEG_SIZE": "2M"}) 29 | def test_init_options(): 30 | ucp.reset() 31 | options = {"SEG_SIZE": "3M"} 32 | # environment specification should be ignored 33 | ucp.init(options) 34 | config = ucp.get_config() 35 | assert config["SEG_SIZE"] == options["SEG_SIZE"] 36 | 37 | 38 | @patch.dict(os.environ, {"UCX_SEG_SIZE": "4M"}) 39 | def test_init_options_and_env(): 40 | ucp.reset() 41 | options = {"SEG_SIZE": "3M"} # Should be ignored 42 | ucp.init(options, env_takes_precedence=True) 43 | config = ucp.get_config() 44 | assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"] 45 | # Provided options dict was not modified. 46 | assert options == {"SEG_SIZE": "3M"} 47 | 48 | 49 | @pytest.mark.skipif( 50 | ucp.get_ucx_version() >= (1, 12, 0), 51 | reason="Beginning with UCX >= 1.12, it's only possible to validate " 52 | "UCP options but not options from other modules such as UCT. 
" 53 | "See https://github.com/openucx/ucx/issues/7519.", 54 | ) 55 | def test_init_unknown_option(): 56 | ucp.reset() 57 | options = {"UNKNOWN_OPTION": "3M"} 58 | with pytest.raises(ucp.exceptions.UCXConfigError): 59 | ucp.init(options) 60 | 61 | 62 | def test_init_invalid_option(): 63 | ucp.reset() 64 | options = {"SEG_SIZE": "invalid-size"} 65 | with pytest.raises(ucp.exceptions.UCXConfigError): 66 | ucp.init(options) 67 | 68 | 69 | @patch.dict(os.environ, {"UCX_SEG_SIZE": "2M"}) 70 | def test_logging(): 71 | """ 72 | Test default logging configuration. 73 | """ 74 | import logging 75 | 76 | root = logging.getLogger("ucx") 77 | 78 | # ucp.init will only print INFO LINES 79 | with captured_logger(root, level=logging.INFO) as foreign_log: 80 | ucp.reset() 81 | options = {"SEG_SIZE": "3M"} 82 | ucp.init(options) 83 | assert len(foreign_log.getvalue()) > 0 84 | 85 | with captured_logger(root, level=logging.ERROR) as foreign_log: 86 | ucp.reset() 87 | options = {"SEG_SIZE": "3M"} 88 | ucp.init(options) 89 | 90 | assert len(foreign_log.getvalue()) == 0 91 | -------------------------------------------------------------------------------- /tests/test_custom_send_recv.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pickle 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | import ucp 8 | 9 | cudf = pytest.importorskip("cudf") 10 | distributed = pytest.importorskip("distributed") 11 | cuda = pytest.importorskip("numba.cuda") 12 | 13 | 14 | @pytest.mark.asyncio 15 | @pytest.mark.parametrize( 16 | "g", 17 | [ 18 | lambda cudf: cudf.Series([1, 2, 3]), 19 | lambda cudf: cudf.Series([1, 2, 3], index=[4, 5, 6]), 20 | lambda cudf: cudf.Series([1, None, 3]), 21 | lambda cudf: cudf.Series(range(2**13)), 22 | lambda cudf: cudf.DataFrame({"a": np.random.random(1200000)}), 23 | lambda cudf: cudf.DataFrame({"a": range(2**20)}), 24 | lambda cudf: cudf.DataFrame({"a": range(2**26)}), 25 | lambda cudf: cudf.Series(), 26 
| lambda cudf: cudf.DataFrame(), 27 | lambda cudf: cudf.DataFrame({"a": [], "b": []}), 28 | lambda cudf: cudf.DataFrame({"a": [1.0], "b": [2.0]}), 29 | lambda cudf: cudf.DataFrame( 30 | {"a": ["a", "b", "c", "d"], "b": ["a", "b", "c", "d"]} 31 | ), 32 | lambda cudf: cudf.datasets.timeseries(), # ts index with ints, cats, floats 33 | ], 34 | ) 35 | async def test_send_recv_cudf(event_loop, g): 36 | from distributed.utils import nbytes 37 | 38 | class UCX: 39 | def __init__(self, ep): 40 | self.ep = ep 41 | 42 | async def write(self, cdf): 43 | header, _frames = cdf.serialize() 44 | frames = [pickle.dumps(header)] + _frames 45 | 46 | # Send meta data 47 | await self.ep.send(np.array([len(frames)], dtype=np.uint64)) 48 | await self.ep.send( 49 | np.array( 50 | [hasattr(f, "__cuda_array_interface__") for f in frames], 51 | dtype=bool, 52 | ) 53 | ) 54 | await self.ep.send(np.array([nbytes(f) for f in frames], dtype=np.uint64)) 55 | # Send frames 56 | for frame in frames: 57 | if nbytes(frame) > 0: 58 | await self.ep.send(frame) 59 | 60 | async def read(self): 61 | try: 62 | # Recv meta data 63 | nframes = np.empty(1, dtype=np.uint64) 64 | await self.ep.recv(nframes) 65 | is_cudas = np.empty(nframes[0], dtype=bool) 66 | await self.ep.recv(is_cudas) 67 | sizes = np.empty(nframes[0], dtype=np.uint64) 68 | await self.ep.recv(sizes) 69 | except (ucp.exceptions.UCXCanceled, ucp.exceptions.UCXCloseError) as e: 70 | msg = "SOMETHING TERRIBLE HAS HAPPENED IN THE TEST" 71 | raise e(msg) 72 | else: 73 | # Recv frames 74 | frames = [] 75 | for is_cuda, size in zip(is_cudas.tolist(), sizes.tolist()): 76 | if size > 0: 77 | if is_cuda: 78 | frame = cuda.device_array((size,), dtype=np.uint8) 79 | else: 80 | frame = np.empty(size, dtype=np.uint8) 81 | await self.ep.recv(frame) 82 | frames.append(frame) 83 | else: 84 | if is_cuda: 85 | frames.append(cuda.device_array((0,), dtype=np.uint8)) 86 | else: 87 | frames.append(b"") 88 | return frames 89 | 90 | class UCXListener: 91 | def 
async def mp_queue_get_nowait(queue):
    """Poll a multiprocessing queue without blocking the event loop.

    Repeatedly attempts a non-blocking ``get_nowait()``, yielding control
    to the loop for 10 ms between attempts, until an item is available.
    Returns the first item retrieved.
    """
    while True:
        try:
            item = queue.get_nowait()
        except Empty:
            # Nothing queued yet; let other tasks run before retrying.
            await asyncio.sleep(0.01)
        else:
            return item
def _test_shutdown_unexpected_closed_peer_client(
    client_queue, server_queue, endpoint_error_handling
):
    """Client process: connect, receive one message, then exit abruptly."""

    async def run():
        server_port = client_queue.get()
        ep = await ucp.create_endpoint(
            ucp.get_address(),
            server_port,
            endpoint_error_handling=endpoint_error_handling,
        )
        msg = np.empty(100, dtype=np.int64)
        await ep.recv(msg)

    get_event_loop().run_until_complete(run())


@pytest.mark.parametrize("endpoint_error_handling", [True, False])
def test_shutdown_unexpected_closed_peer(caplog, endpoint_error_handling):
    """
    Test clean server shutdown after unexpected peer close.

    This will cause some UCX warnings to be issued, but this is as expected.
    The main goal is to assert that the processes exit without errors
    despite a somewhat messy initial state.
    """
    if endpoint_error_handling is False and any(
        [
            t.startswith(i)
            for i in ("rc", "dc", "ud")
            for t in ucp.get_active_transports()
        ]
    ):
        # BUG FIX: the adjacent string literals previously concatenated to
        # "...or udtransport is enabled" (missing space after "ud").
        pytest.skip(
            "Endpoint error handling is required when rc, dc or ud "
            "transport is enabled"
        )

    client_queue = mp.Queue()
    server_queue = mp.Queue()
    p1 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_server,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p1.start()
    p2 = mp.Process(
        target=_test_shutdown_unexpected_closed_peer_client,
        args=(client_queue, server_queue, endpoint_error_handling),
    )
    p2.start()
    p2.join()
    # Signal the server that the client has terminated so it can probe the
    # endpoint's liveness and shut down.
    server_queue.put("client is down")
    p1.join()

    assert not p1.exitcode
    assert not p2.exitcode
@pytest.mark.parametrize(
    "transports",
    ["posix", "tcp", "posix,tcp"],
)
def test_check_transport(transports):
    """Initialize UCX with an explicit TLS list and inspect active transports."""
    requested = transports.split(",")
    inactive = list(set(["posix", "tcp"]) - set(requested))

    ucp.reset()
    ucp.init({"TLS": transports, "NET_DEVICES": "all"})

    active = ucp.get_active_transports()
    # Every requested transport must appear (by prefix) among the active ones.
    for name in requested:
        assert any(at.startswith(name) for at in active)
    # NOTE(review): this mirrors the original assertion, which only requires
    # that *some* active transport lacks the inactive prefix -- almost always
    # trivially true.  A stricter check would be `not any(...)`; confirm the
    # intent before tightening.
    for name in inactive:
        assert any(not at.startswith(name) for at in active)
def get_somaxconn():
    """Return the kernel's listen-backlog limit from /proc (net.core.somaxconn)."""
    with open("/proc/sys/net/core/somaxconn", "r") as f:
        return int(f.readline())


async def hello(ep):
    """Exchange a small array in both directions and verify it round-trips."""
    msg2send = np.arange(10)
    msg2recv = np.empty_like(msg2send)
    send_fut = ep.send(msg2send)
    recv_fut = ep.recv(msg2recv)
    await send_fut
    await recv_fut
    np.testing.assert_array_equal(msg2send, msg2recv)
    assert isinstance(ep.ucx_info(), str)


async def server_node(ep):
    """Server side: run the hello exchange, then close the endpoint."""
    await hello(ep)
    assert isinstance(ep.ucx_info(), str)
    await ep.close()


async def client_node(port):
    """Client side: connect to *port* and run the hello exchange."""
    ep = await ucp.create_endpoint(ucp.get_address(), port)
    await hello(ep)
    assert isinstance(ep.ucx_info(), str)
class ResetAfterN:
    """Callable that invokes ``ucp.reset()`` on exactly its n-th call."""

    def __init__(self, n):
        self.n = n
        self.count = 0

    def __call__(self):
        self.count += 1
        if self.count != self.n:
            return
        ucp.reset()
def test_reset(): 21 | reset = ResetAfterN(2) 22 | 23 | def server(ep): 24 | ep.abort() 25 | reset() 26 | 27 | lt = ucp.create_listener(server) 28 | ep = await ucp.create_endpoint(ucp.get_address(), lt.port) 29 | del lt 30 | del ep 31 | reset() 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_lt_still_in_scope_error(): 36 | reset = ResetAfterN(2) 37 | 38 | def server(ep): 39 | ep.abort() 40 | reset() 41 | 42 | lt = ucp.create_listener(server) 43 | ep = await ucp.create_endpoint(ucp.get_address(), lt.port) 44 | del ep 45 | with pytest.raises( 46 | ucp.exceptions.UCXError, 47 | match="Trying to reset UCX but not all Endpoints and/or Listeners are closed()", 48 | ): 49 | ucp.reset() 50 | 51 | lt.close() 52 | ucp.reset() 53 | 54 | 55 | @pytest.mark.asyncio 56 | async def test_ep_still_in_scope_error(): 57 | reset = ResetAfterN(2) 58 | 59 | def server(ep): 60 | ep.abort() 61 | reset() 62 | 63 | lt = ucp.create_listener(server) 64 | ep = await ucp.create_endpoint(ucp.get_address(), lt.port) 65 | del lt 66 | with pytest.raises( 67 | ucp.exceptions.UCXError, 68 | match="Trying to reset UCX but not all Endpoints and/or Listeners are closed()", 69 | ): 70 | ucp.reset() 71 | ep.abort() 72 | ucp.reset() 73 | -------------------------------------------------------------------------------- /tests/test_rma.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import ucp 4 | 5 | 6 | @pytest.mark.asyncio 7 | @pytest.mark.parametrize("blocking_progress_mode", [True, False]) 8 | async def test_fence(blocking_progress_mode): 9 | # Test needs to be async here to ensure progress tasks are cleared 10 | # and avoid warnings. 
def _bytearray_assert_equal(a, b):
    """Equality assertion usable as a validator callback."""
    assert a == b


def get_data():
    """Build the per-memory-type parameter matrix for the AM send/recv tests.

    Always returns the two host cases (bytearray and numpy); appends a CUDA
    case backed by cupy when cupy is importable.  Each entry provides an
    ``allocator`` for receive buffers, a ``generator`` for payloads, a
    ``validator`` comparing received vs expected, and a ``memory_type`` tag.
    """
    host_cases = [
        {
            "allocator": bytearray,
            "generator": lambda n: bytearray(b"m" * n),
            "validator": lambda recv, exp: _bytearray_assert_equal(recv, exp),
            "memory_type": "host",
        },
        {
            "allocator": partial(np.ones, dtype=np.uint8),
            "generator": partial(np.arange, dtype=np.int64),
            "validator": lambda recv, exp: np.testing.assert_equal(
                recv.view(np.int64), exp
            ),
            "memory_type": "host",
        },
    ]

    try:
        import cupy as cp
    except ImportError:
        # No GPU stack available; host cases only.
        return host_cases

    return host_cases + [
        {
            "allocator": partial(cp.ones, dtype=np.uint8),
            "generator": partial(cp.arange, dtype=np.int64),
            "validator": lambda recv, exp: cp.testing.assert_array_equal(
                recv.view(np.int64), exp
            ),
            "memory_type": "cuda",
        }
    ]
False]) 65 | @pytest.mark.parametrize("recv_wait", [True, False]) 66 | @pytest.mark.parametrize("data", get_data()) 67 | async def test_send_recv_am(size, blocking_progress_mode, recv_wait, data): 68 | rndv_thresh = 8192 69 | ucp.init( 70 | options={"RNDV_THRESH": str(rndv_thresh)}, 71 | blocking_progress_mode=blocking_progress_mode, 72 | ) 73 | 74 | ucp.register_am_allocator(data["allocator"], data["memory_type"]) 75 | msg = data["generator"](size) 76 | 77 | recv = [] 78 | listener = ucp.create_listener(simple_server(size, recv)) 79 | num_clients = 1 80 | clients = [ 81 | await ucp.create_endpoint(ucp.get_address(), listener.port) 82 | for i in range(num_clients) 83 | ] 84 | for c in clients: 85 | if recv_wait: 86 | # By sleeping here we ensure that the listener's 87 | # ep.am_recv call will have to wait, rather than return 88 | # immediately as receive data is already available. 89 | await asyncio.sleep(1) 90 | await c.am_send(msg) 91 | recv_msg = await c.am_recv() 92 | for c in clients: 93 | await c.close() 94 | listener.close() 95 | 96 | if data["memory_type"] == "cuda" and msg.nbytes < rndv_thresh: 97 | # Eager messages are always received on the host, if no host 98 | # allocator is registered UCX-Py defaults to `bytearray`. 
@pytest.mark.asyncio
async def test_tag_match():
    """Messages must be matched by tag, not by arrival order."""
    msg1 = bytes("msg1", "utf-8")
    msg2 = bytes("msg2", "utf-8")

    async def server_node(ep):
        f1 = ep.send(msg1, tag="msg1")
        await asyncio.sleep(1)  # Let msg1 finish
        f2 = ep.send(msg2, tag="msg2")
        await asyncio.gather(f1, f2)
        await ep.close()

    lf = ucp.create_listener(server_node)
    ep = await ucp.create_endpoint(ucp.get_address(), lf.port)
    m1 = bytearray(len(msg1))
    m2 = bytearray(len(msg2))
    f2 = asyncio.create_task(ep.recv(m2, tag="msg2"))

    # f2 must stay pending: its tag "msg2" cannot match the server's
    # in-flight "msg1" send.
    done, pending = await asyncio.wait({f2}, timeout=0.01)
    assert f2 in pending
    # "msg1" should be ready
    await ep.recv(m1, tag="msg1")
    assert m1 == msg1
    await f2
    assert m2 == msg2
def test_get_ucx_version():
    """get_ucx_version() must work without initializing a UCX context."""
    version = ucp.get_ucx_version()
    assert isinstance(version, tuple)
    assert len(version) == 3
    # Check UCX isn't initialized
    assert ucp.core._ctx is None


def test_version_constants_are_populated():
    # __git_commit__ will only be non-empty in a built distribution
    assert isinstance(ucp.__git_commit__, str)

    # __version__ should always be non-empty
    assert isinstance(ucp.__version__, str)
    assert ucp.__version__ != ""


def test_ucx_version_constant():
    assert isinstance(ucp.__ucx_version__, str)
@contextmanager
def captured_logger(logger, level=logging.INFO, propagate=None):
    """Capture output from the given Logger.

    Temporarily replaces *logger*'s handlers with a single StreamHandler
    writing to an in-memory buffer, sets its level, and optionally overrides
    ``propagate``.  All prior state is restored on exit.

    Yields the ``io.StringIO`` buffer receiving the log output.
    """
    if isinstance(logger, str):
        logger = logging.getLogger(logger)
    saved_level = logger.level
    saved_handlers = logger.handlers[:]
    if propagate is not None:
        saved_propagate = logger.propagate
        logger.propagate = propagate
    buf = io.StringIO()
    logger.handlers[:] = [logging.StreamHandler(buf)]
    logger.setLevel(level)
    try:
        yield buf
    finally:
        # Restore handlers and level even if the body raised.
        logger.handlers[:] = saved_handlers
        logger.setLevel(saved_level)
        if propagate is not None:
            logger.propagate = saved_propagate
async def am_send(ep, frames):
    """Send *frames* over *ep* via the AM API: frame count first, then each frame."""
    await ep.am_send(np.array([len(frames)], dtype=np.uint64))
    # Send frames
    for frame in frames:
        await ep.am_send(frame)


async def am_recv(ep):
    """Receive a message produced by :func:`am_send` and reassemble it.

    Returns a tuple ``(frames, msg)`` where ``msg`` is the object rebuilt by
    ``distributed.comm.utils.from_frames``.  Skips the test if distributed
    is not installed.
    """
    pytest.importorskip("distributed")

    from distributed.comm.utils import from_frames

    try:
        # Recv meta data: the first AM message carries the frame count.
        nframes = (await ep.am_recv()).view(np.uint64)
    except (ucp.exceptions.UCXCanceled, ucp.exceptions.UCXCloseError) as e:
        msg = "SOMETHING TERRIBLE HAS HAPPENED IN THE TEST"
        # BUG FIX: `e` is an exception *instance* and is not callable; the old
        # `raise e(msg)` raised TypeError instead of the intended marker error.
        # Re-raise the same exception type, chaining the original.
        raise type(e)(msg) from e

    # Recv frames
    frames = []
    for _ in range(nframes[0]):
        frames.append(await ep.am_recv())

    msg = await from_frames(frames)
    return frames, msg
3 | 4 | """UCX-Py: Python bindings for UCX """ 5 | 6 | import logging 7 | import os 8 | 9 | logger = logging.getLogger("ucx") 10 | 11 | # Notice, if we have to update environment variables we need to do it 12 | # before importing UCX, which must happen also before the Cython code 13 | # import to prevent UCS unused variable warnings. 14 | if "UCX_MEMTYPE_CACHE" not in os.environ: 15 | # See 16 | logger.debug("Setting env UCX_MEMTYPE_CACHE=n, which is required by UCX") 17 | os.environ["UCX_MEMTYPE_CACHE"] = "n" 18 | 19 | 20 | # If libucx was installed as a wheel, we must request it to load the library symbols. 21 | # Otherwise, we assume that the library was installed in a system path that ld can find. 22 | try: 23 | import libucx 24 | except ImportError: 25 | pass 26 | else: 27 | libucx.load_library() 28 | del libucx 29 | 30 | 31 | from .core import * # noqa 32 | from .core import get_ucx_version # noqa 33 | from .utils import get_ucxpy_logger # noqa 34 | from ._libs.utils import get_address # noqa 35 | from ._version import __git_commit__, __version__ 36 | 37 | try: 38 | import pynvml 39 | except ImportError: 40 | pynvml = None 41 | 42 | _ucx_version = get_ucx_version() 43 | 44 | __ucx_min_version__ = "1.15.0" 45 | __ucx_version__ = "%d.%d.%d" % _ucx_version 46 | 47 | if _ucx_version < tuple(int(i) for i in __ucx_min_version__.split(".")): 48 | raise ImportError( 49 | f"Support for UCX {__ucx_version__} has ended. Please upgrade to " 50 | f"{__ucx_min_version__} or newer. If you believe the wrong version " 51 | "is being loaded, please check the path from where UCX is loaded " 52 | "by rerunning with the environment variable `UCX_LOG_LEVEL=debug`." 
53 | ) 54 | 55 | # Setup UCX-Py logger 56 | logger = get_ucxpy_logger() 57 | 58 | if "UCX_RNDV_THRESH" not in os.environ: 59 | logger.info("Setting UCX_RNDV_THRESH=8192") 60 | os.environ["UCX_RNDV_THRESH"] = "8192" 61 | 62 | if "UCX_RNDV_FRAG_MEM_TYPE" not in os.environ: 63 | logger.info("Setting UCX_RNDV_FRAG_MEM_TYPE=cuda") 64 | os.environ["UCX_RNDV_FRAG_MEM_TYPE"] = "cuda" 65 | 66 | if ( 67 | pynvml is not None 68 | and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ 69 | and _ucx_version >= (1, 12, 0) 70 | ): 71 | try: 72 | pynvml.nvmlInit() 73 | device_count = pynvml.nvmlDeviceGetCount() 74 | large_bar1 = [False] * device_count 75 | 76 | def _is_mig_device(handle): 77 | try: 78 | pynvml.nvmlDeviceGetMigMode(handle)[0] 79 | except pynvml.NVMLError: 80 | return False 81 | return True 82 | 83 | for dev_idx in range(device_count): 84 | handle = pynvml.nvmlDeviceGetHandleByIndex(dev_idx) 85 | 86 | try: 87 | total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total 88 | except pynvml.NVMLError_NotSupported: 89 | total_memory = None 90 | 91 | # Ignore MIG devices and devices with no memory resource (i.e., only 92 | # integrated CPU+GPU memory resource) and rely on UCX's default for 93 | # now. Increasing `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly 94 | # tested, as it's not yet clear whether it would be safe to set `1.0` 95 | # for those instances too. 96 | if _is_mig_device(handle) or total_memory is None: 97 | continue 98 | 99 | try: 100 | bar1_total = pynvml.nvmlDeviceGetBAR1MemoryInfo(handle).bar1Total 101 | except pynvml.NVMLError_NotSupported: 102 | # Bar1 access not supported on this device, set it to 103 | # zero (always lower than device memory). 
104 | bar1_total = 0 105 | 106 | total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total 107 | if total_memory <= bar1_total: 108 | large_bar1[dev_idx] = True 109 | 110 | if all(large_bar1): 111 | logger.info("Setting UCX_CUDA_COPY_MAX_REG_RATIO=1.0") 112 | os.environ["UCX_CUDA_COPY_MAX_REG_RATIO"] = "1.0" 113 | except ( 114 | pynvml.NVMLError_LibraryNotFound, 115 | pynvml.NVMLError_DriverNotLoaded, 116 | pynvml.NVMLError_Unknown, 117 | ): 118 | pass 119 | 120 | if "UCX_MAX_RNDV_RAILS" not in os.environ and _ucx_version >= (1, 12, 0): 121 | logger.info("Setting UCX_MAX_RNDV_RAILS=1") 122 | os.environ["UCX_MAX_RNDV_RAILS"] = "1" 123 | 124 | if "UCX_PROTO_ENABLE" not in os.environ and (1, 12, 0) <= _ucx_version < (1, 18, 0): 125 | # UCX protov2 still doesn't support CUDA async/managed memory 126 | logger.info("Setting UCX_PROTO_ENABLE=n") 127 | os.environ["UCX_PROTO_ENABLE"] = "n" 128 | -------------------------------------------------------------------------------- /ucp/_libs/__init__.pxd: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | # cython: language_level=3 5 | -------------------------------------------------------------------------------- /ucp/_libs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | from .utils import nvtx_annotate # noqa 5 | -------------------------------------------------------------------------------- /ucp/_libs/arr.pxd: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 
3 | 4 | # cython: language_level=3 5 | 6 | 7 | from libc.stdint cimport uintptr_t 8 | 9 | 10 | cdef class Array: 11 | cdef readonly uintptr_t ptr 12 | cdef readonly bint readonly 13 | cdef readonly object obj 14 | 15 | cdef readonly Py_ssize_t itemsize 16 | 17 | cdef readonly Py_ssize_t ndim 18 | cdef Py_ssize_t[::1] shape_mv 19 | cdef Py_ssize_t[::1] strides_mv 20 | 21 | cdef readonly bint cuda 22 | 23 | cpdef bint _c_contiguous(self) 24 | cpdef bint _f_contiguous(self) 25 | cpdef bint _contiguous(self) 26 | cpdef Py_ssize_t _nbytes(self) 27 | 28 | 29 | cpdef Array asarray(obj) 30 | -------------------------------------------------------------------------------- /ucp/_libs/arr.pyi: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | from typing import Generic, Tuple, TypeVar 5 | 6 | T = TypeVar("T") 7 | 8 | class Array(Generic[T]): 9 | def __init__(self, obj: T): ... 10 | @property 11 | def c_contiguous(self) -> bool: ... 12 | @property 13 | def f_contiguous(self) -> bool: ... 14 | @property 15 | def contiguous(self) -> bool: ... 16 | @property 17 | def nbytes(self) -> int: ... 18 | @property 19 | def shape(self) -> Tuple[int]: ... 20 | @property 21 | def strides(self) -> Tuple[int]: ... 22 | @property 23 | def cuda(self) -> bool: ... 24 | @property 25 | def obj(self) -> T: ... 26 | 27 | def asarray(obj) -> Array: ... 28 | -------------------------------------------------------------------------------- /ucp/_libs/exceptions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 
import contextlib
import logging

logger = logging.getLogger("ucx")


@contextlib.contextmanager
def log_errors(reraise_exception=False):
    """Log any exception escaping the body on the "ucx" logger.

    The exception is swallowed after logging unless ``reraise_exception``
    is True, in which case it propagates to the caller.
    """
    try:
        yield
    except BaseException as e:
        logger.exception(e)
        if reraise_exception:
            raise


class UCXBaseException(Exception):
    """Root of the UCX-Py exception hierarchy."""


class UCXError(UCXBaseException):
    """Generic UCX error."""


class UCXConfigError(UCXError):
    """Invalid or rejected UCX configuration."""


class UCXWarning(UserWarning):
    """Warning category for UCX-related diagnostics."""


class UCXCloseError(UCXBaseException):
    """Error raised while closing an endpoint or listener."""


class UCXCanceled(UCXBaseException):
    """Operation was canceled before completion."""


class UCXConnectionReset(UCXBaseException):
    """Remote peer reset the connection."""


class UCXMsgTruncated(UCXBaseException):
    """Received message was larger than the posted buffer."""


class UCXNotConnected(UCXBaseException):
    """Operation attempted on an endpoint that is not connected."""


class UCXUnreachable(UCXBaseException):
    """Remote address could not be reached."""
Users should not instance this class directly and 17 | should use the from_buffer() and from_mem_handle() class methods or the 18 | pack_rkey() method on the UCXMemoryHandle class 19 | """ 20 | cdef void *_key 21 | cdef Py_ssize_t _length 22 | 23 | def __cinit__(self, uintptr_t packed_key_as_int, Py_ssize_t length): 24 | key = packed_key_as_int 25 | self._key = malloc(length) 26 | self._length = length 27 | memcpy(self._key, key, length) 28 | 29 | @classmethod 30 | def from_buffer(cls, buffer): 31 | """ Wrap a received buffer in a PackedRemoteKey to turn magic buffers into a 32 | python class suitable for unpacking on an EP 33 | 34 | Parameters 35 | ---------- 36 | buffer: 37 | Python buffer to be wrapped 38 | """ 39 | buf = Array(buffer) 40 | assert buf.c_contiguous 41 | return PackedRemoteKey(buf.ptr, buf.nbytes) 42 | 43 | @classmethod 44 | def from_mem_handle(self, UCXMemoryHandle mem): 45 | """ Create a new packed remote key from a given UCXMemoryHandle class 46 | 47 | Parameters 48 | ---------- 49 | mem: UCXMemoryHandle 50 | The memory handle to be packed in an rkey for sending 51 | """ 52 | cdef void *key 53 | cdef size_t len 54 | cdef ucs_status_t status 55 | status = ucp_rkey_pack(mem._context._handle, mem._mem_handle, &key, &len) 56 | packed_key = PackedRemoteKey(key, len) 57 | ucp_rkey_buffer_release(key) 58 | assert_ucs_status(status) 59 | return packed_key 60 | 61 | def __dealloc__(self): 62 | free(self._key) 63 | 64 | @property 65 | def key(self): 66 | return int(self._key) 67 | 68 | @property 69 | def length(self): 70 | return int(self._length) 71 | 72 | def __getbuffer__(self, Py_buffer *buffer, int flags): 73 | get_ucx_object(buffer, flags, self._key, self._length, self) 74 | 75 | def __releasebuffer__(self, Py_buffer *buffer): 76 | pass 77 | 78 | def __reduce__(self): 79 | return (PackedRemoteKey.from_buffer, (bytes(self),)) 80 | 81 | def __hash__(self): 82 | return hash(bytes(self)) 83 | 
-------------------------------------------------------------------------------- /ucp/_libs/src/c_util.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 3 | * See file LICENSE for terms. 4 | */ 5 | 6 | #include "c_util.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | int c_util_set_sockaddr(ucs_sock_addr_t *sockaddr, const char *ip_address, uint16_t port) { 15 | struct sockaddr_in *addr = malloc(sizeof(struct sockaddr_in)); 16 | if(addr == NULL) { 17 | return 1; 18 | } 19 | memset(addr, 0, sizeof(struct sockaddr_in)); 20 | addr->sin_family = AF_INET; 21 | addr->sin_addr.s_addr = ip_address==NULL ? INADDR_ANY : inet_addr(ip_address); 22 | addr->sin_port = htons(port); 23 | sockaddr->addr = (const struct sockaddr *) addr; 24 | sockaddr->addrlen = sizeof(struct sockaddr_in); 25 | return 0; 26 | } 27 | 28 | 29 | void c_util_sockaddr_free(ucs_sock_addr_t *sockaddr) { 30 | free((void*) sockaddr->addr); 31 | } 32 | 33 | void c_util_sockaddr_get_ip_port_str(const struct sockaddr_storage *sock_addr, 34 | char *ip_str, char *port_str, 35 | size_t max_str_size) 36 | { 37 | struct sockaddr_in addr_in; 38 | struct sockaddr_in6 addr_in6; 39 | 40 | switch (sock_addr->ss_family) { 41 | case AF_INET: 42 | memcpy(&addr_in, sock_addr, sizeof(struct sockaddr_in)); 43 | inet_ntop(AF_INET, &addr_in.sin_addr, ip_str, max_str_size); 44 | snprintf(port_str, max_str_size, "%d", ntohs(addr_in.sin_port)); 45 | case AF_INET6: 46 | memcpy(&addr_in6, sock_addr, sizeof(struct sockaddr_in6)); 47 | inet_ntop(AF_INET6, &addr_in6.sin6_addr, ip_str, max_str_size); 48 | snprintf(port_str, max_str_size, "%d", ntohs(addr_in6.sin6_port)); 49 | default: 50 | ip_str = "Invalid address family"; 51 | port_str = "Invalid address family"; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /ucp/_libs/src/c_util.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. 3 | * See file LICENSE for terms. 4 | */ 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | int c_util_set_sockaddr(ucs_sock_addr_t *sockaddr, const char *ip_address, uint16_t port); 11 | 12 | void c_util_sockaddr_free(ucs_sock_addr_t *sockaddr); 13 | 14 | void c_util_sockaddr_get_ip_port_str( 15 | const struct sockaddr_storage *sock_addr, 16 | char *ip_str, char *port_str, 17 | size_t max_str_size 18 | ); 19 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_address_object.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import pickle 3 | 4 | from ucp._libs import ucx_api 5 | 6 | mp = mp.get_context("spawn") 7 | 8 | 9 | def test_pickle_ucx_address(): 10 | ctx = ucx_api.UCXContext() 11 | worker = ucx_api.UCXWorker(ctx) 12 | org_address = worker.get_address() 13 | dumped_address = pickle.dumps(org_address) 14 | org_address_hash = hash(org_address) 15 | org_address = bytes(org_address) 16 | new_address = pickle.loads(dumped_address) 17 | assert org_address_hash == hash(new_address) 18 | assert bytes(org_address) == bytes(new_address) 19 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_cancel.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import re 3 | 4 | import pytest 5 | 6 | from ucp._libs import ucx_api 7 | from ucp._libs.arr import Array 8 | from ucp._libs.utils import get_address 9 | from ucp.exceptions import UCXCanceled 10 | 11 | mp = mp.get_context("spawn") 12 | 13 | WireupMessage = bytearray(b"wireup") 14 | DataMessage = bytearray(b"0" * 10) 15 | 16 | 17 | def _handler(request, exception, ret): 18 | if exception is not None: 19 | ret[0] = 
exception 20 | else: 21 | ret[0] = request 22 | 23 | 24 | def _server_cancel(queue, transfer_api): 25 | """Server that establishes an endpoint to client and immediately closes 26 | it, triggering received messages to be canceled on the client. 27 | """ 28 | feature_flags = ( 29 | ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, 30 | ) 31 | ctx = ucx_api.UCXContext(feature_flags=feature_flags) 32 | worker = ucx_api.UCXWorker(ctx) 33 | 34 | # Keep endpoint to be used from outside the listener callback 35 | ep = [None] 36 | 37 | def _listener_handler(conn_request): 38 | ep[0] = ucx_api.UCXEndpoint.create_from_conn_request( 39 | worker, 40 | conn_request, 41 | endpoint_error_handling=True, 42 | ) 43 | 44 | listener = ucx_api.UCXListener(worker=worker, port=0, cb_func=_listener_handler) 45 | queue.put(listener.port) 46 | 47 | while ep[0] is None: 48 | worker.progress() 49 | 50 | ep[0].close() 51 | worker.progress() 52 | 53 | 54 | def _client_cancel(queue, transfer_api): 55 | """Client that connects to server and waits for messages to be received, 56 | because the server closes without sending anything, the messages will 57 | trigger cancelation. 
58 | """ 59 | feature_flags = ( 60 | ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, 61 | ) 62 | ctx = ucx_api.UCXContext(feature_flags=feature_flags) 63 | worker = ucx_api.UCXWorker(ctx) 64 | port = queue.get() 65 | ep = ucx_api.UCXEndpoint.create( 66 | worker, 67 | get_address(), 68 | port, 69 | endpoint_error_handling=True, 70 | ) 71 | 72 | ret = [None] 73 | 74 | if transfer_api == "am": 75 | ucx_api.am_recv_nb(ep, cb_func=_handler, cb_args=(ret,)) 76 | 77 | match_msg = ".*am_recv.*" 78 | else: 79 | msg = Array(bytearray(1)) 80 | ucx_api.tag_recv_nb( 81 | worker, msg, msg.nbytes, tag=0, cb_func=_handler, cb_args=(ret,), ep=ep 82 | ) 83 | 84 | match_msg = ".*tag_recv_nb.*" 85 | 86 | while ep.is_alive(): 87 | worker.progress() 88 | 89 | canceled = worker.cancel_inflight_messages() 90 | 91 | while ret[0] is None: 92 | worker.progress() 93 | 94 | assert canceled == 1 95 | assert isinstance(ret[0], UCXCanceled) 96 | assert re.match(match_msg, ret[0].args[0]) 97 | 98 | 99 | @pytest.mark.parametrize("transfer_api", ["am", "tag"]) 100 | def test_message_probe(transfer_api): 101 | queue = mp.Queue() 102 | server = mp.Process( 103 | target=_server_cancel, 104 | args=(queue, transfer_api), 105 | ) 106 | server.start() 107 | client = mp.Process( 108 | target=_client_cancel, 109 | args=(queue, transfer_api), 110 | ) 111 | client.start() 112 | client.join(timeout=10) 113 | server.join(timeout=10) 114 | assert client.exitcode == 0 115 | assert server.exitcode == 0 116 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from ucp._libs import ucx_api 6 | from ucp._libs.arr import Array 7 | from ucp._libs.exceptions import UCXConfigError 8 | 9 | 10 | def test_get_config(): 11 | # Cache user-defined UCX_TLS and unset it to test default value 12 | tls = 
os.environ.get("UCX_TLS", None)
    if tls is not None:
        del os.environ["UCX_TLS"]

    ctx = ucx_api.UCXContext()
    config = ctx.get_config()
    assert isinstance(config, dict)
    assert config["TLS"] == "all"

    # Restore user-defined UCX_TLS
    # NOTE(review): if an assert above fails, UCX_TLS is not restored for
    # later tests in the same process -- consider try/finally.
    if tls is not None:
        os.environ["UCX_TLS"] = tls


def test_set_env():
    # UCX picks up UCX_-prefixed environment variables at context creation.
    os.environ["UCX_SEG_SIZE"] = "2M"
    ctx = ucx_api.UCXContext()
    config = ctx.get_config()
    assert config["SEG_SIZE"] == os.environ["UCX_SEG_SIZE"]


def test_init_options():
    # Explicit constructor options take precedence over the environment.
    os.environ["UCX_SEG_SIZE"] = "2M"  # Should be ignored
    options = {"SEG_SIZE": "3M"}
    ctx = ucx_api.UCXContext(options)
    config = ctx.get_config()
    assert config["SEG_SIZE"] == options["SEG_SIZE"]


@pytest.mark.skipif(
    ucx_api.get_ucx_version() >= (1, 12, 0),
    reason="Beginning with UCX >= 1.12, it's only possible to validate "
    "UCP options but not options from other modules such as UCT. "
    "See https://github.com/openucx/ucx/issues/7519.",
)
def test_init_unknown_option():
    # An unrecognized option name must be rejected at context creation.
    options = {"UNKNOWN_OPTION": "3M"}
    with pytest.raises(UCXConfigError):
        ucx_api.UCXContext(options)


def test_init_invalid_option():
    # A recognized option with an unparsable value must be rejected.
    options = {"SEG_SIZE": "invalid-size"}
    with pytest.raises(UCXConfigError):
        ucx_api.UCXContext(options)


@pytest.mark.parametrize(
    "feature_flag", [ucx_api.Feature.TAG, ucx_api.Feature.STREAM, ucx_api.Feature.AM]
)
def test_feature_flags_mismatch(feature_flag):
    # Each transfer API must raise ValueError when its UCP feature was not
    # enabled on the context.
    ctx = ucx_api.UCXContext(feature_flags=(feature_flag,))
    worker = ucx_api.UCXWorker(ctx)
    addr = worker.get_address()
    ep = ucx_api.UCXEndpoint.create_from_worker_address(
        worker, addr, endpoint_error_handling=False
    )
    msg = Array(bytearray(10))
    if feature_flag != ucx_api.Feature.TAG:
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.TAG`"
        ):
            ucx_api.tag_send_nb(ep, msg, msg.nbytes, 0, None)
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.TAG`"
        ):
            ucx_api.tag_recv_nb(worker, msg, msg.nbytes, 0, None)
    if feature_flag != ucx_api.Feature.STREAM:
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.STREAM`"
        ):
            ucx_api.stream_send_nb(ep, msg, msg.nbytes, None)
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.STREAM`"
        ):
            ucx_api.stream_recv_nb(ep, msg, msg.nbytes, None)
    if feature_flag != ucx_api.Feature.AM:
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.AM`"
        ):
            ucx_api.am_send_nbx(ep, msg, msg.nbytes, None)
        with pytest.raises(
            ValueError, match="UCXContext must be created with `Feature.AM`"
        ):
            ucx_api.am_recv_nb(ep, None)
-------------------------------------------------------------------------------- /ucp/_libs/tests/test_endpoint.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import multiprocessing as mp 3 | 4 | import pytest 5 | 6 | from ucp._libs import ucx_api 7 | from ucp._libs.utils import get_address 8 | 9 | mp = mp.get_context("spawn") 10 | 11 | 12 | def _close_callback(closed): 13 | closed[0] = True 14 | 15 | 16 | def _server(queue, server_close_callback): 17 | """Server that send received message back to the client 18 | 19 | Notice, since it is illegal to call progress() in call-back functions, 20 | we use a "chain" of call-back functions. 21 | """ 22 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) 23 | worker = ucx_api.UCXWorker(ctx) 24 | 25 | listener_finished = [False] 26 | closed = [False] 27 | 28 | # A reference to listener's endpoint is stored to prevent it from going 29 | # out of scope too early. 30 | # ep = None 31 | 32 | def _listener_handler(conn_request): 33 | global ep 34 | ep = ucx_api.UCXEndpoint.create_from_conn_request( 35 | worker, 36 | conn_request, 37 | endpoint_error_handling=True, 38 | ) 39 | if server_close_callback is True: 40 | ep.set_close_callback(functools.partial(_close_callback, closed)) 41 | listener_finished[0] = True 42 | 43 | listener = ucx_api.UCXListener(worker=worker, port=0, cb_func=_listener_handler) 44 | queue.put(listener.port) 45 | 46 | if server_close_callback is True: 47 | while closed[0] is False: 48 | worker.progress() 49 | assert closed[0] is True 50 | else: 51 | while listener_finished[0] is False: 52 | worker.progress() 53 | 54 | 55 | def _client(port, server_close_callback): 56 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) 57 | worker = ucx_api.UCXWorker(ctx) 58 | ep = ucx_api.UCXEndpoint.create( 59 | worker, 60 | get_address(), 61 | port, 62 | endpoint_error_handling=True, 63 | ) 64 | if server_close_callback is True: 65 | 
ep.close() 66 | worker.progress() 67 | else: 68 | closed = [False] 69 | ep.set_close_callback(functools.partial(_close_callback, closed)) 70 | while closed[0] is False: 71 | worker.progress() 72 | 73 | 74 | @pytest.mark.parametrize("server_close_callback", [True, False]) 75 | def test_close_callback(server_close_callback): 76 | queue = mp.Queue() 77 | server = mp.Process( 78 | target=_server, 79 | args=(queue, server_close_callback), 80 | ) 81 | server.start() 82 | port = queue.get() 83 | client = mp.Process( 84 | target=_client, 85 | args=(port, server_close_callback), 86 | ) 87 | client.start() 88 | client.join(timeout=10) 89 | server.join(timeout=10) 90 | assert client.exitcode == 0 91 | assert server.exitcode == 0 92 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_listener.py: -------------------------------------------------------------------------------- 1 | from ucp._libs import ucx_api 2 | 3 | 4 | def test_listener_ip_port(): 5 | ctx = ucx_api.UCXContext() 6 | worker = ucx_api.UCXWorker(ctx) 7 | 8 | def _listener_handler(conn_request): 9 | pass 10 | 11 | listener = ucx_api.UCXListener(worker=worker, port=0, cb_func=_listener_handler) 12 | 13 | assert isinstance(listener.ip, str) and listener.ip 14 | assert ( 15 | isinstance(listener.port, int) and listener.port >= 0 and listener.port <= 65535 16 | ) 17 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_mem.py: -------------------------------------------------------------------------------- 1 | import array 2 | import io 3 | import mmap 4 | 5 | import pytest 6 | 7 | from ucp._libs import ucx_api 8 | 9 | builtin_buffers = [ 10 | b"", 11 | b"abcd", 12 | array.array("i", []), 13 | array.array("i", [0, 1, 2, 3]), 14 | array.array("I", [0, 1, 2, 3]), 15 | array.array("f", []), 16 | array.array("f", [0, 1, 2, 3]), 17 | array.array("d", [0, 1, 2, 3]), 18 | memoryview(array.array("B", [0, 1, 2, 3, 4, 
5])).cast("B", (3, 2)), 19 | memoryview(b"abcd"), 20 | memoryview(bytearray(b"abcd")), 21 | io.BytesIO(b"abcd").getbuffer(), 22 | mmap.mmap(-1, 5), 23 | ] 24 | 25 | 26 | def test_alloc(): 27 | ctx = ucx_api.UCXContext({}) 28 | mem = ucx_api.UCXMemoryHandle.alloc(ctx, 1024) 29 | rkey = mem.pack_rkey() 30 | assert rkey is not None 31 | 32 | 33 | @pytest.mark.parametrize("buffer", builtin_buffers) 34 | def test_map(buffer): 35 | ctx = ucx_api.UCXContext({}) 36 | mem = ucx_api.UCXMemoryHandle.map(ctx, buffer) 37 | rkey = mem.pack_rkey() 38 | assert rkey is not None 39 | 40 | 41 | def test_ctx_alloc(): 42 | ctx = ucx_api.UCXContext({}) 43 | mem = ctx.alloc(1024) 44 | rkey = mem.pack_rkey() 45 | assert rkey is not None 46 | 47 | 48 | @pytest.mark.parametrize("buffer", builtin_buffers) 49 | def test_ctx_map(buffer): 50 | ctx = ucx_api.UCXContext({}) 51 | mem = ctx.map(buffer) 52 | rkey = mem.pack_rkey() 53 | assert rkey is not None 54 | 55 | 56 | def test_rkey_unpack(): 57 | ctx = ucx_api.UCXContext({}) 58 | mem = ucx_api.UCXMemoryHandle.alloc(ctx, 1024) 59 | packed_rkey = mem.pack_rkey() 60 | worker = ucx_api.UCXWorker(ctx) 61 | ep = ucx_api.UCXEndpoint.create_from_worker_address( 62 | worker, 63 | worker.get_address(), 64 | endpoint_error_handling=True, 65 | ) 66 | rkey = ep.unpack_rkey(packed_rkey) 67 | assert rkey is not None 68 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_peer_send_recv.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import os 3 | from itertools import repeat 4 | 5 | import pytest 6 | 7 | from ucp._libs import ucx_api 8 | from ucp._libs.utils_test import blocking_flush, blocking_recv, blocking_send 9 | 10 | mp = mp.get_context("spawn") 11 | 12 | 13 | def _rma_setup(worker, address, prkey, base, msg_size): 14 | ep = ucx_api.UCXEndpoint.create_from_worker_address( 15 | worker, address, endpoint_error_handling=True 16 | ) 
17 | rkey = ep.unpack_rkey(prkey) 18 | mem = ucx_api.RemoteMemory(rkey, base, msg_size) 19 | return ep, mem 20 | 21 | 22 | def _test_peer_communication_rma(queue, rank, msg_size): 23 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.RMA, ucx_api.Feature.TAG)) 24 | worker = ucx_api.UCXWorker(ctx) 25 | self_address = worker.get_address() 26 | mem_handle = ctx.alloc(msg_size) 27 | self_base = mem_handle.address 28 | self_prkey = mem_handle.pack_rkey() 29 | 30 | self_ep, self_mem = _rma_setup( 31 | worker, self_address, self_prkey, self_base, msg_size 32 | ) 33 | send_msg = bytes(repeat(rank, msg_size)) 34 | if not self_mem.put_nbi(send_msg): 35 | blocking_flush(self_ep) 36 | 37 | queue.put((rank, self_address, self_prkey, self_base)) 38 | right_rank, right_address, right_prkey, right_base = queue.get() 39 | left_rank, left_address, left_prkey, left_base = queue.get() 40 | 41 | right_ep, right_mem = _rma_setup( 42 | worker, right_address, right_prkey, right_base, msg_size 43 | ) 44 | right_msg = bytearray(msg_size) 45 | right_mem.get_nbi(right_msg) 46 | 47 | left_ep, left_mem = _rma_setup( 48 | worker, left_address, left_prkey, left_base, msg_size 49 | ) 50 | left_msg = bytearray(msg_size) 51 | left_mem.get_nbi(left_msg) 52 | 53 | blocking_flush(worker) 54 | assert left_msg == bytes(repeat(left_rank, msg_size)) 55 | assert right_msg == bytes(repeat(right_rank, msg_size)) 56 | 57 | # We use the blocking tag send/recv as a barrier implementation 58 | recv_msg = bytearray(8) 59 | if rank == 0: 60 | send_msg = bytes(os.urandom(8)) 61 | blocking_send(worker, right_ep, send_msg, right_rank) 62 | blocking_recv(worker, left_ep, recv_msg, rank) 63 | else: 64 | blocking_recv(worker, left_ep, recv_msg, rank) 65 | blocking_send(worker, right_ep, recv_msg, right_rank) 66 | 67 | 68 | def _test_peer_communication_tag(queue, rank, msg_size): 69 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) 70 | worker = ucx_api.UCXWorker(ctx) 71 | queue.put((rank, 
worker.get_address())) 72 | right_rank, right_address = queue.get() 73 | left_rank, left_address = queue.get() 74 | 75 | right_ep = ucx_api.UCXEndpoint.create_from_worker_address( 76 | worker, 77 | right_address, 78 | endpoint_error_handling=True, 79 | ) 80 | left_ep = ucx_api.UCXEndpoint.create_from_worker_address( 81 | worker, 82 | left_address, 83 | endpoint_error_handling=True, 84 | ) 85 | recv_msg = bytearray(msg_size) 86 | if rank == 0: 87 | send_msg = bytes(os.urandom(msg_size)) 88 | blocking_send(worker, right_ep, send_msg, right_rank) 89 | blocking_recv(worker, left_ep, recv_msg, rank) 90 | assert send_msg == recv_msg 91 | else: 92 | blocking_recv(worker, left_ep, recv_msg, rank) 93 | blocking_send(worker, right_ep, recv_msg, right_rank) 94 | 95 | 96 | @pytest.mark.parametrize( 97 | "test_name", [_test_peer_communication_tag, _test_peer_communication_rma] 98 | ) 99 | @pytest.mark.parametrize("msg_size", [10, 2**24]) 100 | def test_peer_communication(test_name, msg_size, num_nodes=2): 101 | """Test peer communication by sending a message between each worker""" 102 | queues = [mp.Queue() for _ in range(num_nodes)] 103 | ps = [] 104 | addresses = [] 105 | for rank, queue in enumerate(queues): 106 | p = mp.Process(target=test_name, args=(queue, rank, msg_size)) 107 | p.start() 108 | ps.append(p) 109 | addresses.append(queue.get()) 110 | 111 | for i in range(num_nodes): 112 | queues[i].put(addresses[(i + 1) % num_nodes]) # Right peer 113 | queues[i].put(addresses[(i - 1) % num_nodes]) # Left peer 114 | 115 | for p in ps: 116 | p.join() 117 | assert not p.exitcode 118 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_probe.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | import pytest 4 | 5 | from ucp._libs import ucx_api 6 | from ucp._libs.utils import get_address 7 | from ucp._libs.utils_test import ( 8 | blocking_am_recv, 9 | 
blocking_am_send, 10 | blocking_recv, 11 | blocking_send, 12 | ) 13 | 14 | mp = mp.get_context("spawn") 15 | 16 | WireupMessage = bytearray(b"wireup") 17 | DataMessage = bytearray(b"0" * 10) 18 | 19 | 20 | def _server_probe(queue, transfer_api): 21 | """Server that probes and receives message after client disconnected. 22 | 23 | Note that since it is illegal to call progress() in callback functions, 24 | we keep a reference to the endpoint after the listener callback has 25 | terminated, this way we can progress even after Python blocking calls. 26 | """ 27 | feature_flags = ( 28 | ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, 29 | ) 30 | ctx = ucx_api.UCXContext(feature_flags=feature_flags) 31 | worker = ucx_api.UCXWorker(ctx) 32 | 33 | # Keep endpoint to be used from outside the listener callback 34 | ep = [None] 35 | 36 | def _listener_handler(conn_request): 37 | ep[0] = ucx_api.UCXEndpoint.create_from_conn_request( 38 | worker, 39 | conn_request, 40 | endpoint_error_handling=True, 41 | ) 42 | 43 | listener = ucx_api.UCXListener(worker=worker, port=0, cb_func=_listener_handler) 44 | queue.put(listener.port), 45 | 46 | while ep[0] is None: 47 | worker.progress() 48 | 49 | ep = ep[0] 50 | 51 | # Ensure wireup and inform client before it can disconnect 52 | if transfer_api == "am": 53 | wireup = blocking_am_recv(worker, ep) 54 | else: 55 | wireup = bytearray(len(WireupMessage)) 56 | blocking_recv(worker, ep, wireup) 57 | queue.put("wireup completed") 58 | 59 | # Ensure client has disconnected -- endpoint is not alive anymore 60 | while ep.is_alive() is True: 61 | worker.progress() 62 | 63 | # Probe/receive message even after the remote endpoint has disconnected 64 | if transfer_api == "am": 65 | while ep.am_probe() is False: 66 | worker.progress() 67 | received = blocking_am_recv(worker, ep) 68 | else: 69 | while worker.tag_probe(0) is False: 70 | worker.progress() 71 | received = bytearray(len(DataMessage)) 72 | blocking_recv(worker, ep, 
received) 73 | 74 | assert wireup == WireupMessage 75 | assert received == DataMessage 76 | 77 | 78 | def _client_probe(queue, transfer_api): 79 | feature_flags = ( 80 | ucx_api.Feature.AM if transfer_api == "am" else ucx_api.Feature.TAG, 81 | ) 82 | ctx = ucx_api.UCXContext(feature_flags=feature_flags) 83 | worker = ucx_api.UCXWorker(ctx) 84 | port = queue.get() 85 | ep = ucx_api.UCXEndpoint.create( 86 | worker, 87 | get_address(), 88 | port, 89 | endpoint_error_handling=True, 90 | ) 91 | 92 | _send = blocking_am_send if transfer_api == "am" else blocking_send 93 | 94 | _send(worker, ep, WireupMessage) 95 | _send(worker, ep, DataMessage) 96 | 97 | # Wait for wireup before disconnecting 98 | assert queue.get() == "wireup completed" 99 | 100 | 101 | @pytest.mark.parametrize("transfer_api", ["am", "tag"]) 102 | def test_message_probe(transfer_api): 103 | queue = mp.Queue() 104 | server = mp.Process( 105 | target=_server_probe, 106 | args=(queue, transfer_api), 107 | ) 108 | server.start() 109 | client = mp.Process( 110 | target=_client_probe, 111 | args=(queue, transfer_api), 112 | ) 113 | client.start() 114 | client.join(timeout=10) 115 | server.join(timeout=10) 116 | assert client.exitcode == 0 117 | assert server.exitcode == 0 118 | -------------------------------------------------------------------------------- /ucp/_libs/tests/test_server_client.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import os 3 | from queue import Empty as QueueIsEmpty 4 | 5 | import pytest 6 | 7 | from ucp._libs import ucx_api 8 | from ucp._libs.arr import Array 9 | from ucp._libs.utils import get_address 10 | from ucp._libs.utils_test import blocking_recv, blocking_send 11 | 12 | mp = mp.get_context("spawn") 13 | 14 | 15 | def _echo_server(get_queue, put_queue, msg_size): 16 | """Server that send received message back to the client 17 | 18 | Notice, since it is illegal to call progress() in call-back functions, 
19 | we use a "chain" of call-back functions. 20 | """ 21 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) 22 | worker = ucx_api.UCXWorker(ctx) 23 | 24 | # A reference to listener's endpoint is stored to prevent it from going 25 | # out of scope too early. 26 | ep = None 27 | 28 | def _send_handle(request, exception, msg): 29 | # Notice, we pass `msg` to the handler in order to make sure 30 | # it doesn't go out of scope prematurely. 31 | assert exception is None 32 | 33 | def _recv_handle(request, exception, ep, msg): 34 | assert exception is None 35 | ucx_api.tag_send_nb( 36 | ep, msg, msg.nbytes, tag=0, cb_func=_send_handle, cb_args=(msg,) 37 | ) 38 | 39 | def _listener_handler(conn_request): 40 | global ep 41 | ep = ucx_api.UCXEndpoint.create_from_conn_request( 42 | worker, 43 | conn_request, 44 | endpoint_error_handling=True, 45 | ) 46 | msg = Array(bytearray(msg_size)) 47 | ucx_api.tag_recv_nb( 48 | worker, msg, msg.nbytes, tag=0, cb_func=_recv_handle, cb_args=(ep, msg) 49 | ) 50 | 51 | listener = ucx_api.UCXListener(worker=worker, port=0, cb_func=_listener_handler) 52 | put_queue.put(listener.port) 53 | 54 | while True: 55 | worker.progress() 56 | try: 57 | get_queue.get(block=False, timeout=0.1) 58 | except QueueIsEmpty: 59 | continue 60 | else: 61 | break 62 | 63 | 64 | def _echo_client(msg_size, port): 65 | ctx = ucx_api.UCXContext(feature_flags=(ucx_api.Feature.TAG,)) 66 | worker = ucx_api.UCXWorker(ctx) 67 | ep = ucx_api.UCXEndpoint.create( 68 | worker, 69 | get_address(), 70 | port, 71 | endpoint_error_handling=True, 72 | ) 73 | send_msg = bytes(os.urandom(msg_size)) 74 | recv_msg = bytearray(msg_size) 75 | blocking_send(worker, ep, send_msg) 76 | blocking_recv(worker, ep, recv_msg) 77 | assert send_msg == recv_msg 78 | 79 | 80 | @pytest.mark.parametrize("msg_size", [10, 2**24]) 81 | def test_server_client(msg_size): 82 | put_queue, get_queue = mp.Queue(), mp.Queue() 83 | server = mp.Process( 84 | target=_echo_server, 85 | 
args=(put_queue, get_queue, msg_size), 86 | ) 87 | server.start() 88 | port = get_queue.get() 89 | client = mp.Process(target=_echo_client, args=(msg_size, port)) 90 | client.start() 91 | client.join(timeout=10) 92 | assert not client.exitcode 93 | put_queue.put("Finished") 94 | server.join(timeout=10) 95 | assert not server.exitcode 96 | -------------------------------------------------------------------------------- /ucp/_libs/transfer_common.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # Copyright (c) 2020 UT-Battelle, LLC. All rights reserved. 3 | # See file LICENSE for terms. 4 | 5 | # cython: language_level=3 6 | 7 | from libc.stdint cimport uintptr_t 8 | 9 | from .exceptions import UCXCanceled, UCXError, log_errors 10 | from .ucx_api_dep cimport * 11 | 12 | 13 | # This callback function is currently needed by stream_send_nb and 14 | # tag_send_nb transfer functions, as well as UCXEndpoint and UCXWorker 15 | # flush methods. 
cdef void _send_callback(void *request, ucs_status_t status) with gil:
    # Completion callback for non-blocking sends: records completion on the
    # request, translates the UCS status into a Python exception, removes
    # the request from the in-flight set, and invokes the user callback.
    cdef UCXRequest req
    cdef dict req_info
    cdef str name, ucx_status_msg, msg
    cdef set inflight_msgs
    cdef tuple cb_args
    cdef dict cb_kwargs
    with log_errors():
        # NOTE(review): a Cython cast (e.g. `<uintptr_t>`) appears to have
        # been lost from this call during text extraction -- confirm
        # against upstream before building.
        req = UCXRequest( request)
        assert not req.closed()
        req_info = req._handle.info
        req_info["status"] = "finished"

        if "cb_func" not in req_info:
            # This callback function was called before ucp_tag_send_nb() returned
            return

        # Map UCS status to the exception the user callback will receive.
        exception = None
        if status == UCS_ERR_CANCELED:
            name = req_info["name"]
            msg = "<%s>: " % name
            exception = UCXCanceled(msg)
        elif status != UCS_OK:
            name = req_info["name"]
            ucx_status_msg = ucs_status_string(status).decode("utf-8")
            msg = "<%s>: %s" % (name, ucx_status_msg)
            exception = UCXError(msg)
        try:
            # Drop this request from the worker's in-flight tracking set.
            inflight_msgs = req_info["inflight_msgs"]
            inflight_msgs.discard(req)
            cb_func = req_info["cb_func"]
            if cb_func is not None:
                cb_args = req_info["cb_args"]
                if cb_args is None:
                    cb_args = ()
                cb_kwargs = req_info["cb_kwargs"]
                if cb_kwargs is None:
                    cb_kwargs = {}
                cb_func(req, exception, *cb_args, **cb_kwargs)
        finally:
            # Always release the request, even if the user callback raised.
            req.close()
-------------------------------------------------------------------------------- /ucp/_libs/typedefs.pyx: --------------------------------------------------------------------------------
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
# See file LICENSE for terms.
# cython: language_level=3

import enum

from cpython.ref cimport PyObject

from .ucx_api_dep cimport *


class Feature(enum.Enum):
    """Enum of the UCP_FEATURE_* constants"""
    TAG = UCP_FEATURE_TAG
    RMA = UCP_FEATURE_RMA
    AMO32 = UCP_FEATURE_AMO32
    AMO64 = UCP_FEATURE_AMO64
    WAKEUP = UCP_FEATURE_WAKEUP
    STREAM = UCP_FEATURE_STREAM
    AM = UCP_FEATURE_AM


class AllocatorType(enum.Enum):
    # Kind of memory an active-message allocator provides.
    HOST = 0
    CUDA = 1
    UNSUPPORTED = -1


# Struct used as requests by UCX
cdef struct ucx_py_request:
    bint finished  # Used by downstream projects such as cuML
    unsigned int uid
    PyObject *info
-------------------------------------------------------------------------------- /ucp/_libs/ucx_address.pyx: --------------------------------------------------------------------------------
# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2021, UT-Battelle, LLC. All rights reserved.
# See file LICENSE for terms.
# cython: language_level=3

from libc.stdint cimport uintptr_t
from libc.stdlib cimport free
from libc.string cimport memcpy

from .arr cimport Array
from .ucx_api_dep cimport *


def _ucx_address_finalizer(
    uintptr_t handle_as_int,
    uintptr_t worker_handle_as_int,
):
    # NOTE(review): the extracted source lost angle-bracket Cython casts
    # throughout this file; restored here -- confirm against upstream.
    cdef ucp_address_t *address = <ucp_address_t *>handle_as_int
    cdef ucp_worker_h worker = <ucp_worker_h>worker_handle_as_int
    if worker_handle_as_int != 0:
        ucp_worker_release_address(worker, address)
    else:
        free(address)


cdef class UCXAddress(UCXObject):
    """Python representation of ucp_address_t"""
    cdef ucp_address_t *_address
    cdef size_t _length

    def __cinit__(
        self,
        uintptr_t address_as_int,
        size_t length,
        UCXWorker worker=None,
    ):
        cdef ucp_address_t *address = <ucp_address_t *>address_as_int
        # Copy address to `self._address`
        self._address = <ucp_address_t *>malloc(length)
        self._length = length
        memcpy(self._address, address, length)

        # NOTE(review): the finalizer is registered on the malloc'd *copy*,
        # yet when `worker` is given it releases via
        # ucp_worker_release_address; the worker-owned original address is
        # never explicitly released here -- verify ownership against the
        # callers/upstream.
        self.add_handle_finalizer(
            _ucx_address_finalizer,
            int(<uintptr_t>self._address),
            0 if worker is None else worker.handle,
        )
        if worker is not None:
            worker.add_child(self)

    @classmethod
    def from_buffer(cls, buffer):
        # Wrap raw bytes (e.g. received over the wire) as a UCXAddress.
        buf = Array(buffer)
        assert buf.c_contiguous
        return UCXAddress(buf.ptr, buf.nbytes)

    @classmethod
    def from_worker(cls, UCXWorker worker):
        # Export the worker's own address (used for wireless rendezvous).
        cdef ucs_status_t status
        cdef ucp_worker_h ucp_worker = worker._handle
        cdef ucp_address_t *address
        cdef size_t length
        status = ucp_worker_get_address(ucp_worker, &address, &length)
        assert_ucs_status(status)
        return UCXAddress(<uintptr_t>address, length, worker=worker)

    @property
    def address(self):
        # Raw pointer value as a Python int (matches ucx_api.pyi).
        return int(<uintptr_t>self._address)

    @property
    def length(self):
        return int(self._length)

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        get_ucx_object(buffer, flags, self._address, self._length, self)

    def __releasebuffer__(self, Py_buffer *buffer):
        pass

    def __reduce__(self):
        return (UCXAddress.from_buffer, (bytes(self),))

    def __hash__(self):
        return hash(bytes(self))
-------------------------------------------------------------------------------- /ucp/_libs/ucx_api.pyi: --------------------------------------------------------------------------------
import enum
from typing import Callable, Dict, Iterable, Mapping, Optional, Tuple

# typedefs.pyx

class AllocatorType(enum.Enum):
    HOST: int
    CUDA: int
    UNSUPPORTED: int

class Feature(enum.Enum):
    TAG: int
    RMA: int
    AMO32: int
    AMO64: int
    WAKEUP: int
    STREAM: int
    AM: int

# utils.pyx

def get_current_options() -> Dict[str, str]: ...

# FIX: ``Tuple[int]`` means a 1-tuple, but the version is compared against
# 3-tuples like ``(1, 12, 0)`` (see test_config.py); use ``Tuple[int, ...]``.
def get_ucx_version() -> Tuple[int, ...]: ...

# ucx_object.pyx

class UCXObject:
    def close(self) -> None: ...

# ucx_context.pyx

class UCXContext(UCXObject):
    def __init__(
        self, config_dict: Mapping = ..., feature_flags: Iterable[Feature] = ...
    ): ...

# ucx_address.pyx

class UCXAddress:
    @classmethod
    def from_buffer(cls, buffer) -> UCXAddress: ...
    @classmethod
    def from_worker(cls, worker: UCXWorker) -> UCXAddress: ...
    @property
    def address(self) -> int: ...
    @property
    def length(self) -> int: ...

# ucx_worker.pyx

class UCXWorker(UCXObject):
    def __init__(self, context: UCXContext): ...
    def progress(self) -> None: ...
    def ep_create(
        self, ip_address: str, port: int, endpoint_error_handling: bool
    ) -> UCXEndpoint: ...
    # NOTE(review): this stub's parameters (ip_address/port) look copy-pasted
    # from ep_create; a from-worker-address variant presumably takes an
    # address -- verify against ucx_worker.pyx.
    def ep_create_from_worker_address(
        self, ip_address: str, port: int, endpoint_error_handling: bool
    ) -> UCXEndpoint: ...
    def ep_create_from_conn_request(
        self, conn_request: int, endpoint_error_handling: bool
    ) -> UCXEndpoint: ...
63 | def register_am_allocator( 64 | self, allocator: Callable, allocator_type: AllocatorType 65 | ) -> None: ... 66 | 67 | # ucx_listener.pyx 68 | 69 | class UCXListener(UCXObject): 70 | port: int 71 | ip: str 72 | def __init__( 73 | self, 74 | worker: UCXWorker, 75 | port: int, 76 | cb_func: Callable, 77 | cb_args: Optional[tuple] = ..., 78 | cb_kwargs: dict = ..., 79 | ): ... 80 | 81 | # ucx_endpoint.pyx 82 | 83 | class UCXEndpoint(UCXObject): 84 | def info(self) -> str: ... 85 | @property 86 | def worker(self) -> UCXWorker: ... 87 | def unpack_rkey(self, rkey) -> UCXRkey: ... 88 | 89 | # ucx_memory_handle.pyx 90 | 91 | class UCXMemoryHandle(UCXObject): 92 | @classmethod 93 | def alloc(cls, ctx: UCXContext, size: int) -> UCXMemoryHandle: ... 94 | @classmethod 95 | def map(cls, ctx: UCXContext, buffer) -> UCXMemoryHandle: ... 96 | def pack_rkey(self) -> PackedRemoteKey: ... 97 | 98 | # transfer_am.pyx 99 | 100 | def am_send_nbx( 101 | ep: UCXEndpoint, 102 | buffer, 103 | nbytes: int, 104 | cb_func: Callable, 105 | cb_args: Optional[tuple] = ..., 106 | cb_kwargs: Optional[dict] = ..., 107 | name: Optional[str] = ..., 108 | ): ... 109 | def am_recv_nb( 110 | ep: UCXEndpoint, 111 | cb_func: Callable, 112 | cb_args: Optional[tuple] = ..., 113 | cb_kwargs: Optional[dict] = ..., 114 | name: Optional[str] = ..., 115 | ): ... 116 | 117 | # transfer_stream.pyx 118 | 119 | def stream_send_nb( 120 | ep: UCXEndpoint, 121 | buffer, 122 | nbytes: int, 123 | cb_func: Callable, 124 | cb_args: Optional[tuple] = ..., 125 | cb_kwargs: Optional[dict] = ..., 126 | name: Optional[str] = ..., 127 | ): ... 128 | def stream_recv_nb( 129 | ep: UCXEndpoint, 130 | buffer, 131 | nbytes: int, 132 | cb_func: Callable, 133 | cb_args: Optional[tuple] = ..., 134 | cb_kwargs: Optional[dict] = ..., 135 | name: Optional[str] = ..., 136 | ): ... 
137 | 138 | # transfer_tag.pyx 139 | 140 | def tag_send_nb( 141 | ep: UCXEndpoint, 142 | buffer, 143 | nbytes: int, 144 | tag: int, 145 | cb_func: Callable, 146 | cb_args: Optional[tuple] = ..., 147 | cb_kwargs: Optional[dict] = ..., 148 | name: Optional[str] = ..., 149 | ): ... 150 | def tag_recv_nb( 151 | worker: UCXWorker, 152 | buffer, 153 | nbytes: int, 154 | tag: int, 155 | cb_func: Callable, 156 | cb_args: Optional[tuple] = ..., 157 | cb_kwargs: Optional[dict] = ..., 158 | name: Optional[str] = ..., 159 | ep: Optional[UCXEndpoint] = ..., 160 | ): ... 161 | -------------------------------------------------------------------------------- /ucp/_libs/ucx_api.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | include "packed_remote_key.pyx" 5 | include "transfer_am.pyx" 6 | include "transfer_common.pyx" 7 | include "transfer_stream.pyx" 8 | include "transfer_tag.pyx" 9 | include "typedefs.pyx" 10 | include "ucx_address.pyx" 11 | include "ucx_context.pyx" 12 | include "ucx_endpoint.pyx" 13 | include "ucx_listener.pyx" 14 | include "ucx_memory_handle.pyx" 15 | include "ucx_object.pyx" 16 | include "ucx_request.pyx" 17 | include "ucx_rkey.pyx" 18 | include "ucx_rma.pyx" 19 | include "ucx_worker.pyx" 20 | include "ucx_worker_cb.pyx" 21 | include "ucxio.pyx" 22 | include "utils.pyx" 23 | -------------------------------------------------------------------------------- /ucp/_libs/ucx_context.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # Copyright (c) 2021 UT-Battelle, LLC. All rights reserved. 3 | # See file LICENSE for terms. 
4 | 5 | # cython: language_level=3 6 | 7 | import functools 8 | import logging 9 | 10 | from libc.stdint cimport uintptr_t 11 | from libc.stdio cimport FILE 12 | from libc.string cimport memset 13 | 14 | from .ucx_api_dep cimport * 15 | 16 | logger = logging.getLogger("ucx") 17 | 18 | 19 | def _ucx_context_handle_finalizer(uintptr_t handle): 20 | ucp_cleanup( handle) 21 | 22 | 23 | cdef class UCXContext(UCXObject): 24 | """Python representation of `ucp_context_h` 25 | 26 | Parameters 27 | ---------- 28 | config_dict: Mapping[str, str] 29 | UCX options such as "MEMTYPE_CACHE=n" and "SEG_SIZE=3M" 30 | feature_flags: Iterable[Feature] 31 | Tuple of UCX feature flags 32 | """ 33 | cdef: 34 | ucp_context_h _handle 35 | dict _config 36 | tuple _feature_flags 37 | readonly bint cuda_support 38 | 39 | def __init__( 40 | self, 41 | config_dict={}, 42 | feature_flags=( 43 | Feature.TAG, 44 | Feature.WAKEUP, 45 | Feature.STREAM, 46 | Feature.AM, 47 | Feature.RMA 48 | ) 49 | ): 50 | cdef ucp_params_t ucp_params 51 | cdef ucp_worker_params_t worker_params 52 | cdef ucs_status_t status 53 | self._feature_flags = tuple(feature_flags) 54 | 55 | memset(&ucp_params, 0, sizeof(ucp_params)) 56 | ucp_params.field_mask = ( 57 | UCP_PARAM_FIELD_FEATURES | 58 | UCP_PARAM_FIELD_REQUEST_SIZE | 59 | UCP_PARAM_FIELD_REQUEST_INIT 60 | ) 61 | ucp_params.features = functools.reduce( 62 | lambda x, y: x | y.value, feature_flags, 0 63 | ) 64 | ucp_params.request_size = sizeof(ucx_py_request) 65 | ucp_params.request_init = ( 66 | ucx_py_request_reset 67 | ) 68 | 69 | cdef ucp_config_t *config = _read_ucx_config(config_dict) 70 | try: 71 | status = ucp_init(&ucp_params, config, &self._handle) 72 | assert_ucs_status(status) 73 | self._config = ucx_config_to_dict(config) 74 | finally: 75 | ucp_config_release(config) 76 | 77 | # UCX supports CUDA if "cuda" is part of the TLS or TLS is "all" 78 | cdef str tls = self._config["TLS"] 79 | cuda_transports = {"cuda", "cuda_copy"} 80 | if tls.startswith("^"): 
81 | # UCX_TLS=^x,y,z means "all \ {x, y, z}" 82 | disabled = set(tls[1:].split(",")) 83 | self.cuda_support = not (disabled & cuda_transports) 84 | else: 85 | enabled = set(tls.split(",")) 86 | self.cuda_support = bool( 87 | enabled & ({"all", "cuda_ipc"} | cuda_transports) 88 | ) 89 | 90 | self.add_handle_finalizer( 91 | _ucx_context_handle_finalizer, 92 | int(self._handle) 93 | ) 94 | 95 | logger.info("UCP initiated using config: ") 96 | cdef str k, v 97 | for k, v in self._config.items(): 98 | logger.info(f" {k}: {v}") 99 | 100 | cpdef dict get_config(self): 101 | return self._config 102 | 103 | @property 104 | def handle(self): 105 | assert self.initialized 106 | return int(self._handle) 107 | 108 | def info(self): 109 | assert self.initialized 110 | 111 | cdef FILE *text_fd = create_text_fd() 112 | ucp_context_print_info(self._handle, text_fd) 113 | return decode_text_fd(text_fd) 114 | 115 | def map(self, mem): 116 | return UCXMemoryHandle.map(self, mem) 117 | 118 | def alloc(self, size): 119 | return UCXMemoryHandle.alloc(self, size) 120 | -------------------------------------------------------------------------------- /ucp/_libs/ucx_listener.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 
3 | 4 | # cython: language_level=3 5 | 6 | from libc.stdint cimport uint16_t, uintptr_t 7 | 8 | from .exceptions import log_errors 9 | from .ucx_api_dep cimport * 10 | 11 | 12 | cdef void _listener_callback(ucp_conn_request_h conn_request, void *args) with gil: 13 | """Callback function used by UCXListener""" 14 | cdef dict cb_data = args 15 | 16 | with log_errors(): 17 | cb_data['cb_func']( 18 | int(conn_request), 19 | *cb_data['cb_args'], 20 | **cb_data['cb_kwargs'] 21 | ) 22 | 23 | 24 | def _ucx_listener_handle_finalizer(uintptr_t handle): 25 | ucp_listener_destroy( handle) 26 | 27 | 28 | cdef class UCXListener(UCXObject): 29 | """Python representation of `ucp_listener_h` 30 | 31 | Create and start a listener to accept incoming connections. 32 | 33 | Notice, the listening is closed when the returned Listener 34 | goes out of scope thus remember to keep a reference to the object. 35 | 36 | Parameters 37 | ---------- 38 | worker: UCXWorker 39 | Listening worker. 40 | port: int 41 | An unused port number for listening, or `0` to let UCX assign 42 | an unused port. 43 | callback_func: callable 44 | A callback function that gets invoked when an incoming 45 | connection is accepted. The arguments are `conn_request` 46 | followed by *cb_args and **cb_kwargs (if not None). 47 | cb_args: tuple, optional 48 | Extra arguments to the call-back function 49 | cb_kwargs: dict, optional 50 | Extra keyword arguments to the call-back function 51 | 52 | Returns 53 | ------- 54 | Listener: UCXListener 55 | The new listener. 
When this object is deleted, the listening stops 56 | """ 57 | 58 | cdef: 59 | ucp_listener_h _handle 60 | dict cb_data 61 | 62 | cdef public: 63 | uint16_t port 64 | str ip 65 | 66 | def __init__( 67 | self, 68 | UCXWorker worker, 69 | uint16_t port, 70 | cb_func, 71 | tuple cb_args=None, 72 | dict cb_kwargs=None 73 | ): 74 | if cb_args is None: 75 | cb_args = () 76 | if cb_kwargs is None: 77 | cb_kwargs = {} 78 | cdef ucp_listener_params_t params 79 | cdef ucp_listener_conn_callback_t _listener_cb = ( 80 | _listener_callback 81 | ) 82 | cdef ucp_listener_attr_t attr 83 | self.cb_data = { 84 | "cb_func": cb_func, 85 | "cb_args": cb_args, 86 | "cb_kwargs": cb_kwargs, 87 | } 88 | params.field_mask = ( 89 | UCP_LISTENER_PARAM_FIELD_SOCK_ADDR | UCP_LISTENER_PARAM_FIELD_CONN_HANDLER 90 | ) 91 | params.conn_handler.cb = _listener_cb 92 | params.conn_handler.arg = self.cb_data 93 | if c_util_set_sockaddr(¶ms.sockaddr, NULL, port): 94 | raise MemoryError("Failed allocation of sockaddr") 95 | 96 | cdef ucs_status_t status = ucp_listener_create( 97 | worker._handle, ¶ms, &self._handle 98 | ) 99 | c_util_sockaddr_free(¶ms.sockaddr) 100 | assert_ucs_status(status) 101 | 102 | attr.field_mask = UCP_LISTENER_ATTR_FIELD_SOCKADDR 103 | status = ucp_listener_query(self._handle, &attr) 104 | if status != UCS_OK: 105 | ucp_listener_destroy(self._handle) 106 | assert_ucs_status(status) 107 | 108 | DEF MAX_STR_LEN = 50 109 | cdef char ip_str[MAX_STR_LEN] 110 | cdef char port_str[MAX_STR_LEN] 111 | c_util_sockaddr_get_ip_port_str(&attr.sockaddr, 112 | ip_str, 113 | port_str, 114 | MAX_STR_LEN) 115 | 116 | self.port = int(port_str.decode(errors="ignore")) 117 | self.ip = ip_str.decode(errors="ignore") 118 | 119 | self.add_handle_finalizer( 120 | _ucx_listener_handle_finalizer, 121 | int(self._handle) 122 | ) 123 | worker.add_child(self) 124 | 125 | @property 126 | def handle(self): 127 | assert self.initialized 128 | return int(self._handle) 129 | 
-------------------------------------------------------------------------------- /ucp/_libs/ucx_memory_handle.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # Copyright (c) 2021, UT-Battelle, LLC. All rights reserved. 3 | # See file LICENSE for terms. 4 | 5 | # cython: language_level=3 6 | 7 | from libc.stdint cimport uintptr_t 8 | 9 | from .arr cimport Array 10 | from .ucx_api_dep cimport * 11 | 12 | 13 | def _ucx_mem_handle_finalizer(uintptr_t handle_as_int, UCXContext ctx): 14 | assert ctx.initialized 15 | cdef ucp_mem_h handle = handle_as_int 16 | cdef ucs_status_t status 17 | status = ucp_mem_unmap(ctx._handle, handle) 18 | assert_ucs_status(status) 19 | 20 | 21 | cdef class UCXMemoryHandle(UCXObject): 22 | """ Python representation for ucp_mem_h type. Users should not instance this class 23 | directly and instead use either the map or the alloc class methods 24 | """ 25 | cdef ucp_mem_h _mem_handle 26 | cdef UCXContext _context 27 | cdef uint64_t r_address 28 | cdef size_t _length 29 | 30 | def __cinit__(self, UCXContext ctx, uintptr_t par): 31 | cdef ucs_status_t status 32 | cdef ucp_context_h ctx_handle = ctx.handle 33 | cdef ucp_mem_map_params_t *params = par 34 | self._context = ctx 35 | status = ucp_mem_map(ctx_handle, params, &self._mem_handle) 36 | assert_ucs_status(status) 37 | self._populate_metadata() 38 | self.add_handle_finalizer( 39 | _ucx_mem_handle_finalizer, 40 | int(self._mem_handle), 41 | self._context 42 | ) 43 | ctx.add_child(self) 44 | 45 | @classmethod 46 | def alloc(cls, ctx, size): 47 | """ Allocate a new pool of registered memory. This memory can be used for 48 | RMA and AMO operations. This memory should not be accessed from outside 49 | these operations. 
50 | 51 | Parameters 52 | ---------- 53 | ctx: UCXContext 54 | The UCX context that this memory should be registered to 55 | size: int 56 | Minimum amount of memory to allocate 57 | """ 58 | cdef ucp_mem_map_params_t params 59 | cdef ucs_status_t status 60 | 61 | params.field_mask = ( 62 | UCP_MEM_MAP_PARAM_FIELD_FLAGS | 63 | UCP_MEM_MAP_PARAM_FIELD_LENGTH 64 | ) 65 | params.length = size 66 | params.flags = UCP_MEM_MAP_NONBLOCK | UCP_MEM_MAP_ALLOCATE 67 | 68 | return UCXMemoryHandle(ctx, ¶ms) 69 | 70 | @classmethod 71 | def map(cls, ctx, mem): 72 | """ Register an existing memory object to UCX for use in RMA and AMO operations 73 | It is not safe to access this memory from outside UCX while operations are 74 | outstanding 75 | 76 | Parameters 77 | ---------- 78 | ctx: UCXContext 79 | The UCX context that this memory should be registered to 80 | mem: buffer 81 | The memory object to be registered 82 | """ 83 | cdef ucp_mem_map_params_t params 84 | cdef ucs_status_t status 85 | 86 | buff = Array(mem) 87 | 88 | params.field_mask = ( 89 | UCP_MEM_MAP_PARAM_FIELD_ADDRESS | 90 | UCP_MEM_MAP_PARAM_FIELD_LENGTH 91 | ) 92 | params.address = buff.ptr 93 | params.length = buff.nbytes 94 | 95 | return UCXMemoryHandle(ctx, ¶ms) 96 | 97 | def pack_rkey(self): 98 | """ Returns an UCXRKey object that represents a packed key. This key is what 99 | allows the UCX API to associate this memory with an EP. 
100 | """ 101 | return PackedRemoteKey.from_mem_handle(self) 102 | 103 | @property 104 | def mem_handle(self): 105 | return self._mem_handle 106 | 107 | # Done as a separate function because some day I plan on making this loaded lazily 108 | # I believe this reports the actual registered space, rather than what was requested 109 | def _populate_metadata(self): 110 | cdef ucs_status_t status 111 | cdef ucp_mem_attr_t attr 112 | 113 | attr.field_mask = ( 114 | UCP_MEM_ATTR_FIELD_ADDRESS | 115 | UCP_MEM_ATTR_FIELD_LENGTH 116 | ) 117 | status = ucp_mem_query(self._mem_handle, &attr) 118 | assert_ucs_status(status) 119 | self.r_address = attr.address 120 | self._length = attr.length 121 | 122 | @property 123 | def address(self): 124 | """ Get base address for the memory registration """ 125 | return self.r_address 126 | 127 | @property 128 | def length(self): 129 | """ Get length of registered memory """ 130 | return self._length 131 | -------------------------------------------------------------------------------- /ucp/_libs/ucx_object.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | # cython: language_level=3 5 | 6 | import weakref 7 | 8 | 9 | def _handle_finalizer_wrapper( 10 | children, handle_finalizer, handle_as_int, *extra_args, **extra_kargs 11 | ): 12 | for weakref_to_child in children: 13 | child = weakref_to_child() 14 | if child is not None: 15 | child.close() 16 | handle_finalizer(handle_as_int, *extra_args, **extra_kargs) 17 | 18 | 19 | cdef class UCXObject: 20 | """Base class for UCX classes 21 | 22 | This base class streamlines the cleanup of UCX objects and reduces duplicate code. 23 | """ 24 | cdef: 25 | object __weakref__ 26 | object _finalizer 27 | list _children 28 | 29 | def __cinit__(self): 30 | # The finalizer, which can be called multiple times but only 31 | # evoke the finalizer function once. 
# Is None when the underlying UCX handle hasn't been initialized.
22 | 23 | Notice, this class doesn't own the handle and multiple instances of 24 | UCXRequest can point to the same underlying UCX handle. 25 | Furthermore, UCX can modify/free the UCX handle without notice 26 | thus we use `_uid` to make sure the handle hasn't been modified. 27 | """ 28 | cdef: 29 | ucx_py_request *_handle 30 | unsigned int _uid 31 | 32 | def __init__(self, uintptr_t req_as_int): 33 | global _ucx_py_request_counter 34 | cdef ucx_py_request *req = req_as_int 35 | assert req != NULL 36 | self._handle = req 37 | 38 | cdef dict info = {"status": "pending"} 39 | if self._handle.info == NULL: # First time we are wrapping this UCX request 40 | Py_INCREF(info) 41 | self._handle.info = info 42 | _ucx_py_request_counter += 1 43 | self._uid = _ucx_py_request_counter 44 | assert self._handle.uid == 0 45 | self._handle.uid = _ucx_py_request_counter 46 | else: 47 | self._uid = self._handle.uid 48 | 49 | cpdef bint closed(self): 50 | return self._handle == NULL or self._uid != self._handle.uid 51 | 52 | cpdef void close(self) except *: 53 | """This routine releases the non-blocking request back to UCX, 54 | regardless of its current state. Communications operations associated with 55 | this request will make progress internally, however no further notifications or 56 | callbacks will be invoked for this request. 
""" 57 | 58 | if not self.closed(): 59 | Py_DECREF(self._handle.info) 60 | self._handle.info = NULL 61 | self._handle.uid = 0 62 | ucp_request_free(self._handle) 63 | self._handle = NULL 64 | 65 | @property 66 | def info(self): 67 | assert not self.closed() 68 | return self._handle.info 69 | 70 | @property 71 | def handle(self): 72 | assert not self.closed() 73 | return int(self._handle) 74 | 75 | def __hash__(self): 76 | if self.closed(): 77 | return id(self) 78 | else: 79 | return self._uid 80 | 81 | def __eq__(self, other): 82 | return hash(self) == hash(other) 83 | 84 | def __repr__(self): 85 | if self.closed(): 86 | return "" 87 | else: 88 | return ( 89 | f"" 91 | ) 92 | 93 | 94 | cdef UCXRequest _handle_status( 95 | ucs_status_ptr_t status, 96 | int64_t expected_receive, 97 | cb_func, 98 | cb_args, 99 | cb_kwargs, 100 | unicode name, 101 | set inflight_msgs 102 | ): 103 | if UCS_PTR_STATUS(status) == UCS_OK: 104 | return 105 | cdef str ucx_status_msg, msg 106 | if UCS_PTR_IS_ERR(status): 107 | ucx_status_msg = ( 108 | ucs_status_string(UCS_PTR_STATUS(status)).decode("utf-8") 109 | ) 110 | msg = "<%s>: %s" % (name, ucx_status_msg) 111 | raise UCXError(msg) 112 | cdef UCXRequest req = UCXRequest( status) 113 | assert not req.closed() 114 | cdef dict req_info = req._handle.info 115 | if req_info["status"] == "finished": 116 | try: 117 | # The callback function has already handled the request 118 | received = req_info.get("received", None) 119 | if received is not None and received != expected_receive: 120 | msg = "<%s>: length mismatch: %d (got) != %d (expected)" % ( 121 | name, received, expected_receive 122 | ) 123 | raise UCXMsgTruncated(msg) 124 | else: 125 | cb_func(req, None, *cb_args, **cb_kwargs) 126 | return 127 | finally: 128 | req.close() 129 | else: 130 | req_info["cb_func"] = cb_func 131 | req_info["cb_args"] = cb_args 132 | req_info["cb_kwargs"] = cb_kwargs 133 | req_info["expected_receive"] = expected_receive 134 | req_info["name"] = name 135 | 
inflight_msgs.add(req) 136 | req_info["inflight_msgs"] = inflight_msgs 137 | return req 138 | -------------------------------------------------------------------------------- /ucp/_libs/ucx_rkey.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # Copyright (c) 2021, UT-Battelle, LLC. All rights reserved. 3 | # See file LICENSE for terms. 4 | 5 | # cython: language_level=3 6 | 7 | import logging 8 | 9 | from libc.stdint cimport uintptr_t 10 | 11 | from .arr cimport Array 12 | from .ucx_api_dep cimport * 13 | 14 | logger = logging.getLogger("ucx") 15 | 16 | 17 | def _ucx_remote_mem_finalizer_post_flush(req, exception, UCXRkey rkey): 18 | if exception is not None: 19 | logger.debug("Remote memory finalizer exception: %s" % str(exception)) 20 | ucp_rkey_destroy(rkey._handle) 21 | 22 | 23 | def _ucx_rkey_finalizer(UCXRkey rkey, UCXEndpoint ep): 24 | req = ep.flush(_ucx_remote_mem_finalizer_post_flush, (rkey,)) 25 | 26 | # Flush completed immediately and callback wasn't called 27 | if req is None: 28 | ucp_rkey_destroy(rkey._handle) 29 | 30 | 31 | cdef class UCXRkey(UCXObject): 32 | cdef ucp_rkey_h _handle 33 | cdef UCXEndpoint ep 34 | 35 | def __init__(self, UCXEndpoint ep, PackedRemoteKey rkey): 36 | cdef ucs_status_t status 37 | rkey_arr = Array(rkey) 38 | cdef const void *key_data = rkey_arr.ptr 39 | status = ucp_ep_rkey_unpack(ep._handle, key_data, &self._handle) 40 | assert_ucs_status(status) 41 | self.ep = ep 42 | self.add_handle_finalizer( 43 | _ucx_rkey_finalizer, 44 | self, 45 | ep 46 | ) 47 | ep.add_child(self) 48 | 49 | @property 50 | def ep(self): 51 | return self.ep 52 | -------------------------------------------------------------------------------- /ucp/_libs/ucxio.pyx: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, UT-Battelle, LLC. All rights reserved. 2 | # See file LICENSE for terms. 
3 | 4 | from io import SEEK_CUR, SEEK_END, SEEK_SET, RawIOBase 5 | 6 | from .arr cimport Array 7 | from .ucx_api_dep cimport * 8 | 9 | 10 | def blocking_handler(request, exception, finished): 11 | assert exception is None 12 | finished[0] = True 13 | 14 | 15 | class UCXIO(RawIOBase): 16 | """A class to simulate python streams backed by UCX RMA operations 17 | 18 | Parameters 19 | ---------- 20 | dest: int 21 | A 64 bit number that represents the remote address that will be written to 22 | and read from. 23 | length: int 24 | Maximum length of the region that can be written to and read from. 25 | rkey: UCXRkey 26 | An unpacked UCXRkey that represents the remote memory that was unpacked by 27 | UCX for use in RMA operations. 28 | """ 29 | 30 | def __init__(self, dest, length, rkey): 31 | self.pos = 0 32 | self.remote_addr = dest 33 | self.length = length 34 | self.rkey = rkey 35 | self.cb_finished = [False] 36 | 37 | def block_on_request(self, req): 38 | if req is not None: 39 | while not self.cb_finished[0]: 40 | self.rkey.ep.worker.progress() 41 | self.cb_finished[0] = False 42 | 43 | def flush(self): 44 | req = self.rkey.ep.flush(blocking_handler, cb_args=(self.cb_finished,)) 45 | self.block_on_request(req) 46 | 47 | def seek(self, pos, whence=SEEK_SET): 48 | if whence == SEEK_SET: 49 | self.pos = min(max(pos, 0), self.length) 50 | elif whence == SEEK_CUR: 51 | if pos < 0: 52 | self.pos = max(self.pos + pos, 0) 53 | else: 54 | self.pos = min(self.pos + pos, self.length) 55 | elif whence == SEEK_END: 56 | self.pos = min(max(self.pos + pos, 0), self.length) 57 | else: 58 | raise ValueError("Invalid argument") 59 | return self.pos 60 | 61 | def _do_rma(self, op, buff): 62 | data = Array(buff) 63 | size = data.nbytes 64 | if self.pos + size > self.length: 65 | size = self.length - self.pos 66 | finished = op(data, size, self.remote_addr + self.pos, self.rkey) 67 | self.pos += size 68 | if not finished: 69 | self.flush() 70 | return size 71 | 72 | def readinto(self, 
buff): 73 | return self._do_rma(get_nbi, buff) 74 | 75 | def write(self, buff): 76 | return self._do_rma(put_nbi, buff) 77 | 78 | def seekable(self): 79 | return True 80 | 81 | def writable(self): 82 | return True 83 | 84 | def readable(self): 85 | return True 86 | -------------------------------------------------------------------------------- /ucp/_libs/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | import fcntl 5 | import glob 6 | import os 7 | import socket 8 | import struct 9 | 10 | try: 11 | from nvtx import annotate as nvtx_annotate 12 | except ImportError: 13 | # If nvtx module is not installed, `annotate` yields only. 14 | from contextlib import contextmanager 15 | 16 | @contextmanager 17 | def nvtx_annotate(message=None, color=None, domain=None): 18 | yield 19 | 20 | 21 | try: 22 | from dask.utils import format_bytes, format_time, parse_bytes 23 | except ImportError: 24 | 25 | def format_time(x): 26 | if x < 1e-6: 27 | return f"{x * 1e9:.3f} ns" 28 | if x < 1e-3: 29 | return f"{x * 1e6:.3f} us" 30 | if x < 1: 31 | return f"{x * 1e3:.3f} ms" 32 | else: 33 | return f"{x:.3f} s" 34 | 35 | def format_bytes(x): 36 | """Return formatted string in B, KiB, MiB, GiB or TiB""" 37 | if x < 1024: 38 | return f"{x} B" 39 | elif x < 1024**2: 40 | return f"{x / 1024:.2f} KiB" 41 | elif x < 1024**3: 42 | return f"{x / 1024**2:.2f} MiB" 43 | elif x < 1024**4: 44 | return f"{x / 1024**3:.2f} GiB" 45 | else: 46 | return f"{x / 1024**4:.2f} TiB" 47 | 48 | parse_bytes = None 49 | 50 | 51 | def print_separator(separator="-", length=80): 52 | """Print a single separator character multiple times""" 53 | print(separator * length) 54 | 55 | 56 | def print_key_value(key, value, key_length=25): 57 | """Print a key and value with fixed key-field length""" 58 | print(f"{key: <{key_length}} | {value}") 59 | 60 | 61 | def 
print_multi(values, key_length=25): 62 | """Print a key and value with fixed key-field length""" 63 | assert isinstance(values, tuple) or isinstance(values, list) 64 | assert len(values) > 1 65 | 66 | print_str = "".join(f"{s: <{key_length}} | " for s in values[:-1]) 67 | print_str += values[-1] 68 | print(print_str) 69 | 70 | 71 | def get_address(ifname=None): 72 | """ 73 | Get the address associated with a network interface. 74 | 75 | Parameters 76 | ---------- 77 | ifname : str 78 | The network interface name to find the address for. 79 | If None, it uses the value of environment variable `UCXPY_IFNAME` 80 | and if `UCXPY_IFNAME` is not set it defaults to "ib0" 81 | An OSError is raised for invalid interfaces. 82 | 83 | Returns 84 | ------- 85 | address : str 86 | The inet addr associated with an interface. 87 | 88 | Raises 89 | ------ 90 | RuntimeError 91 | If a network address could not be determined. 92 | 93 | Examples 94 | -------- 95 | >>> get_address() 96 | '10.33.225.160' 97 | 98 | >>> get_address(ifname='lo') 99 | '127.0.0.1' 100 | """ 101 | 102 | def _get_address(ifname): 103 | ifname = ifname.encode() 104 | with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s: 105 | return socket.inet_ntoa( 106 | fcntl.ioctl( 107 | s.fileno(), 0x8915, struct.pack("256s", ifname[:15]) # SIOCGIFADDR 108 | )[20:24] 109 | ) 110 | 111 | def _try_interfaces(): 112 | prefix_priority = ["ib", "eth", "en", "docker"] 113 | iftypes = {p: [] for p in prefix_priority} 114 | for i in glob.glob("/sys/class/net/*"): 115 | name = i.split("/")[-1] 116 | for p in prefix_priority: 117 | if name.startswith(p): 118 | iftypes[p].append(name) 119 | for p in prefix_priority: 120 | iftype = iftypes[p] 121 | iftype.sort() 122 | for i in iftype: 123 | try: 124 | return _get_address(i) 125 | except OSError: 126 | pass 127 | 128 | raise RuntimeError( 129 | "A network address could not be determined, an interface that has a valid " 130 | "IP address with the environment variable 
`UCXPY_IFNAME`." 131 | ) 132 | 133 | if ifname is None: 134 | ifname = os.environ.get("UCXPY_IFNAME") 135 | 136 | if ifname is not None: 137 | return _get_address(ifname) 138 | else: 139 | return _try_interfaces() 140 | -------------------------------------------------------------------------------- /ucp/_libs/utils_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. 2 | # See file LICENSE for terms. 3 | 4 | import multiprocessing as mp 5 | 6 | from ucp._libs import ucx_api 7 | from ucp._libs.arr import Array 8 | 9 | mp = mp.get_context("spawn") 10 | 11 | 12 | def blocking_handler(request, exception, finished): 13 | assert exception is None 14 | finished[0] = True 15 | 16 | 17 | def blocking_flush(obj): 18 | finished = [False] 19 | if not hasattr(obj, "progress"): 20 | progress = obj.worker.progress 21 | else: 22 | progress = obj.progress 23 | req = obj.flush(cb_func=blocking_handler, cb_args=(finished,)) 24 | if req is not None: 25 | while not finished[0]: 26 | progress() 27 | 28 | 29 | def blocking_send(worker, ep, msg, tag=0): 30 | msg = Array(msg) 31 | finished = [False] 32 | req = ucx_api.tag_send_nb( 33 | ep, 34 | msg, 35 | msg.nbytes, 36 | tag=tag, 37 | cb_func=blocking_handler, 38 | cb_args=(finished,), 39 | ) 40 | if req is not None: 41 | while not finished[0]: 42 | worker.progress() 43 | 44 | 45 | def blocking_recv(worker, ep, msg, tag=0): 46 | msg = Array(msg) 47 | finished = [False] 48 | req = ucx_api.tag_recv_nb( 49 | worker, 50 | msg, 51 | msg.nbytes, 52 | tag=tag, 53 | cb_func=blocking_handler, 54 | cb_args=(finished,), 55 | ep=ep, 56 | ) 57 | if req is not None: 58 | while not finished[0]: 59 | worker.progress() 60 | 61 | 62 | def non_blocking_handler(request, exception, completed_cb): 63 | if exception is not None: 64 | print(exception) 65 | assert exception is None 66 | completed_cb() 67 | 68 | 69 | def non_blocking_send(worker, ep, msg, 
def non_blocking_recv(worker, ep, msg, started_cb, completed_cb, tag=0):
    """Post a non-blocking tag receive, invoking the given callbacks.

    `started_cb` is called right before posting; `completed_cb` is called
    by the UCX callback on completion (or immediately if the receive
    completed inline).
    """
    buf = Array(msg)
    started_cb()
    request = ucx_api.tag_recv_nb(
        worker,
        buf,
        buf.nbytes,
        tag=tag,
        cb_func=non_blocking_handler,
        cb_args=(completed_cb,),
        ep=ep,
    )
    # A None request means the operation completed inline and the UCX
    # callback will never fire, so signal completion here.
    if request is None:
        completed_cb()
    return request


def blocking_am_send(worker, ep, msg):
    """Send an Active Message and progress the worker until it completes."""
    buf = Array(msg)
    done = [False]
    request = ucx_api.am_send_nbx(
        ep,
        buf,
        buf.nbytes,
        cb_func=blocking_handler,
        cb_args=(done,),
    )
    if request is None:
        # Completed inline; nothing to progress.
        return
    while not done[0]:
        worker.progress()


def blocking_am_recv_handler(recv_obj, exception, ret):
    """Callback for `blocking_am_recv`: store the received object in `ret`."""
    assert exception is None
    ret[0] = recv_obj


def blocking_am_recv(worker, ep):
    """Receive an Active Message, progressing the worker until data arrives.

    Returns the received object.
    """
    result = [None]
    ucx_api.am_recv_nb(
        ep,
        cb_func=blocking_am_recv_handler,
        cb_args=(result,),
    )
    while result[0] is None:
        worker.progress()
    return result[0]
class BaseServer(ABC):
    """Abstract base class for benchmark server implementations."""

    @abstractmethod
    def __init__(self, args: Namespace, queue: Queue):
        """
        Benchmark server.

        Parameters
        ----------
        args: argparse.Namespace
            Parsed command-line arguments that will be used as parameters during
            the `run` method.
        queue: Queue
            Queue object where server will put the port it is listening at.
        """
        pass

    @property
    @abstractmethod
    def has_cuda_support(self) -> bool:
        # NOTE: `self` was previously missing from this abstract property's
        # signature; subclasses may still override with a plain class
        # attribute (e.g. `has_cuda_support = False`).
        """
        Check whether server implementation supports CUDA memory transfers.

        Returns
        -------
        ret: bool
            `True` if CUDA is supported, `False` otherwise.
        """
        return False

    @abstractmethod
    def run(self):
        """
        Run the benchmark server.

        The server is executed as follows:
        1. Start the listener and put port where it is listening into the queue
           registered in constructor;
        2. Setup any additional context (Active Message registration, memory buffers
           to reuse, etc.);
        3. Transfer data back-and-forth with client;
        4. Shutdown server.
        """
        pass


class BaseClient(ABC):
    """Abstract base class for benchmark client implementations."""

    @abstractmethod
    def __init__(self, args: Namespace, queue: Queue, server_address: str, port: int):
        """
        Benchmark client.

        Parameters
        ----------
        args: argparse.Namespace
            Parsed command-line arguments that will be used as parameters during
            the `run` method.
        queue: Queue
            Queue object where to put timing results.
        server_address: str
            Hostname or IP address where server is listening at.
        port: int
            Port where server is listening at.
        """
        pass

    @property
    @abstractmethod
    def has_cuda_support(self) -> bool:
        # NOTE: `self` added here as well, mirroring `BaseServer`.
        """
        Check whether client implementation supports CUDA memory transfers.

        Returns
        -------
        ret: bool
            `True` if CUDA is supported, `False` otherwise.
        """
        return False

    @abstractmethod
    def run(self):
        """
        Run the benchmark client.

        The client is executed as follows:
        1. Connects to listener;
        2. Setup any additional context (Active Message registration, memory buffers
           to reuse, etc.);
        3. Transfer data back-and-forth with server;
        4. Shutdown client;
        5. Put timing results into the queue registered in constructor.
        """
        pass

    def print_backend_specific_config(self):
        """
        Pretty print configuration specific to backend implementation.
        """
        pass
class TornadoClient(BaseClient):
    """Tornado TCP benchmark client; host-memory transfers only."""

    has_cuda_support = False

    def __init__(self, args, queue, server_address, port):
        self.args = args
        self.queue = queue
        self.server_address = server_address
        self.port = port

    async def run(self) -> bool:
        """Connect to the benchmark server and time ping-pong transfers."""
        client = TCPClient()
        # Set max_buffer_size to 1 GiB for now
        stream = await client.connect(
            self.server_address, self.port, max_buffer_size=1024**3
        )

        nbytes = self.args.n_bytes
        outgoing = np.arange(nbytes, dtype="u1")
        assert outgoing.nbytes == nbytes
        if self.args.reuse_alloc:
            incoming = np.zeros(nbytes, dtype="u1")
            assert incoming.nbytes == nbytes

        if self.args.report_gil_contention:
            from gilknocker import KnockKnock

            # Use smallest polling interval possible to ensure, contention will always
            # be zero for small messages otherwise and inconsistent for large messages.
            knocker = KnockKnock(polling_interval_micros=1)
            knocker.start()

        timings = []
        total_iterations = self.args.n_iter + self.args.n_warmup_iter
        for iteration in range(total_iterations):
            t_begin = monotonic()

            if not self.args.reuse_alloc:
                incoming = np.zeros(nbytes, dtype="u1")

            await stream.write(outgoing.data)
            await stream.read_into(incoming.data)

            t_end = monotonic()
            # Warmup iterations are excluded from the reported timings.
            if iteration >= self.args.n_warmup_iter:
                timings.append(t_end - t_begin)

        if self.args.report_gil_contention:
            knocker.stop()

        self.queue.put(timings)
        if self.args.report_gil_contention:
            self.queue.put(knocker.contention_metric)
25 | """ 26 | event_loop = event_loop or get_event_loop() 27 | ret = event_loop.create_future() 28 | # All the comm functions takes the call-back function and its arguments 29 | kwargs["cb_func"] = _cb_func 30 | kwargs["cb_args"] = (event_loop, ret) 31 | req = func(*args, **kwargs) 32 | if req is None and not ret.done(): 33 | ret.set_result(True) 34 | return ret 35 | 36 | 37 | def _am_cb_func(recv_obj, exception, event_loop, future): 38 | if event_loop.is_closed() or future.done(): 39 | return 40 | if exception is not None: 41 | future.set_exception(exception) 42 | else: 43 | future.set_result(recv_obj) 44 | 45 | 46 | def tag_send( 47 | ep: ucx_api.UCXEndpoint, 48 | buffer: arr.Array, 49 | nbytes: int, 50 | tag: int, 51 | name="tag_send", 52 | event_loop=None, 53 | ) -> asyncio.Future: 54 | 55 | return _call_ucx_api( 56 | event_loop, ucx_api.tag_send_nb, ep, buffer, nbytes, tag, name=name 57 | ) 58 | 59 | 60 | def am_send( 61 | ep: ucx_api.UCXEndpoint, 62 | buffer: arr.Array, 63 | nbytes: int, 64 | name="am_send", 65 | event_loop=None, 66 | ) -> asyncio.Future: 67 | 68 | return _call_ucx_api(event_loop, ucx_api.am_send_nbx, ep, buffer, nbytes, name=name) 69 | 70 | 71 | def stream_send( 72 | ep: ucx_api.UCXEndpoint, 73 | buffer: arr.Array, 74 | nbytes: int, 75 | name="stream_send", 76 | event_loop=None, 77 | ) -> asyncio.Future: 78 | 79 | return _call_ucx_api( 80 | event_loop, ucx_api.stream_send_nb, ep, buffer, nbytes, name=name 81 | ) 82 | 83 | 84 | def tag_recv( 85 | obj: Union[ucx_api.UCXEndpoint, ucx_api.UCXWorker], 86 | buffer: arr.Array, 87 | nbytes: int, 88 | tag: int, 89 | name="tag_recv", 90 | event_loop=None, 91 | ) -> asyncio.Future: 92 | 93 | worker = obj if isinstance(obj, ucx_api.UCXWorker) else obj.worker 94 | ep = obj if isinstance(obj, ucx_api.UCXEndpoint) else None 95 | 96 | return _call_ucx_api( 97 | event_loop, 98 | ucx_api.tag_recv_nb, 99 | worker, 100 | buffer, 101 | nbytes, 102 | tag, 103 | name=name, 104 | ep=ep, 105 | ) 106 | 107 | 108 | 
def stream_recv(
    ep: ucx_api.UCXEndpoint,
    buffer: arr.Array,
    nbytes: int,
    name="stream_recv",
    event_loop=None,
) -> asyncio.Future:
    """Schedule a non-blocking stream receive into `buffer` on `ep`.

    Returns a future resolved when the receive completes.
    """
    recv_func = ucx_api.stream_recv_nb
    return _call_ucx_api(event_loop, recv_func, ep, buffer, nbytes, name=name)


def flush_worker(worker: ucx_api.UCXWorker, event_loop=None) -> asyncio.Future:
    """Flush all outstanding operations on `worker`; returns a completion future."""
    return _call_ucx_api(event_loop, worker.flush)


def flush_ep(ep: ucx_api.UCXEndpoint, event_loop=None) -> asyncio.Future:
    """Flush all outstanding operations on endpoint `ep`; returns a completion future."""
    return _call_ucx_api(event_loop, ep.flush)
23 | """ 24 | self.weakref_worker = weakref.ref(worker) 25 | self.event_loop = event_loop 26 | self.asyncio_task = None 27 | 28 | def __del__(self): 29 | if self.asyncio_task is not None: 30 | self.asyncio_task.cancel() 31 | 32 | # Hash and equality is based on the event loop 33 | def __hash__(self): 34 | return hash(self.event_loop) 35 | 36 | def __eq__(self, other): 37 | return hash(self) == hash(other) 38 | 39 | 40 | class NonBlockingMode(ProgressTask): 41 | def __init__(self, worker, event_loop): 42 | super().__init__(worker, event_loop) 43 | self.asyncio_task = event_loop.create_task(self._progress_task()) 44 | 45 | async def _progress_task(self): 46 | """This helper function maintains a UCX progress loop.""" 47 | while True: 48 | worker = self.weakref_worker() 49 | if worker is None or not worker.initialized: 50 | return 51 | worker.progress() 52 | del worker 53 | # Give other co-routines a chance to run. 54 | await asyncio.sleep(0) 55 | 56 | 57 | class BlockingMode(ProgressTask): 58 | def __init__(self, worker, event_loop, epoll_fd): 59 | super().__init__(worker, event_loop) 60 | 61 | # Creating a job that is ready straightaway but with low priority. 62 | # Calling `await self.event_loop.sock_recv(self.rsock, 1)` will 63 | # return when all non-IO tasks are finished. 64 | # See . 
65 | self.rsock, wsock = socket.socketpair() 66 | self.rsock.setblocking(0) 67 | wsock.setblocking(0) 68 | wsock.close() 69 | 70 | # Bind an asyncio reader to a UCX epoll file descripter 71 | event_loop.add_reader(epoll_fd, self._fd_reader_callback) 72 | 73 | # Remove the reader and close socket on finalization 74 | weakref.finalize(self, event_loop.remove_reader, epoll_fd) 75 | weakref.finalize(self, self.rsock.close) 76 | 77 | def _fd_reader_callback(self): 78 | worker = self.weakref_worker() 79 | if worker is None or not worker.initialized: 80 | return 81 | worker.progress() 82 | 83 | # Notice, we can safely overwrite `self.dangling_arm_task` 84 | # since previous arm task is finished by now. 85 | assert self.asyncio_task is None or self.asyncio_task.done() 86 | self.asyncio_task = self.event_loop.create_task(self._arm_worker()) 87 | 88 | async def _arm_worker(self): 89 | # When arming the worker, the following must be true: 90 | # - No more progress in UCX (see doc of ucp_worker_arm()) 91 | # - All asyncio tasks that isn't waiting on UCX must be executed 92 | # so that the asyncio's next state is epoll wait. 93 | # See 94 | while True: 95 | worker = self.weakref_worker() 96 | if worker is None or not worker.initialized: 97 | return 98 | worker.progress() 99 | 100 | # Cancel inflight messages that couldn't be completed. This may 101 | # happen if the user called ep.recv() but the remote worker 102 | # errored before sending the message. 103 | if worker.cancel_inflight_messages() > 0: 104 | worker.progress() 105 | 106 | del worker 107 | 108 | # This IO task returns when all non-IO tasks are finished. 109 | # Notice, we do NOT hold a reference to `worker` while waiting. 110 | await self.event_loop.sock_recv(self.rsock, 1) 111 | 112 | worker = self.weakref_worker() 113 | if worker is None or not worker.initialized: 114 | return 115 | if worker.arm(): 116 | # At this point we know that asyncio's next state is 117 | # epoll wait. 
def get_event_loop():
    """
    Get running or create new event loop

    In Python 3.10, the behavior of `get_event_loop()` is deprecated and in
    the future it will be an alias of `get_running_loop()`. In several
    situations, UCX-Py needs to create a new event loop, so this function
    will remain for now as an alternative to the behavior of `get_event_loop()`
    from Python < 3.10, returning the `get_running_loop()` if an event loop
    exists, or returning a new one with `new_event_loop()` otherwise.
    """
    try:
        return asyncio.get_running_loop()
    except RuntimeError:
        # No running loop in this thread; hand back a fresh one.
        return asyncio.new_event_loop()


def get_ucxpy_logger():
    """
    Get UCX-Py logger with custom formatting

    Returns
    -------
    logger : logging.Logger
        Logger object

    Examples
    --------
    >>> logger = get_ucxpy_logger()
    >>> logger.warning("Test")
    [1585175070.2911468] [dgx12:1054] UCXPY WARNING Test
    """

    # `getLevelName` maps a level name (e.g. "WARNING") to its numeric value.
    _level_enum = logging.getLevelName(os.getenv("UCXPY_LOG_LEVEL", "WARNING"))
    logger = logging.getLogger("ucx")

    # Avoid duplicate logging
    logger.propagate = False

    class LoggingFilter(logging.Filter):
        def filter(self, record):
            record.hostname = socket.gethostname()
            record.timestamp = str("%.6f" % time.time())
            return True

    # Attach the handler only once; calling this function repeatedly would
    # otherwise stack handlers and emit every record multiple times.
    if not logger.handlers:
        formatter = logging.Formatter(
            "[%(timestamp)s] [%(hostname)s:%(process)d] UCXPY %(levelname)s %(message)s"
        )

        handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        handler.addFilter(LoggingFilter())
        logger.addHandler(handler)

    logger.setLevel(_level_enum)

    return logger


def hash64bits(*args):
    """64 bit unsigned hash of `args`"""
    # 64 bits hexdigest
    h = hashlib.sha1(bytes(repr(args), "utf-8")).hexdigest()[:16]
    # Convert to an integer and return
    return int(h, 16)


def hmean(a):
    """Harmonic mean of `a`.

    Parameters
    ----------
    a : array_like
        Numbers to average; accepts any array-like (list, tuple, ndarray),
        not only an ndarray. Elements should be non-zero, otherwise the
        result follows NumPy's division-by-zero semantics.

    Returns
    -------
    float
        The harmonic mean, or 0 for empty input.
    """
    a = np.asarray(a)
    if a.size:
        return 1 / np.mean(1 / a)
    else:
        return 0