├── perftest ├── host │ ├── pt-to-pt │ │ ├── stream_latency.args │ │ ├── bw.args │ │ ├── latency.args │ │ └── CMakeLists.txt │ ├── init │ │ └── CMakeLists.txt │ ├── coll │ │ ├── barrier_on_stream.args │ │ ├── sync_on_stream.args │ │ ├── alltoall_on_stream.args │ │ ├── broadcast_on_stream.args │ │ ├── fcollect_on_stream.args │ │ ├── reduction_on_stream.args │ │ ├── reducescatter_on_stream.args │ │ ├── CMakeLists.txt │ │ ├── sync_on_stream.cpp │ │ ├── sync_all_on_stream.cpp │ │ └── barrier_on_stream.cpp │ └── CMakeLists.txt ├── device │ ├── coll │ │ ├── sync_latency.args │ │ ├── barrier_latency.args │ │ ├── bcast_latency.args │ │ ├── alltoall_latency.args │ │ ├── fcollect_latency.args │ │ ├── reduction_latency.args │ │ ├── reducescatter_latency.args │ │ └── CMakeLists.txt │ ├── pt-to-pt │ │ ├── shmem_atomic_latency.args │ │ ├── shmem_g_latency.args │ │ ├── shmem_p_latency.args │ │ ├── shmem_signal_ping_pong_latency.args │ │ ├── shmem_st_bw.args │ │ ├── shmem_get_latency.args │ │ ├── shmem_put_latency.args │ │ ├── shmem_put_ping_pong_latency.args │ │ ├── shmem_p_ping_pong_latency.args │ │ ├── shmem_put_signal_ping_pong_latency.args │ │ ├── shmem_atomic_bw.args │ │ ├── shmem_get_bw.args │ │ ├── shmem_p_bw.args │ │ ├── shmem_g_bw.args │ │ ├── shmem_put_bw.args │ │ ├── shmem_atomic_ping_pong_latency.args │ │ └── CMakeLists.txt │ ├── tile │ │ └── CMakeLists.txt │ └── CMakeLists.txt ├── perftest-mmap-sanity.list ├── perftest-p2p-cudagraph.list ├── perfTestRunnerSlurm.py ├── perftest-mmap-full.list ├── perftest-p2p-pcie.list ├── perftest-ib.list ├── perftest-p2p-nvlink.list ├── README.md └── common │ └── CMakeLists.txt ├── nvshmem4py ├── nvshmem │ ├── bindings │ │ ├── _internal │ │ │ └── __init__.py │ │ ├── device │ │ │ ├── numba │ │ │ │ ├── entry_point.h │ │ │ │ ├── _numbast.py │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── core │ │ ├── device │ │ │ ├── __init__.py │ │ │ └── numba │ │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── 
_internal_tracking.py │ ├── __init__.py │ └── version.py ├── build_assets │ └── numbast │ │ ├── numbast_entry_point.h │ │ └── templates │ │ └── config_nvshmem.yml.j2 ├── requirements_cuda12.txt ├── requirements_cuda13.txt ├── requirements_optional_cuda13.txt ├── requirements_build.txt ├── MANIFEST.in ├── test │ ├── device │ │ └── numba │ │ │ ├── conftest.py │ │ │ ├── utils.py │ │ │ ├── test_device_sync.py │ │ │ └── test_device_barrier.py │ ├── wheel_sanity_test.py │ ├── test_npe.py │ ├── test_get_version.py │ ├── test_ring.py │ ├── test_collective.py │ ├── test_highlevel_bindings.py │ └── utils.py ├── requirements_optional_cuda12.txt ├── perftest │ ├── alltoall_on_stream.py │ ├── fcollect_on_stream.py │ ├── reduction_on_stream.py │ ├── broadcast_on_stream.py │ └── reducescatter_on_stream.py ├── scripts │ └── find_python_versions.sh ├── README.md ├── examples │ └── simple_p2p_kernel.py └── setup.py ├── test ├── apps │ ├── dgl │ │ └── CMakeLists.txt │ ├── cufft │ │ └── CMakeLists.txt │ ├── interop │ │ └── CMakeLists.txt │ └── CMakeLists.txt ├── host │ ├── pt-to-pt │ │ ├── fence.args │ │ ├── quiet.args │ │ ├── CMakeLists.txt │ │ ├── quiet_on_stream.cu │ │ ├── quiet.cpp │ │ └── fence.cpp │ ├── mem │ │ ├── mmap_unmap_loop.args │ │ ├── CMakeLists.txt │ │ ├── calloc.cpp │ │ ├── malloc_simple.cpp │ │ └── malloc_loop.cpp │ ├── coll │ │ ├── collective_launch_choose_grid.args │ │ ├── collective_launch_user_specified_grid.args │ │ └── CMakeLists.txt │ ├── interop │ │ ├── simplelib1.sym │ │ ├── simplelib2.sym │ │ ├── simplelib1.h │ │ ├── simplelib2.h │ │ ├── simplelib1.cu │ │ ├── simplelib2.cu │ │ ├── app.cu │ │ └── CMakeLists.txt │ ├── team │ │ ├── CMakeLists.txt │ │ └── shmem_team_reuse_teams.cpp │ ├── init │ │ ├── kernel_nvshmem.cu │ │ ├── CMakeLists.txt │ │ ├── static_init.cpp │ │ ├── global_exit.cpp │ │ ├── mpi_init.cpp │ │ ├── nvshmemx_init_status.cpp │ │ ├── shmem_init.cpp │ │ ├── init_loop.cpp │ │ └── uid_init.cpp │ └── CMakeLists.txt ├── device │ ├── init │ │ ├── 
CMakeLists.txt │ │ └── global_exit.cu │ ├── query │ │ ├── CMakeLists.txt │ │ ├── hello.cu │ │ └── hello-team.cu │ ├── sync │ │ ├── CMakeLists.txt │ │ ├── sync_test.cu │ │ └── wait_until_all.cu │ ├── tile │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── coll │ │ └── coll_test.h ├── unit │ ├── CMakeLists.txt │ ├── mem │ │ ├── CMakeLists.txt │ │ ├── heap │ │ │ ├── internal │ │ │ │ └── host │ │ │ │ │ └── nvshmem_nvtx.hpp │ │ │ └── CMakeLists.txt │ │ └── transport │ │ │ └── CMakeLists.txt │ └── host │ │ ├── bootstrap │ │ └── CMakeLists.txt │ │ └── CMakeLists.txt ├── common │ ├── test_teams.h │ ├── data_check.h │ ├── ring_alltoall.h │ └── test-simple-pmi │ │ └── test_simple_pmiutil.h └── README.md ├── .github └── ISSUE_TEMPLATE │ ├── config.yml │ ├── QUESTION.yaml │ └── RFE.yaml ├── nvshmem_transport.sym ├── .gitattributes ├── src ├── modules │ ├── transport │ │ ├── ibgda │ │ │ └── CMakeLists.txt.in │ │ ├── ibrc │ │ │ └── CMakeLists.txt.in │ │ ├── ibdevx │ │ │ ├── CMakeLists.txt.in │ │ │ └── ibdevx.h │ │ ├── ucx │ │ │ └── CMakeLists.txt.in │ │ ├── libfabric │ │ │ └── CMakeLists.txt.in │ │ └── common │ │ │ ├── transport_mlx5_common.h │ │ │ ├── transport_gdr_common.h │ │ │ ├── CMakeLists.txt.in │ │ │ └── mlx5_prm.h │ └── bootstrap │ │ ├── mpi │ │ └── CMakeLists.txt.in │ │ ├── uid │ │ ├── CMakeLists.txt.in │ │ └── ncclSocket │ │ │ ├── commit_info.txt │ │ │ ├── ncclsocket_param.h │ │ │ ├── ncclsocket_nccl.h │ │ │ ├── ncclsocket_debug.h │ │ │ └── ncclsocket_utils.h │ │ ├── pmix │ │ └── CMakeLists.txt.in │ │ ├── shmem │ │ └── CMakeLists.txt.in │ │ ├── common │ │ ├── CMakeLists.txt.in │ │ └── env_defs.h │ │ └── pmi │ │ ├── CMakeLists.txt.in │ │ ├── simple-pmi │ │ └── simple_pmiutil.h │ │ └── pmi-2 │ │ └── COPYRIGHT ├── include │ ├── nvshmem_host.h │ ├── internal │ │ ├── host │ │ │ ├── shared_memory.h │ │ │ ├── sockets.h │ │ │ ├── nvshmemi_coll.h │ │ │ ├── error_codes_internal.h │ │ │ ├── nvshmemi_bootstrap_library.h │ │ │ ├── custom_malloc.h │ │ │ └── cuda_interface_sync.h │ │ 
├── host_transport │ │ │ └── nvshmemi_transport_defines.h │ │ ├── common │ │ │ └── error_codes_internal.h │ │ ├── device │ │ │ └── nvshmemi_device.h │ │ ├── bootstrap_host_transport │ │ │ └── nvshmemi_bootstrap_defines.h │ │ └── bootstrap_host │ │ │ └── nvshmemi_bootstrap.h │ ├── non_abi │ │ ├── device │ │ │ └── coll │ │ │ │ └── defines.cuh │ │ ├── nvshmem_version.h.in │ │ └── nvshmem_build_options.h.in │ ├── host │ │ └── nvshmem_macros.h │ ├── device │ │ └── nvshmemx_collective_launch_apis.h │ ├── nvshmem.h │ ├── nvshmemx.h │ ├── bootstrap_device_host │ │ └── nvshmem_uniqueid.h │ └── device_host_transport │ │ └── nvshmem_common_transport.h └── host │ ├── stream │ ├── coll │ │ ├── rdxn │ │ │ ├── reduce_and.cu │ │ │ ├── reduce_or.cu │ │ │ ├── reduce_xor.cu │ │ │ ├── reduce_max.cu │ │ │ ├── reduce_min.cu │ │ │ ├── reduce_sum.cu │ │ │ ├── reduce_prod.cu │ │ │ └── reduce_team.cu │ │ └── reducescatter │ │ │ ├── reducescatter_or.cu │ │ │ ├── reducescatter_and.cu │ │ │ ├── reducescatter_xor.cu │ │ │ ├── reducescatter_max.cu │ │ │ ├── reducescatter_min.cu │ │ │ ├── reducescatter_sum.cu │ │ │ └── reducescatter_prod.cu │ └── comm │ │ └── quiet_on_stream.cu │ ├── coll │ ├── fcollect │ │ ├── fcollect_on_stream.h │ │ └── fcollect_on_stream.cpp │ ├── barrier │ │ └── barrier.h │ ├── rdxn │ │ └── rdxn.h │ ├── reducescatter │ │ └── reducescatter.h │ ├── alltoall │ │ └── alltoall_on_stream.cpp │ └── broadcast │ │ └── broadcast_on_stream.cpp │ ├── topo │ └── topo.h │ ├── transport │ └── p2p │ │ └── p2p.h │ ├── util │ └── cs.cpp │ ├── comm │ ├── rma.cu │ └── fence.cpp │ ├── mem │ └── dlmalloc.h │ └── init │ └── query_host.cpp ├── nvshmem_host.sym ├── nvshmem_bootstrap.sym ├── scripts ├── bitcode_lib_cleanup.sh └── install_hydra.sh ├── pkg └── nvshmem_package_description.txt ├── .pre-commit-config.yaml ├── Compatibility.md ├── examples ├── shmem-based-init.cu ├── hello.cpp ├── gemm_allreduce │ └── nvshmemAlloc.hpp └── dev-guide-ring.cu ├── README.md └── .gitignore 
/perftest/host/pt-to-pt/stream_latency.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/_internal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /perftest/device/coll/sync_latency.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_latency.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /perftest/device/coll/barrier_latency.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -------------------------------------------------------------------------------- /test/apps/dgl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(dgl.cu) 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /perftest/device/coll/bcast_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /test/host/pt-to-pt/fence.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -n 100 3 | 
-------------------------------------------------------------------------------- /test/host/pt-to-pt/quiet.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -n 100 3 | -------------------------------------------------------------------------------- /perftest/device/coll/alltoall_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /perftest/device/coll/fcollect_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /perftest/device/coll/reduction_latency.args: -------------------------------------------------------------------------------- 1 | -n 50 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /test/device/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(global_exit.cu) 2 | -------------------------------------------------------------------------------- /test/host/mem/mmap_unmap_loop.args: -------------------------------------------------------------------------------- 1 | 2 | -i 3 -b 1G -e 2G -r 1 3 | -------------------------------------------------------------------------------- /perftest/device/coll/reducescatter_latency.args: -------------------------------------------------------------------------------- 1 | -n 50 -w 10 -b 1 -e 4k -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_g_latency.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -t 512 -e 64K -------------------------------------------------------------------------------- 
/perftest/device/pt-to-pt/shmem_p_latency.args: -------------------------------------------------------------------------------- 1 | -t 512 -e 64K -n 50 -w 5 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_signal_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -n 500 -w 50 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_st_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -t 1024 -c 4 -e 32M -------------------------------------------------------------------------------- /perftest/host/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(malloc.cpp) 2 | -------------------------------------------------------------------------------- /test/apps/cufft/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(cufft_smoke_test.cu) 2 | -------------------------------------------------------------------------------- /test/unit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(host) 2 | add_subdirectory(mem) -------------------------------------------------------------------------------- /nvshmem_transport.sym: -------------------------------------------------------------------------------- 1 | { 2 | global: nvshmemt_init; 3 | local: *; 4 | }; -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_get_latency.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -t 1024 -e 64K -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_latency.args: 
-------------------------------------------------------------------------------- 1 | -e 64K -t 1024 -n 200 -w 20 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -e 1M -n 500 -w 50 -------------------------------------------------------------------------------- /test/apps/interop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(nccl_nvshmem_interop.cu) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_p_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -t 512 -e 16K -n 500 -w 50 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_signal_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -e 1M -n 500 -w 50 -------------------------------------------------------------------------------- /test/host/coll/collective_launch_choose_grid.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -t 32 3 | -------------------------------------------------------------------------------- /test/unit/mem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(transport) 2 | add_subdirectory(heap) -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -c 4 -e 65536 -t 1024 -a inc -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_get_bw.args: 
-------------------------------------------------------------------------------- 1 | -n 200 -w 20 -b 1024 -e 32M -c 4 -t 1024 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_p_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -t 1024 -c 4 -b 1024 -e 64K -s 1 -------------------------------------------------------------------------------- /perftest/host/coll/barrier_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 2 | -n 1000 -w 10 --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/sync_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 2 | -n 1000 -w 10 --cudagraph 3 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/bw.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 4 -e 128M --dir write --issue host 2 | -------------------------------------------------------------------------------- /test/unit/host/bootstrap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_unit_test(bootstrap_coll.cpp) 2 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/latency.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -b 1 -e 128M --dir write --issue host 2 | -------------------------------------------------------------------------------- /test/host/coll/collective_launch_user_specified_grid.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -c 1 -t 32 3 | 
-------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_g_bw.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -t 1024 -c 8 -b 1024 -e 65536 -d double 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | nvshmem4py/nvshmem/bindings/device/numba/_numbast.py filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /perftest/host/coll/alltoall_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/broadcast_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/fcollect_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -b 1 -e 4M 2 | -n 1000 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/reduction_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/reducescatter_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4k 2 | -n 100 -w 10 -b 1 -e 4k --cudagraph 
-------------------------------------------------------------------------------- /src/modules/transport/ibgda/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibgda ibgda.cpp ON ON ON ON) 2 | -------------------------------------------------------------------------------- /src/modules/transport/ibrc/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibrc ibrc.cpp ON ON ON OFF) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_bw.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -c 4 -t 1024 -e 32M 2 | --bidir -n 200 -w 20 -c 4 -t 1024 -e 32M -------------------------------------------------------------------------------- /perftest/perftest-mmap-sanity.list: -------------------------------------------------------------------------------- 1 | /host/coll/broadcast_on_stream 2 | /host/pt-to-pt/bw 3 | /device/pt-to-pt/shmem_put_bw -------------------------------------------------------------------------------- /src/modules/transport/ibdevx/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibdevx ibdevx.cpp ON OFF ON ON) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | add 2 | and 3 | compare_swap 4 | inc 5 | or 6 | set 7 | swap 8 | xor -------------------------------------------------------------------------------- /nvshmem_host.sym: -------------------------------------------------------------------------------- 1 | NVSHMEM { 2 | global: nvshmem_*; 3 | nvshmemid_*; 4 | nvshmemx_*; 5 | 
local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /nvshmem4py/build_assets/numbast/numbast_entry_point.h: -------------------------------------------------------------------------------- 1 | #define __NVSHMEM_NUMBA_SUPPORT__ 2 | #include 3 | #include 4 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/entry_point.h: -------------------------------------------------------------------------------- 1 | #define __NVSHMEM_NUMBA_SUPPORT__ 2 | #include 3 | #include 4 | -------------------------------------------------------------------------------- /perftest/device/tile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(tile_allreduce_latency.cu) 2 | nvshmem_add_perftest(tile_allgather_latency.cu) 3 | -------------------------------------------------------------------------------- /nvshmem_bootstrap.sym: -------------------------------------------------------------------------------- 1 | { 2 | global: nvshmemi_bootstrap_plugin_init; 3 | nvshmemi_bootstrap_plugin_pre_init; 4 | local: *; 5 | }; 6 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(bw.cpp) 2 | nvshmem_add_perftest(latency.cpp) 3 | nvshmem_add_perftest(stream_latency.cu) 4 | -------------------------------------------------------------------------------- /test/host/interop/simplelib1.sym: -------------------------------------------------------------------------------- 1 | CUFFT { 2 | global: simplelib1_init; 3 | simplelib1_finalize; 4 | simplelib1_dowork; 5 | local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /test/host/interop/simplelib2.sym: 
-------------------------------------------------------------------------------- 1 | CUFFT { 2 | global: simplelib2_init; 3 | simplelib2_finalize; 4 | simplelib2_dowork; 5 | local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /test/unit/mem/heap/internal/host/nvshmem_nvtx.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _NVSHMEM_NVTX_HPP_ 2 | #define _NVSHMEM_NVTX_HPP_ 3 | 4 | #define NVTX_FUNC_RANGE_IN_GROUP(G) ; 5 | 6 | #endif -------------------------------------------------------------------------------- /src/modules/transport/ucx/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ucx ucx.cpp OFF ON OFF OFF) 2 | target_link_libraries(nvshmem_transport_ucx PRIVATE ucx::ucs ucx::ucp) 3 | -------------------------------------------------------------------------------- /test/device/query/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(hello.cu) 2 | nvshmem_add_test(hello-team.cu) 3 | nvshmem_add_test(info.cu) 4 | nvshmem_add_test(ptr.cu) 5 | nvshmem_add_test(mc_ptr.cu) 6 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/_numbast.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b4a5b3944fdd88661742e667444c2cd82fcc2cddb20409dbfd718b4fd1ddfc4a 3 | size 5738544 4 | -------------------------------------------------------------------------------- /scripts/bitcode_lib_cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | myVar="$(cat $1 | grep -E '!([0-9]+) = !\{[^"]+"nvvm-reflect-ftz"' | cut -d ' ' -f 1)" 4 | awk '!/nvvm-reflect-ftz/' $1 | sed "/^\!llvm\.module\.flags = 
/s/$myVar, //" > $2 -------------------------------------------------------------------------------- /test/unit/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_UNIT_TEST_PREFIX "host_") 2 | macro(nvshmem_add_unit_test SOURCE) 3 | nvshmem_add_unit_test_prefix(${SOURCE} ${NVSHMEM_UNIT_TEST_PREFIX}) 4 | endmacro() 5 | 6 | add_subdirectory(bootstrap) 7 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_cuda12.txt: -------------------------------------------------------------------------------- 1 | # These are the run-time dependencies of nvshmem4py 2 | # They are non-negotiables 3 | nvidia-nvshmem-cu12 4 | cuda-python>=12.0,<=12.9 5 | cuda.core==0.4 6 | cuda.pathfinder==1.2.3 7 | numpy 8 | Cython>=0.29.24 9 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_cuda13.txt: -------------------------------------------------------------------------------- 1 | # These are the run-time dependencies of nvshmem4py 2 | # They are non-negotiables 3 | nvidia-nvshmem-cu13 4 | cuda-python>=13.0,<14.0 5 | cuda.core==0.4 6 | cuda.pathfinder==1.2.3 7 | numpy 8 | Cython>=0.29.24 9 | -------------------------------------------------------------------------------- /src/modules/bootstrap/mpi/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_mpi.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_mpi ${SOURCE_LIST}) 4 | 5 | find_package(MPI REQUIRED) 6 | 7 | target_link_libraries(nvshmem_bootstrap_mpi PRIVATE MPI::MPI_C) 8 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_uid.cpp ncclSocket/ncclsocket_socket.cpp) 2 | 3 | 
nvshmem_add_bootstrap(nvshmem_bootstrap_uid ${SOURCE_LIST}) 4 | 5 | target_include_directories(nvshmem_bootstrap_uid ncclSocket) 6 | -------------------------------------------------------------------------------- /test/host/interop/simplelib1.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "nvshmem.h" 3 | #include "cuda.h" 4 | #include "cuda_runtime.h" 5 | 6 | extern "C" { 7 | void simplelib1_init(); 8 | int simplelib1_dowork(); 9 | void simplelib1_finalize(); 10 | } 11 | -------------------------------------------------------------------------------- /test/host/interop/simplelib2.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "nvshmem.h" 3 | #include "cuda.h" 4 | #include "cuda_runtime.h" 5 | 6 | extern "C" { 7 | void simplelib2_init(); 8 | int simplelib2_dowork(); 9 | void simplelib2_finalize(); 10 | } 11 | -------------------------------------------------------------------------------- /test/host/team/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(shmem_team_max.cpp) 2 | nvshmem_add_test(shmem_team_dup.cpp) 3 | nvshmem_add_test(shmem_team_reuse_teams.cpp) 4 | nvshmem_add_test(shmem_team_split_2d.cpp) 5 | nvshmem_add_test(shmem_team_translate_2.cpp) 6 | -------------------------------------------------------------------------------- /perftest/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "perf_host_") 2 | macro(nvshmem_add_perftest SOURCE) 3 | nvshmem_add_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 4 | endmacro() 5 | add_subdirectory(coll) 6 | add_subdirectory(init) 7 | add_subdirectory(pt-to-pt) 8 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/commit_info.txt: 
-------------------------------------------------------------------------------- 1 | commit 68c00a0000fd021070b61edc14b7547325a1c6ff (HEAD -> master, tag: v2.19.4-1, origin/stable, origin/master, origin/HEAD) 2 | Author: Sylvain Jeaugey 3 | Date: Wed Nov 8 18:49:01 2023 -0800 4 | 5 | NCCL 2.19.4-1 -------------------------------------------------------------------------------- /src/modules/transport/libfabric/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_libfabric libfabric.cpp ON ON OFF OFF) 2 | target_include_directories(nvshmem_transport_libfabric PRIVATE ${LIBFABRIC_HOME}/include) 3 | target_link_libraries(nvshmem_transport_libfabric PRIVATE ${FABRIC_lib}) 4 | -------------------------------------------------------------------------------- /perftest/perftest-p2p-cudagraph.list: -------------------------------------------------------------------------------- 1 | /host/coll/barrier_all_on_stream 2 | /host/coll/barrier_on_stream 3 | /host/coll/sync_all_on_stream 4 | /host/coll/sync_on_stream 5 | /host/coll/alltoall_on_stream 6 | /host/coll/broadcast_on_stream 7 | /host/coll/fcollect_on_stream 8 | /host/coll/reduction_on_stream 9 | -------------------------------------------------------------------------------- /src/include/nvshmem_host.h: -------------------------------------------------------------------------------- 1 | /** 2 | * NVSHmem Host Include 3 | * This file exists so that we can cleanly include only the nvshmem host library headers 4 | */ 5 | 6 | #ifndef NVSHMEM_HOST_H 7 | #define NVSHMEM_HOST_H 8 | #include "host/nvshmem_api.h" 9 | #include "host/nvshmemx_api.h" 10 | #endif 11 | -------------------------------------------------------------------------------- /test/apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "apps_") 2 | macro(nvshmem_add_test SOURCE) 3 | 
nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | add_subdirectory(cufft) 6 | add_subdirectory(dgl) 7 | if(NVSHMEM_USE_NCCL) 8 | add_subdirectory(interop) 9 | endif() 10 | -------------------------------------------------------------------------------- /test/unit/mem/transport/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(GTest_FOUND) 2 | nvshmem_add_gtest_unit_test_prefix(P2P_unit_tests.cpp "transport_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_transport.cpp") 3 | nvshmem_add_gtest_unit_test_prefix(remote_unit_tests.cpp "transport_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_transport.cpp") 4 | endif() -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_and.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, AND) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_or.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, OR) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_xor.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, XOR) 11 | -------------------------------------------------------------------------------- /src/host/coll/fcollect/fcollect_on_stream.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H 8 | #define NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H 9 | #include "fcollect_common.h" 10 | 11 | #endif /* NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H */ 12 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmix/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_pmix.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmix ${SOURCE_LIST}) 4 | 5 | find_library(PMIX_lib NAMES pmix HINTS "${PMIX_HOME}/lib") 6 | 7 | target_link_libraries(nvshmem_bootstrap_pmix PRIVATE ${PMIX_lib}) 8 | target_include_directories(nvshmem_bootstrap_pmix PRIVATE ${PMIX_HOME}/include) 9 | -------------------------------------------------------------------------------- /test/host/init/kernel_nvshmem.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void kernel_nvshmem(int* destination) { 5 | int mype = nvshmem_my_pe(); 6 | int npes = nvshmem_n_pes(); 7 | assert(npes > 0); 8 | int peer = (mype + 1) % npes; 9 | nvshmem_int_p(destination, 3 * peer + 14, peer); 10 | nvshmem_barrier_all(); 11 | } 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_or.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, OR) 11 | -------------------------------------------------------------------------------- /test/host/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(alltoall.cu) 2 | nvshmem_add_test(barrier.cu) 3 | nvshmem_add_test(barrier_all.cu) 4 | nvshmem_add_test(broadcast.cu) 5 | nvshmem_add_test(fcollect.cu) 6 | nvshmem_add_test(collective_launch_choose_grid.cu) 7 | nvshmem_add_test(collective_launch_user_specified_grid.cu) 8 | nvshmem_add_test(reduce.cu) 9 | nvshmem_add_test(reducescatter.cu) 10 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_and.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, AND) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_xor.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, XOR) 11 | -------------------------------------------------------------------------------- /pkg/nvshmem_package_description.txt: -------------------------------------------------------------------------------- 1 | 2 | NVSHMEM is a parallel programming interface based on OpenSHMEM 3 | that provides efficient and scalable communication for NVIDIA GPU 4 | clusters. NVSHMEM creates a global address space for data that spans 5 | the memory of multiple GPUs and can be accessed with fine-grained 6 | GPU-initiated operations, CPU-initiated operations, and operations 7 | on CUDA(R) streams. 8 | -------------------------------------------------------------------------------- /test/device/sync/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(sync_test.cu) 2 | nvshmem_add_test(test_all_any_some.cu) 3 | nvshmem_add_test(test_any.cu) 4 | nvshmem_add_test(test_some.cu) 5 | nvshmem_add_test(test_vector.cu) 6 | nvshmem_add_test(wait_until.cu) 7 | nvshmem_add_test(wait_until_all.cu) 8 | nvshmem_add_test(wait_until_any.cu) 9 | nvshmem_add_test(wait_until_some.cu) 10 | nvshmem_add_test(wait_until_vector.cu) 11 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_optional_cuda13.txt: -------------------------------------------------------------------------------- 1 | # These are optional dependencies 2 | # nvshmem4py is aware of and interoperable with them 3 | # but doesn't depend on them 4 | mpi4py 5 | --index-url https://download.pytorch.org/whl/cu130 6 | --extra-index-url https://pypi.org/simple 7 | torch==2.9.0 8 | --index-url https://pypi.org/simple 9 | ml-dtypes 10 | # These are cu13 packages 11 | 
nvidia-cuda-nvcc 12 | nvidia-nvjitlink -------------------------------------------------------------------------------- /test/host/mem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(align_bins.cpp) 2 | nvshmem_add_test(align_free_reorder.cpp) 3 | nvshmem_add_test(calloc.cpp) 4 | nvshmem_add_test(malloc_bins.cpp) 5 | nvshmem_add_test(malloc_free_loop.cpp) 6 | nvshmem_add_test(mmap_unmap_loop.cu) 7 | nvshmem_add_test(malloc_free_reorder.cpp) 8 | nvshmem_add_test(malloc_loop.cpp) 9 | nvshmem_add_test(malloc_simple.cpp) 10 | nvshmem_add_test(register_buffer.cu) 11 | -------------------------------------------------------------------------------- /test/unit/mem/heap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(GTest_FOUND) 2 | nvshmem_add_gtest_unit_test_prefix(nvshmemi_symmetric_heap_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 3 | nvshmem_add_gtest_unit_test_prefix(static_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 4 | nvshmem_add_gtest_unit_test_prefix(dynamic_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 5 | endif() -------------------------------------------------------------------------------- /test/device/tile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(tile_put.cu) 2 | nvshmem_add_test(tile_get.cu) 3 | nvshmem_add_test(tile_bcast_pred.cu) 4 | nvshmem_add_test(tile_reduce.cu) 5 | nvshmem_add_test(tile_allreduce.cu) 6 | nvshmem_add_test(tile_allreduce_pred.cu) 7 | nvshmem_add_test(tile_allgather.cu) 8 | nvshmem_add_test(tile_allgather_pred.cu) 9 | nvshmem_add_test(tile_allreduce_1D.cu) 10 | nvshmem_add_test(tile_allgather_1D.cu) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_max.cu: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MAX) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MAX) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_min.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MIN) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MIN) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_sum.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, SUM) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, SUM) 12 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_build.txt: -------------------------------------------------------------------------------- 1 | # These are the build-time python/pip requirements for nvshmem4py 2 | # They are mostly required to run Cybind and build wheels 3 | networkx 4 | numpy 5 | pycparser 6 | build 7 | Cython>=0.29.24 8 | setuptools==68 9 | setuptools_scm 10 | testresources 11 | wheel 12 | auditwheel 13 | patchelf 14 | # The following are Numbast requirements 15 | pybind11 16 | pyyaml 17 | click 18 | scikit-build-core 19 | ninja 20 | 21 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_prod.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, PROD) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, PROD) 12 | -------------------------------------------------------------------------------- /src/include/internal/host/shared_memory.h: -------------------------------------------------------------------------------- 1 | #ifndef SHARED_MEMORY_H 2 | #define SHARED_MEMORY_H 3 | 4 | #include "internal/host/nvshmem_internal.h" // for nvshmemi_shared_memory_info 5 | 6 | int shared_memory_create(const char *name, size_t sz, nvshmemi_shared_memory_info *info); 7 | int shared_memory_open(const char *name, size_t sz, nvshmemi_shared_memory_info *info); 8 | void shared_memory_close(char *shm_name, nvshmemi_shared_memory_info *info); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /perftest/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "perf_device_") 2 | 3 | macro(nvshmem_add_perftest SOURCE) 4 | nvshmem_add_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 5 | endmacro() 6 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 7 | macro(nvshmem_add_cubin_perftest SOURCE) 8 | nvshmem_add_cubin_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 9 | endmacro() 10 | endif() 11 | add_subdirectory(tile) 12 | add_subdirectory(coll) 13 | add_subdirectory(pt-to-pt) 14 | -------------------------------------------------------------------------------- /perftest/host/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(alltoall_on_stream.cpp) 2 | nvshmem_add_perftest(barrier_all_on_stream.cpp) 3 | nvshmem_add_perftest(barrier_on_stream.cpp) 4 | 
nvshmem_add_perftest(broadcast_on_stream.cpp) 5 | nvshmem_add_perftest(fcollect_on_stream.cpp) 6 | nvshmem_add_perftest(reduction_on_stream.cpp) 7 | nvshmem_add_perftest(reducescatter_on_stream.cpp) 8 | nvshmem_add_perftest(sync_all_on_stream.cpp) 9 | nvshmem_add_perftest(sync_on_stream.cpp) 10 | -------------------------------------------------------------------------------- /perftest/perfTestRunnerSlurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from subprocess import check_call,CalledProcessError #2.6 3 | 4 | check_call(["sbatch", "-N", "1", "--qos=short", "-p", "dgx-1p", "perfTestRunner.py"]) 5 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "dgx-1v", "perfTestRunner.py"]) 6 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "hsw_p100", "perfTestRunner.py"]) 7 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "hsw_v100", "perfTestRunner.py"]) 8 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_max.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MAX) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MAX) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_min.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MIN) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MIN) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_sum.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, SUM) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, SUM) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_prod.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, PROD) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, PROD) 12 | -------------------------------------------------------------------------------- /test/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "host_") 2 | macro(nvshmem_add_test SOURCE) 3 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | 6 | macro(nvshmem_add_test_no_device SOURCE) 7 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} OFF) 8 | endmacro() 9 | 10 | add_subdirectory(coll) 11 | add_subdirectory(init) 12 | add_subdirectory(interop) 13 | add_subdirectory(mem) 14 | add_subdirectory(pt-to-pt) 15 | add_subdirectory(team) 16 | -------------------------------------------------------------------------------- /nvshmem4py/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include setup.py 3 | include README.md 4 | include License.txt 5 | include requirements_cuda11.txt 6 | include requirements_cuda12.txt 7 | include requirements_cuda13.txt 8 | include nvshmem/bindings/_internal/*.pyx 9 | include nvshmem/bindings/_internal/*.pxd 10 | include nvshmem/bindings/*.pxd 11 | include nvshmem/bindings/*.pyx 12 | include nvshmem/bindings/_internal/*.cpp 13 | include nvshmem/core/*.py 14 | recursive-include nvshmem * 15 | -------------------------------------------------------------------------------- /test/host/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(fence.cpp) 2 | nvshmem_add_test(g.cpp) 3 | nvshmem_add_test(get.cpp) 4 | nvshmem_add_test(iget.cpp) 5 
| nvshmem_add_test(iput.cpp) 6 | nvshmem_add_test(p.cpp) 7 | nvshmem_add_test(put.cpp) 8 | nvshmem_add_test(quiet_on_stream.cu) 9 | nvshmem_add_test(quiet.cpp) 10 | nvshmem_add_test(signal_on_stream.cpp) 11 | nvshmem_add_test(wait_until_all_on_stream.cpp) 12 | nvshmem_add_test(wait_until_all_vector_on_stream.cpp) 13 | nvshmem_add_test(wait.cpp) 14 | -------------------------------------------------------------------------------- /test/common/test_teams.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_TEAMS_H 2 | #define TEST_TEAMS_H 3 | 4 | #include "nvshmem.h" 5 | #include "nvshmemx.h" 6 | #include 7 | #include 8 | #include 9 | 10 | extern std::unordered_map map_team_to_string; 11 | extern std::unordered_map map_string_to_team; 12 | 13 | bool get_next_team(nvshmem_team_t *team); 14 | void init_test_teams(); 15 | void finalize_test_teams(); 16 | 17 | #endif /* TEST_TEAMS_H */ 18 | -------------------------------------------------------------------------------- /test/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "device_") 2 | macro(nvshmem_add_test SOURCE) 3 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 6 | macro(nvshmem_add_cubin_test SOURCE) 7 | nvshmem_add_cubin_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 8 | endmacro() 9 | endif() 10 | add_subdirectory(tile) 11 | add_subdirectory(coll) 12 | add_subdirectory(init) 13 | add_subdirectory(pt-to-pt) 14 | add_subdirectory(query) 15 | add_subdirectory(sync) 16 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_PARAM_H_ 8 | #define NCCL_SOCKET_PARAM_H_ 9 | 10 | #include // for getenv 11 | 12 | static inline const char *ncclGetEnv(const char *name) { 13 | return getenv(name); 14 | } 15 | 16 | #endif -------------------------------------------------------------------------------- /src/include/non_abi/device/coll/defines.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _NVSHMEMI_DEVICE_COLL_DEFINES_H_ 8 | #define _NVSHMEMI_DEVICE_COLL_DEFINES_H_ 9 | 10 | #include "alltoall.cuh" 11 | #include "barrier.cuh" 12 | #include "broadcast.cuh" 13 | #include "fcollect.cuh" 14 | #include "reduce.cuh" 15 | #include "reducescatter.cuh" 16 | #include "broadcast.cuh" 17 | #include "fcollect.cuh" 18 | 19 | #endif /* NVSHMEMI_DEVICE_COLL_DEFINES_H */ 20 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/device/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | __all__ = ["numba"] 12 | -------------------------------------------------------------------------------- /src/host/topo/topo.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef __TOPO_H 8 | #define __TOPO_H 9 | #include "internal/host/nvshmemi_types.h" // for nvshmemi_state_t 10 | 11 | int nvshmemi_get_devices_by_distance(int *device_arr, int max_dev_per_pe, 12 | struct nvshmem_transport *tcurr); 13 | int nvshmemi_detect_same_device(nvshmemi_state_t *state); 14 | int nvshmemi_build_transport_map(nvshmemi_state_t *state); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/modules/bootstrap/shmem/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_shmem.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_shmem ${SOURCE_LIST}) 4 | 5 | find_library( 6 | SHMEM_LIB 7 | NAMES oshmem 8 | HINTS ${SHMEM_HOME} 9 | PATH_SUFFIXES lib lib64) 10 | find_path(SHMEM_INCLUDE NAME shmem.h HINTS ${SHMEM_HOME} 11 | PATH_SUFFIXES include 12 | ) 13 | add_library(shmem IMPORTED INTERFACE) 14 | target_link_libraries(shmem INTERFACE ${SHMEM_LIB}) 15 | target_include_directories(shmem INTERFACE ${SHMEM_INCLUDE}) 16 | 17 | target_link_libraries(nvshmem_bootstrap_shmem PRIVATE shmem) 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/QUESTION.yaml: -------------------------------------------------------------------------------- 1 | name: NVSHMEM question 2 | description: Ask the NVSHMEM team a question 3 | title: "[Question]: " 4 | labels: ["question"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for reaching out! To solve your problem, feel free to check out the [user guide](https://docs.nvidia.com/nvshmem/api/using.html), in particular the FAQ section, and the [release notes](https://docs.nvidia.com/nvshmem/release-notes-install-guide/release-notes/index.html). 
11 | --- 12 | - type: textarea 13 | id: question 14 | attributes: 15 | label: Question -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils import uid_init, mpi_init 4 | from nvshmem.core import finalize 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--init-type", action="store", default="uid", help="Method to initialize NVSHMEM", choices=["uid", "mpi"]) 8 | 9 | @pytest.fixture(scope="session", autouse=True) 10 | def nvshmem_init_fini(request): 11 | init_type = request.config.getoption("--init-type") 12 | if init_type == "uid": 13 | uid_init() 14 | elif init_type == "mpi": 15 | mpi_init() 16 | 17 | yield 18 | 19 | finalize() 20 | 21 | -------------------------------------------------------------------------------- /src/include/host/nvshmem_macros.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _NVSHMEM_MACROS_H_ 3 | #define _NVSHMEM_MACROS_H_ 4 | 5 | #include 6 | #include "non_abi/nvshmem_build_options.h" 7 | 8 | #ifdef __CUDA_ARCH__ 9 | #ifdef NVSHMEMI_HOST_ONLY 10 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ 11 | #else 12 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ __device__ 13 | #endif 14 | #else 15 | #define NVSHMEMI_HOSTDEVICE_PREFIX 16 | #endif 17 | 18 | #if defined NVSHMEM_HOSTLIB_ONLY 19 | #undef NVSHMEMI_HOSTDEVICE_PREFIX 20 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ __device__ __attribute__((always_inline)) 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/modules/bootstrap/common/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | add_library(nvshmem_bootstrap_common STATIC bootstrap_util.cpp) 2 | 3 | set_target_properties(nvshmem_bootstrap_common PROPERTIES 4 | 
POSITION_INDEPENDENT_CODE ON 5 | CXX_STANDARD_REQUIRED ON 6 | CUDA_STANDARD_REQUIRED ON 7 | CXX_STANDARD 11 8 | CUDA_STANDARD 11 9 | CUDA_SEPARABLE_COMPILATION ON 10 | ) 11 | 12 | target_include_directories(nvshmem_bootstrap_common INTERFACE 13 | ${CMAKE_CURRENT_SOURCE_DIR} 14 | ) 15 | 16 | target_include_directories(nvshmem_bootstrap_common PRIVATE 17 | ${CMAKE_SOURCE_DIR}/include 18 | ) 19 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_optional_cuda12.txt: -------------------------------------------------------------------------------- 1 | # These are optional dependencies 2 | # nvshmem4py is aware of and interoperable with them 3 | # but doesn't depend on them 4 | mpi4py 5 | --index-url https://download.pytorch.org/whl/cu129 6 | --extra-index-url https://pypi.org/simple 7 | torch==2.8.0 8 | --index-url https://pypi.org/simple 9 | ml-dtypes 10 | # NOTE! If you are installing PyTorch, you should install these requirements AFTER PyTorch. 11 | # Pytorch has a hard pin to these libraries. Because of CUDA-compatibility, these versions work 12 | # But older ones that pytorch pulls in will not. 13 | nvidia-cuda-nvcc-cu12 14 | nvidia-nvjitlink-cu12 15 | -------------------------------------------------------------------------------- /src/include/internal/host_transport/nvshmemi_transport_defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef __NVSHMEMI_TRANSPORT_DEFINES_H 8 | #define __NVSHMEMI_TRANSPORT_DEFINES_H 9 | 10 | #define NVSHMEM_MEM_HANDLE_SIZE 512 11 | 12 | #define NVSHMEM_PCIE_BDF_BUFFER_LEN 50 13 | 14 | typedef struct pcie_identifier { 15 | int dev_id; 16 | int bus_id; 17 | int domain_id; 18 | } pcie_id_t; 19 | 20 | typedef struct nvshmem_mem_handle { 21 | char reserved[NVSHMEM_MEM_HANDLE_SIZE]; 22 | } nvshmem_mem_handle_t; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/internal/host/sockets.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef SOCKETS_H 8 | #define SOCKETS_H 9 | 10 | #include 11 | 12 | typedef struct ipcHandle_st { 13 | int socket; 14 | char *socketName; 15 | } ipcHandle; 16 | 17 | int ipcOpenSocket(ipcHandle *&handle, pid_t, pid_t); 18 | 19 | int ipcCloseSocket(ipcHandle *handle); 20 | 21 | int ipcRecvFd(ipcHandle *handle, int *fd); 22 | 23 | int ipcSendFd(ipcHandle *handle, const int fd, pid_t process, pid_t); 24 | int ipcCloseFd(int fd); 25 | 26 | #endif /* SOCKETS_H */ 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # This is config file for git pre commit hooks 2 | # It requires the pre-commit package from https://pre-commit.com/ 3 | # pre-commit is used to run git pre-commit hooks with 4 | # a lot of pre-built hooks available on the website 5 | # 6 | # Install the pre-commit package and run pre-commit install 7 | # from the git repository 8 | 9 | # The following hook would install clang-format hook and 10 | # run it for every commit. 
It would apply clang-format fixes 11 | # and leave them as unstaged. 12 | repos: 13 | - repo: https://github.com/pre-commit/mirrors-clang-format 14 | rev: v13.0.0 15 | hooks: 16 | - id: clang-format 17 | types: [c, c++, cuda] 18 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | 12 | from .nvshmem import * 13 | 14 | # Define what gets exposed when users do `import nvshmem.bindings` 15 | __all__ = [name for name in dir() if not name.startswith("_")] 16 | -------------------------------------------------------------------------------- /src/host/coll/barrier/barrier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_BARRIER_CPU_H 8 | #define NVSHMEMI_BARRIER_CPU_H 9 | #include 10 | #include "device_host/nvshmem_types.h" 11 | 12 | int nvshmemi_call_barrier_on_stream_kernel(nvshmem_team_t team, cudaStream_t stream); 13 | int nvshmemi_call_sync_on_stream_kernel(nvshmem_team_t team, cudaStream_t stream); 14 | void nvshmemxi_barrier_all_on_stream(cudaStream_t); 15 | void nvshmemxi_barrier_on_stream(nvshmem_team_t team, cudaStream_t stream); 16 | void nvshmemxi_sync_all_on_stream(cudaStream_t); 17 | 18 | #endif /* NVSHMEMI_BARRIER_CPU_H */ 19 | -------------------------------------------------------------------------------- /src/host/transport/p2p/p2p.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _P2P_H 8 | #define _P2P_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | #include 13 | #include "internal/host_transport/nvshmemi_transport_defines.h" 14 | 15 | typedef struct { 16 | int ndev; 17 | CUdevice *cudev; 18 | int *devid; 19 | CUdeviceptr *curetval; 20 | CUdevice cudevice; 21 | int device_id; 22 | uint64_t hostHash; 23 | pcie_id_t *pcie_ids; 24 | char pcie_bdf[NVSHMEM_PCIE_BDF_BUFFER_LEN]; 25 | } transport_p2p_state_t; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/alltoall_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `alltoall_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "alltoall") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/fcollect_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `fcollect_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "fcollect") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/reduction_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `reduction_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "reduce") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/broadcast_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `broadcast_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "broadcast") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/reducescatter_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `reducescatter_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "reducescatter") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /Compatibility.md: -------------------------------------------------------------------------------- 1 | # Compatibility with NVSHMEM 2 | 3 | NVSHMEM follows semantic versioning for its releases and packages per commit i.e `MAJOR.MINOR.PATCH.TWEAK`. 4 | - Each component of the version is monotonically increasing number. So, if the author makes non-source change e.g. `test`, `perftest`, etc, it would require updating `TWEAK` component of the version. 5 | - If the author makes a change to the source file, but not the ABI or API, it is PATCH change by 1 and `TWEAK` resets. 6 | - If the author makes a change to the API/ABI definition in a backward compat way, it is MINOR change by 1 and TWEAK/PATCH reset to 0. 7 | - If the author makes a change to the ABI/API definition in the non-backward compat way, it is MAJOR change by 1 and TWEAK/PATCH/MINOR resets to 0. 
8 | -------------------------------------------------------------------------------- /test/host/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(cuobject_init.cpp) 2 | nvshmem_add_test(nvrtc_api.cpp) 3 | if(NOT CUDA_VERSION VERSION_LESS 12000) 4 | # This test depends on NVJitLink which is not available in CUDA 11.x 5 | nvshmem_add_test(nvrtc_numba_ltoir.cpp) 6 | endif() 7 | nvshmem_add_test(nvshmemx_init_status.cpp) 8 | nvshmem_add_test(static_init.cpp) 9 | nvshmem_add_test(init_loop.cpp) 10 | nvshmem_add_test(global_exit.cpp) 11 | 12 | if(NVSHMEM_MPI_SUPPORT) 13 | nvshmem_add_test(mpi_init.cpp) 14 | nvshmem_add_test(uid_init.cpp) 15 | nvshmem_add_test(nvshmemx_init_with_device.cpp) 16 | nvshmem_add_test_no_device(nvshmemx_hostlib_init_attr.cpp) 17 | endif() 18 | 19 | if(NVSHMEM_SHMEM_SUPPORT) 20 | nvshmem_add_test(shmem_init.cpp) 21 | endif() 22 | -------------------------------------------------------------------------------- /nvshmem4py/test/wheel_sanity_test.py: -------------------------------------------------------------------------------- 1 | import nvshmem 2 | 3 | import os 4 | 5 | def test_import_modules(): 6 | print("Testing import modules") 7 | # Import modules 8 | import nvshmem 9 | import nvshmem.core 10 | import nvshmem.bindings 11 | 12 | # Import core APIs 13 | from nvshmem.core import init, finalize 14 | 15 | # Import bindings 16 | print("Testing import bindings") 17 | attr = nvshmem.bindings.InitAttr() 18 | 19 | # Can't run these because it assumes stuff about nvshmem state 20 | from nvshmem.bindings import hostlib_finalize, hostlib_init_attr, uniqueid, check_status 21 | 22 | # Get version info 23 | print(nvshmem.core.get_version()) 24 | 25 | if __name__ == '__main__': 26 | test_import_modules() 27 | -------------------------------------------------------------------------------- /test/common/data_check.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _DATA_CHECK_H_ 8 | #define _DATA_CHECK_H_ 9 | 10 | #include "cuda_runtime.h" 11 | template 12 | int init_data_ring(T *buf, size_t size, int disp, int iters, int mype, int npes, int *nextpe, 13 | int *prevpe, int seed, cudaStream_t cstrm); 14 | template 15 | int init_data_alltoall(T *buf, size_t size, int disp, int iters, int mype, int npes, int seed, 16 | cudaStream_t cstrm); 17 | template 18 | int check_data_ring(T *buf, cudaStream_t); 19 | template 20 | int check_data_alltoall(T *buf, cudaStream_t); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | # 9 | # See License.txt for license information 10 | 11 | """ 12 | __init__.py for nvshmem Python package 13 | """ 14 | import os 15 | 16 | __package_name__ = os.getenv("PACKAGE_NAME", "nvshmem4py") 17 | 18 | # Version is autogenerated by setuptools_scm 19 | from .version import version, __version__, __version_tuple__ 20 | -------------------------------------------------------------------------------- /perftest/device/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(alltoall_latency.cu) 2 | nvshmem_add_perftest(barrier_latency.cu) 3 | nvshmem_add_perftest(bcast_latency.cu) 4 | nvshmem_add_perftest(fcollect_latency.cu) 5 | nvshmem_add_perftest(reducescatter_latency.cu) 6 | nvshmem_add_perftest(reduction_latency.cu) 7 | nvshmem_add_perftest(sync_latency.cu) 8 | 9 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 10 | nvshmem_add_cubin_perftest(alltoall_latency.cu) 11 | nvshmem_add_cubin_perftest(barrier_latency.cu) 12 | nvshmem_add_cubin_perftest(bcast_latency.cu) 13 | nvshmem_add_cubin_perftest(fcollect_latency.cu) 14 | nvshmem_add_cubin_perftest(reducescatter_latency.cu) 15 | nvshmem_add_cubin_perftest(reduction_latency.cu) 16 | nvshmem_add_cubin_perftest(sync_latency.cu) 17 | endif() 18 | -------------------------------------------------------------------------------- /test/common/ring_alltoall.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _RING_ALL_TO_ALL_H_ 8 | #define _RING_ALL_TO_ALL_H_ 9 | 10 | #include 11 | 12 | #define MAX_MSG_SIZE 65536 13 | #define ITER 100 14 | 15 | typedef void (*launch_alltoall_ptr_t)(void *, void *, size_t, int, int, cudaStream_t); 16 | typedef void (*launch_ring_ptr_t)(void *, void *, size_t, int, int, cudaStream_t); 17 | 18 | int setup(bool is_scalar, int disp, size_t max_size = MAX_MSG_SIZE, uint64_t max_iter = ITER, 19 | bool local_dest = false, int *argc = NULL, char ***argv = NULL); 20 | void cleanup(); 21 | template 22 | int test(launch_alltoall_ptr_t, launch_ring_ptr_t); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/internal/host/nvshmemi_coll.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "host/nvshmem_macros.h" 14 | #include "device_host/nvshmem_types.h" 15 | 16 | #ifndef NVSHMEMI_COLL_H 17 | #define NVSHMEMI_COLL_H 18 | 19 | NVSHMEMI_HOSTDEVICE_PREFIX void nvshmemi_barrier(nvshmem_team_t team); 20 | 21 | #endif /* NVSHMEMI_COLL_H */ 22 | -------------------------------------------------------------------------------- /test/device/query/hello.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | #define N 4 14 | 15 | __global__ void hello_world(void) { 16 | printf("Hello World from device PE %d <%d> of %d\n", nvshmem_my_pe(), threadIdx.x, 17 | nvshmem_n_pes()); 18 | } 19 | 20 | int main(int argc, char **argv) { 21 | init_wrapper(&argc, &argv); 22 | 23 | nvshmem_barrier_all(); /* Ensure NVSHMEM device init has completed */ 24 | 25 | printf("Hello World from host PE %d of %d\n", nvshmem_my_pe(), nvshmem_n_pes()); 26 | 27 | hello_world<<<1, N>>>(); 28 | 29 | finalize_wrapper(); 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /test/host/pt-to-pt/quiet_on_stream.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | 11 | #include "utils.h" 12 | 13 | #define NUM_ITERS 100 14 | 15 | int main(int argc, char **argv) { 16 | int status = 0; 17 | int num_iters = NUM_ITERS; 18 | cudaStream_t cstrm; 19 | 20 | init_wrapper(&argc, &argv); 21 | 22 | CUDA_CHECK(cudaStreamCreate(&cstrm)); 23 | 24 | for (int i = 0; i < num_iters; i++) { 25 | nvshmemx_quiet_on_stream(cstrm); 26 | CUDA_CHECK(cudaStreamSynchronize(cstrm)); 27 | } 28 | 29 | CUDA_CHECK(cudaStreamDestroy(cstrm)); 30 | 31 | nvshmem_barrier_all(); 32 | finalize_wrapper(); 33 | 34 | return status; 35 | } 36 | -------------------------------------------------------------------------------- /src/include/internal/common/error_codes_internal.h: -------------------------------------------------------------------------------- 1 | /**** 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | ****/ 6 | 7 | #ifndef NVSHMEM_ERROR_CODES_INTERNAL_H_ 8 | #define NVSHMEM_ERROR_CODES_INTERNAL_H_ 9 | 10 | typedef enum { 11 | NVSHMEMI_SUCCESS = 0, 12 | NVSHMEMI_UNHANDLED_CUDA_ERROR = 1, 13 | NVSHMEMI_SYSTEM_ERROR = 2, 14 | NVSHMEMI_INTERNAL_ERROR = 3, 15 | NVSHMEMI_INVALID_ARGUMENT = 4, 16 | NVSHMEMI_INVALID_USAGE = 5, 17 | NVSHMEMI_GET_CUCTX_FAILED = 6, 18 | NVSHMEMI_NOT_BOOTSTRAPPED = 7, 19 | NVSHMEMI_NOT_INITIALIZED = 8, 20 | NVSHMEMI_CUDA_GET_DEVICE_FAILED = 9, 21 | NVSHMEMI_INIT_DEVICE_ONLY_STATE_FAILED = 10, 22 | NVSHMEMI_ERROR_SKIPPED = 11, 23 | NVSHMEMI_NUM_RESULTS = 12 24 | } nvshmemResult_t; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/internal/host/error_codes_internal.h: -------------------------------------------------------------------------------- 1 | /**** 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ****/ 6 | 7 | #ifndef NVSHMEM_ERROR_CODES_INTERNAL_H_ 8 | #define NVSHMEM_ERROR_CODES_INTERNAL_H_ 9 | 10 | typedef enum { 11 | NVSHMEMI_SUCCESS = 0, 12 | NVSHMEMI_UNHANDLED_CUDA_ERROR = 1, 13 | NVSHMEMI_SYSTEM_ERROR = 2, 14 | NVSHMEMI_INTERNAL_ERROR = 3, 15 | NVSHMEMI_INVALID_ARGUMENT = 4, 16 | NVSHMEMI_INVALID_USAGE = 5, 17 | NVSHMEMI_GET_CUCTX_FAILED = 6, 18 | NVSHMEMI_NOT_BOOTSTRAPPED = 7, 19 | NVSHMEMI_NOT_INITIALIZED = 8, 20 | NVSHMEMI_CUDA_GET_DEVICE_FAILED = 9, 21 | NVSHMEMI_INIT_DEVICE_ONLY_STATE_FAILED = 10, 22 | NVSHMEMI_ERROR_SKIPPED = 11, 23 | NVSHMEMI_NUM_RESULTS = 12 24 | } nvshmemResult_t; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/version.py: -------------------------------------------------------------------------------- 1 | # file generated by setuptools-scm 2 | # don't change, don't track in version control 3 | 4 | __all__ = [ 5 | "__version__", 6 
| "__version_tuple__", 7 | "version", 8 | "version_tuple", 9 | "__commit_id__", 10 | "commit_id", 11 | ] 12 | 13 | TYPE_CHECKING = False 14 | if TYPE_CHECKING: 15 | from typing import Tuple 16 | from typing import Union 17 | 18 | VERSION_TUPLE = Tuple[Union[int, str], ...] 19 | COMMIT_ID = Union[str, None] 20 | else: 21 | VERSION_TUPLE = object 22 | COMMIT_ID = object 23 | 24 | version: str 25 | __version__: str 26 | __version_tuple__: VERSION_TUPLE 27 | version_tuple: VERSION_TUPLE 28 | commit_id: COMMIT_ID 29 | __commit_id__: COMMIT_ID 30 | 31 | __version__ = version = '0.1.2' 32 | __version_tuple__ = version_tuple = (0, 1, 2) 33 | 34 | __commit_id__ = commit_id = None 35 | -------------------------------------------------------------------------------- /examples/shmem-based-init.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include 14 | #include "shmem.h" 15 | #include "nvshmem.h" 16 | 17 | int main(int c, char *v[]) { 18 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 19 | 20 | shmem_init(); 21 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_SHMEM, &attr); 22 | 23 | nvshmem_finalize(); 24 | shmem_finalize(); 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_npe.py: -------------------------------------------------------------------------------- 1 | from numba import cuda 2 | import cupy as cp 3 | import argparse 4 | 5 | from utils import uid_init, mpi_init 6 | 7 | from nvshmem.bindings.device.numba import n_pes, sync_all 8 | 9 | def test_npe(): 10 | 11 | @cuda.jit() 12 | def kernel_nvshmem(destination): 13 | npes = n_pes() 14 | sync_all() 15 | destination[0] = npes 16 | 17 | npes = cp.zeros(1, dtype="int32") 18 | kernel_nvshmem[1, 1](npes) 19 | 20 | assert npes[0] > 0 21 | print(f"{npes[0]=}") 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 27 | args = parser.parse_args() 28 | if args.init_type == "uid": 29 | uid_init() 30 | elif args.init_type == "mpi": 31 | mpi_init() 32 | 33 | test_npe() -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_nccl.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_NCCL_H_ 8 | #define NCCL_SOCKET_NCCL_H_ 9 | 10 | /* Error type */ 11 | typedef enum { ncclSuccess = 0, 12 | ncclUnhandledCudaError = 1, 13 | ncclSystemError = 2, 14 | ncclInternalError = 3, 15 | ncclInvalidArgument = 4, 16 | ncclInvalidUsage = 5, 17 | ncclRemoteError = 6, 18 | ncclInProgress = 7, 19 | ncclNumResults = 8 } ncclResult_t; 20 | 21 | 22 | #endif -------------------------------------------------------------------------------- /examples/hello.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | 11 | int main(int argc, char **argv) { 12 | char hostname[256]; 13 | 14 | int ret = gethostname(hostname, 256); 15 | if (ret < 0) { 16 | printf("Failed to get hostname\n"); 17 | return 1; 18 | } 19 | 20 | printf("[%s][%ld] Starting up...\n", hostname, (long)getpid()); 21 | 22 | nvshmem_init(); 23 | 24 | int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 25 | cudaSetDevice(mype_node); 26 | void *ptr = nvshmem_malloc(1); // initialize NVSHMEM after device is set 27 | 28 | printf("[%s][%ld] Hello from PE %d of %d\n", hostname, (long)getpid(), nvshmem_my_pe(), 29 | nvshmem_n_pes()); 30 | 31 | nvshmem_finalize(); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /src/host/stream/comm/quiet_on_stream.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "internal/host/util.h" 8 | #include "internal/non_abi/nvshmemi_h_to_d_sync_defs.cuh" 9 | 10 | static int nvshmemi_quiet_maxblocksize = -1; 11 | 12 | void nvshmemi_call_proxy_quiet_entrypoint(cudaStream_t cstrm) { 13 | if (nvshmemi_quiet_maxblocksize == -1) { 14 | int tmp; 15 | CUDA_RUNTIME_CHECK(cudaOccupancyMaxPotentialBlockSize( 16 | &tmp, (int *)&nvshmemi_quiet_maxblocksize, nvshmemi_proxy_quiet_entrypoint)); 17 | } 18 | int status = cudaLaunchKernel((const void *)nvshmemi_proxy_quiet_entrypoint, 1, 19 | nvshmemi_quiet_maxblocksize, NULL, 0, cstrm); 20 | if (status) { 21 | NVSHMEMI_ERROR_PRINT("cudaLaunchKernel() failed in nvshmem_quiet_on_stream \n"); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmi/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_pmi.cpp) 2 | 3 | if(NVSHMEM_BUILD_PMI_BOOTSTRAP) 4 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmi ${SOURCE_LIST}) 5 | 6 | find_library(PMI_lib NAMES pmi HINTS "${PMI_HOME}/lib") 7 | 8 | target_link_libraries(nvshmem_bootstrap_pmi PRIVATE ${PMI_lib}) 9 | target_compile_definitions(nvshmem_bootstrap_pmi PRIVATE NVSHMEM_CUSTOM_BOOTSTRAP_BUILD) 10 | target_include_directories(nvshmem_bootstrap_pmi PRIVATE ${PMI_HOME}/include) 11 | endif() 12 | 13 | if(NVSHMEM_BUILD_PMI2_BOOTSTRAP) 14 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmi2 ${SOURCE_LIST}) 15 | 16 | find_library(PMI2_lib NAMES pmi2 HINTS "${PM2I_HOME}/lib") 17 | 18 | target_compile_definitions(nvshmem_bootstrap_pmi2 PRIVATE NVSHMEM_BUILD_PMI2) 19 | target_link_libraries(nvshmem_bootstrap_pmi2 PRIVATE ${PMI2_lib}) 20 | target_include_directories(nvshmem_bootstrap_pmi2 PRIVATE ${PMI2_HOME}/include) 21 | endif() 22 | -------------------------------------------------------------------------------- /perftest/perftest-mmap-full.list: 
-------------------------------------------------------------------------------- 1 | /host/coll/broadcast_on_stream 2 | /host/coll/reducescatter_on_stream 3 | /host/coll/fcollect_on_stream 4 | /host/coll/reduction_on_stream 5 | /host/coll/alltoall_on_stream 6 | /host/pt-to-pt/latency 7 | /host/pt-to-pt/bw 8 | /device/coll/bcast_latency 9 | /device/coll/fcollect_latency 10 | /device/coll/alltoall_latency 11 | /device/pt-to-pt/shmem_p_latency 12 | /device/pt-to-pt/shmem_put_bw 13 | /device/pt-to-pt/shmem_st_bw 14 | /device/pt-to-pt/shmem_p_ping_pong_latency 15 | /device/pt-to-pt/shmem_atomic_bw 16 | /device/pt-to-pt/shmem_g_bw 17 | /device/pt-to-pt/shmem_p_bw 18 | /device/pt-to-pt/shmem_get_bw 19 | /device/pt-to-pt/shmem_put_atomic_ping_pong_latency 20 | /device/pt-to-pt/shmem_get_latency 21 | /device/pt-to-pt/shmem_put_ping_pong_latency 22 | /device/pt-to-pt/shmem_g_latency 23 | /device/pt-to-pt/shmem_put_latency 24 | /device/pt-to-pt/shmem_signal_ping_pong_latency 25 | /device/pt-to-pt/shmem_put_signal_ping_pong_latency 26 | -------------------------------------------------------------------------------- /src/include/non_abi/nvshmem_version.h.in: -------------------------------------------------------------------------------- 1 | /* Note - For packaging reasons, do not move this file from src/include/non_abi */ 2 | 3 | #pragma once 4 | 5 | // clang-format off 6 | 7 | #define NVSHMEM_VENDOR_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ 8 | #define NVSHMEM_VENDOR_MINOR_VERSION @PROJECT_VERSION_MINOR@ 9 | #define NVSHMEM_VENDOR_PATCH_VERSION @PROJECT_VERSION_PATCH@ 10 | #define NVSHMEM_VENDOR_PACKAGE_VERSION @PROJECT_VERSION_TWEAK@ 11 | #define NVSHMEM_TRANSPORT_PLUGIN_MAJOR_VERSION @TRANSPORT_VERSION_MAJOR@ 12 | #define NVSHMEM_TRANSPORT_PLUGIN_MINOR_VERSION @TRANSPORT_VERSION_MINOR@ 13 | #define NVSHMEM_TRANSPORT_PLUGIN_PATCH_VERSION @TRANSPORT_VERSION_PATCH@ 14 | #define NVSHMEM_BOOTSTRAP_PLUGIN_MAJOR_VERSION @BOOTSTRAP_VERSION_MAJOR@ 15 | #define 
NVSHMEM_BOOTSTRAP_PLUGIN_MINOR_VERSION @BOOTSTRAP_VERSION_MINOR@ 16 | #define NVSHMEM_BOOTSTRAP_PLUGIN_PATCH_VERSION @BOOTSTRAP_VERSION_PATCH@ 17 | #define NVSHMEM_BUILD_VARS @INFO_BUILD_VARS@ 18 | 19 | // clang-format on 20 | -------------------------------------------------------------------------------- /examples/gemm_allreduce/nvshmemAlloc.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | class nvshmemAllocation { 11 | public: 12 | nvshmemAllocation() = default; 13 | ~nvshmemAllocation() { dealloc(); } 14 | 15 | void reset(size_t capacity) { 16 | dealloc(); 17 | alloc(capacity * sizeof(T)); 18 | _capacity = capacity; 19 | } 20 | 21 | T* get() { return _data; } 22 | size_t size() { return _capacity; } 23 | void free() { dealloc(); } 24 | 25 | private: 26 | void dealloc() { 27 | if (_capacity) { 28 | nvshmem_free((void*)_data); 29 | } 30 | _capacity = 0; 31 | } 32 | 33 | void alloc(size_t size) { 34 | _data = (T*)nvshmem_malloc(size); 35 | assert(_data); 36 | } 37 | 38 | T* _data = NULL; 39 | size_t _capacity = 0; 40 | }; 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/RFE.yaml: -------------------------------------------------------------------------------- 1 | name: NVSHMEM request for enhancement 2 | description: Request for enhancement 3 | title: "[RFE]: " 4 | labels: ["enhancement"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | 10 | Thanks for your feedback! Before reporting a new RFE you could quickly check if this already exists in our [existing requests](https://github.com/NVIDIA/nvshmem/issues?q=is%3Aissue%20state%3Aopen%20label%3Aenhancement). 
11 | 12 | --- 13 | - type: textarea 14 | id: rfe-description 15 | attributes: 16 | label: Please provide the below details to ensure we understand your needs 17 | description: | 18 | * What is the goal of this request? 19 | * Who will benefit from this feature? 20 | * Is this request for a specific GPU architecture or network infrastructure? 21 | * How will this feature improve current workflows or processes? 22 | * What is the priority level of this request? -------------------------------------------------------------------------------- /test/host/init/static_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype_node, npes_node; 14 | int dev_count; 15 | 16 | nvshmem_init(); 17 | 18 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 19 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 20 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 21 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 22 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 23 | 24 | #ifdef _NVSHMEM_DEBUG 25 | int mype = nvshmem_my_pe(); 26 | int npes = nvshmem_n_pes(); 27 | #endif 28 | DEBUG_PRINT("[%d of %d] hello shmem world! \n", mype, npes); 29 | 30 | nvshmem_barrier_all(); 31 | 32 | nvshmem_finalize(); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /test/host/init/global_exit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype, status = 0; 14 | 15 | init_wrapper(&c, &v); 16 | 17 | mype = nvshmem_my_pe(); 18 | #ifdef _NVSHMEM_DEBUG 19 | int npes = nvshmem_n_pes(); 20 | #endif 21 | 22 | DEBUG_PRINT("[%d of %d] hello world! \n", mype, npes); 23 | 24 | nvshmem_barrier_all(); 25 | 26 | if (mype == 0) { 27 | nvshmem_global_exit(0); 28 | /* Note, this should be unreachable. return a unique error code if we reach here. */ 29 | status = 2; 30 | } else { 31 | sleep(60); 32 | fprintf(stderr, "Was able to get to the end of the test.\n"); 33 | finalize_wrapper(); 34 | return 1; 35 | } 36 | 37 | return status; 38 | } 39 | -------------------------------------------------------------------------------- /perftest/perftest-p2p-pcie.list: -------------------------------------------------------------------------------- 1 | /device/pt-to-pt/shmem_p_latency 2 | /device/pt-to-pt/shmem_p_bw 3 | /device/pt-to-pt/shmem_g_latency 4 | /device/pt-to-pt/shmem_g_bw 5 | /device/pt-to-pt/shmem_st_bw 6 | /device/pt-to-pt/shmem_p_ping_pong_latency 7 | /device/pt-to-pt/shmem_put_latency 8 | /device/pt-to-pt/shmem_put_ping_pong_latency 9 | /device/pt-to-pt/shmem_signal_ping_pong_latency 10 | /device/pt-to-pt/shmem_put_bw 11 | /device/pt-to-pt/shmem_get_bw 12 | /device/coll/barrier_latency 13 | /device/coll/bcast_latency 14 | /device/coll/fcollect_latency 15 | /device/coll/alltoall_latency 16 | /device/coll/reduction_latency 17 | /device/coll/sync_latency 18 | /host/pt-to-pt/bw 19 | /host/pt-to-pt/latency 20 | /host/pt-to-pt/stream_latency 21 | /host/coll/barrier_all_on_stream 22 | /host/coll/barrier_on_stream 23 | /host/coll/sync_all_on_stream 24 | /host/coll/sync_on_stream 25 | /host/coll/alltoall_on_stream 26 | /host/coll/broadcast_on_stream 27 | /host/coll/fcollect_on_stream 28 | /host/coll/reduction_on_stream 29 | 
-------------------------------------------------------------------------------- /src/host/util/cs.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | #include "non_abi/nvshmemx_error.h" 11 | 12 | static pthread_mutex_t global_mutex; 13 | 14 | void nvshmemu_thread_cs_init() { 15 | int status = pthread_mutex_init(&global_mutex, NULL); 16 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex init failed \n"); 17 | } 18 | 19 | void nvshmemu_thread_cs_finalize() { 20 | int status = pthread_mutex_destroy(&global_mutex); 21 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex destroy failed \n"); 22 | } 23 | 24 | void nvshmemu_thread_cs_enter() { 25 | int status = pthread_mutex_lock(&global_mutex); 26 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex lock failed \n"); 27 | } 28 | 29 | void nvshmemu_thread_cs_exit() { 30 | int status = pthread_mutex_unlock(&global_mutex); 31 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex unlock failed \n"); 32 | } 33 | -------------------------------------------------------------------------------- /perftest/perftest-ib.list: -------------------------------------------------------------------------------- 1 | /device/pt-to-pt/shmem_atomic_ping_pong_latency 2 | /device/pt-to-pt/shmem_put_latency 3 | /device/pt-to-pt/shmem_put_ping_pong_latency 4 | /device/pt-to-pt/shmem_put_atomic_ping_pong_latency 5 | /device/pt-to-pt/shmem_put_signal_ping_pong_latency 6 | /device/pt-to-pt/shmem_signal_ping_pong_latency 7 | /device/pt-to-pt/shmem_put_bw 8 | /device/pt-to-pt/shmem_get_bw 9 | /device/pt-to-pt/shmem_g_bw 10 | /device/pt-to-pt/shmem_g_latency 11 | /device/coll/barrier_latency 12 | /device/coll/bcast_latency 13 | /device/coll/fcollect_latency 14 | /device/coll/alltoall_latency 15 | /device/coll/reduction_latency 16 | /device/coll/sync_latency 17 | /host/pt-to-pt/bw 18 | 
/host/pt-to-pt/latency 19 | /host/pt-to-pt/stream_latency 20 | /host/coll/barrier_all_on_stream 21 | /host/coll/barrier_on_stream 22 | /host/coll/sync_all_on_stream 23 | /host/coll/sync_on_stream 24 | /host/coll/alltoall_on_stream 25 | /host/coll/broadcast_on_stream 26 | /host/coll/fcollect_on_stream 27 | /host/coll/reduction_on_stream 28 | /host/init/malloc 29 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
#!/bin/bash
# Discover every distinct CPython >= 3.9 interpreter reachable via $PATH.
# For each unique minor version, print exactly one line:  <major.minor>|<path>
# (first match on $PATH wins for a given version).
declare -A seen
IFS=':' read -ra paths <<< "$PATH"
for dir in "${paths[@]}"; do
    for p in "$dir"/python3.[0-9]*; do
        [[ -e "$p" ]] || continue # Skip if no match or broken glob
        # Ignore names that are not plain "python3.X[.Y...]" (e.g. python3.9-config)
        if [[ $(basename "$p") =~ ^python3\.[0-9]+(\.[0-9]+)*$ ]]; then
            :
        else
            continue
        fi
        if [ -x "$p" ]; then
            v=$(
                "$p" -c 'import sys; print("%d.%d" % (sys.version_info[0], sys.version_info[1]))' 2>/dev/null
            )
            # Dot escaped so "." is literal; the anchor pins the major version to 3,
            # so only the minor version still needs a range check below.
            if [[ $v =~ ^3\.[0-9]+$ ]]; then
                minor=$(echo "$v" | cut -d. -f2)
                if (( minor >= 9 )); then
                    if [[ -z ${seen[$v]} ]]; then
                        seen[$v]=1
                        echo "$v|$p"
                    fi
                fi
            fi
        fi
    done
done
/* Run the ring put/verify kernel once and report whether the received value
 * was wrong. Returns 0 on success, 1 on data mismatch. */
int simplelib1_dowork() {
    int *array = (int *)nvshmem_calloc(1, sizeof(int));
    int num_errors = 0;
    /* Reset the device-side error flag before launching: the __device__
     * global is zero-initialized only once at module load, so without this
     * a failure recorded by an earlier call would leak into this run. */
    cudaMemcpyToSymbol(num_errors_d, &num_errors, sizeof(int));
    simplelib1_nvshmem_kernel<<<1, 1>>>(array);
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int));
    nvshmem_free(array);
    return num_errors;
}
/* Run the ring put/verify kernel once and report whether the received value
 * was wrong. Returns 0 on success, 1 on data mismatch. */
int simplelib2_dowork() {
    int *array = (int *)nvshmem_calloc(1, sizeof(int));
    int num_errors = 0;
    /* Reset the device-side error flag before launching: the __device__
     * global is zero-initialized only once at module load, so without this
     * a failure recorded by an earlier call would leak into this run. */
    cudaMemcpyToSymbol(num_errors_d, &num_errors, sizeof(int));
    simplelib2_nvshmem_kernel<<<1, 1>>>(array);
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int));
    nvshmem_free(array);
    return num_errors;
}
/*
 * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * See License.txt for license information
 */

/* Guard renamed from _IBRC_H (copy-pasted from the ibrc transport header):
 * sharing a guard with ibrc.h would silently skip whichever header is
 * included second in a translation unit that uses both transports. */
#ifndef _IBDEVX_H
#define _IBDEVX_H

#define NVSHMEMT_IBDEVX_DBSIZE 8
/* 64 bytes per WQE BB shift = log2(64) for easy multiplication. */
#define NVSHMEMT_IBDEVX_WQE_BB_SHIFT 6

/* Atomic mode for our transport */
#define NVSHMEMT_IBDEVX_MLX5_QPC_ATOMIC_MODE_UP_TO_64B 0x3

#define NVSHMEMT_IBDEVX_MLX5_SEND_WQE_DS 0x10

/* Indicates to DEVX that we should be using an SRQ. */
#define NVSHMEMT_IBDEVX_SRQ_TYPE_VALUE 0x1

/* Enables remote read/write/atomic access for a QP */
#define NVSHMEMT_IBDEVX_INIT2R2R_PARAM_MASK 0xE

/* Important byte masks. Names describe the byte(s) an AND clears:
 * MASK_UPPER_BYTE_32 clears the top byte, MASK_LOWER_3_BYTES_32 clears
 * the bottom three bytes. */
#define NVSHMEMT_IBDEVX_MASK_UPPER_BYTE_32 0x00FFFFFF
#define NVSHMEMT_IBDEVX_MASK_LOWER_3_BYTES_32 0xFF000000

/* OPMOD Constants for AMOs. */
#define NVSHMEMT_IBDEVX_4_BYTE_EXT_AMO_OPMOD 0x08000000
#define NVSHMEMT_IBDEVX_8_BYTE_EXT_AMO_OPMOD 0x09000000

#endif
/device/pt-to-pt/shmem_g_bw 7 | /device/pt-to-pt/shmem_st_bw 8 | /device/pt-to-pt/shmem_p_ping_pong_latency 9 | /device/pt-to-pt/shmem_put_latency 10 | /device/pt-to-pt/shmem_put_ping_pong_latency 11 | /device/pt-to-pt/shmem_signal_ping_pong_latency 12 | /device/pt-to-pt/shmem_put_bw 13 | /device/pt-to-pt/shmem_get_bw 14 | /device/coll/barrier_latency 15 | /device/coll/bcast_latency 16 | /device/coll/fcollect_latency 17 | /device/coll/alltoall_latency 18 | /device/coll/reduction_latency 19 | /device/coll/sync_latency 20 | /host/pt-to-pt/bw 21 | /host/pt-to-pt/latency 22 | /host/pt-to-pt/stream_latency 23 | /host/coll/barrier_all_on_stream 24 | /host/coll/barrier_on_stream 25 | /host/coll/sync_all_on_stream 26 | /host/coll/sync_on_stream 27 | /host/coll/alltoall_on_stream 28 | /host/coll/broadcast_on_stream 29 | /host/coll/fcollect_on_stream 30 | /host/coll/reduction_on_stream 31 | -------------------------------------------------------------------------------- /src/include/device/nvshmemx_collective_launch_apis.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #ifndef _NVSHMEMX_COLLECTIVE_LAUNCH_APIS_H_ 14 | #define _NVSHMEMX_COLLECTIVE_LAUNCH_APIS_H_ 15 | 16 | #include 17 | 18 | #if !defined __CUDACC_RTC__ 19 | int nvshmemx_collective_launch(const void *func, dim3 gridDims, dim3 blockDims, void **args, 20 | size_t sharedMem, cudaStream_t stream); 21 | int nvshmemx_collective_launch_query_gridsize(const void *func, dim3 blockDims, void **args, 22 | size_t sharedMem, int *gridsize); 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_DEBUG_H_ 8 | #define NCCL_SOCKET_DEBUG_H_ 9 | 10 | #include "bootstrap_util.h" // for BOOTSTRAP_DEBUG_PRINT, BOOTSTRAP_ERROR_P... 11 | 12 | extern thread_local int ncclDebugNoWarn; 13 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 14 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; 15 | 16 | #define WARN(...) BOOTSTRAP_ERROR_PRINT(__VA_ARGS__) 17 | #define INFO(FLAGS, ...) BOOTSTRAP_DEBUG_PRINT(__VA_ARGS__) 18 | #define TRACE_CALL(...) BOOTSTRAP_DEBUG_PRINT(__VA_ARGS__) 19 | #define TRACE(...) 
Please see the following public links for information on building and working with NVSHMEM:
/*
 * Host-side stress test for nvshmem_quiet(): after a barrier, issue
 * nvshmem_quiet() num_iters times (default 100, overridable with -n).
 * Exits 0 unless initialization fails; correctness here is "does not hang".
 */
int main(int argc, char **argv) {
    int status = 0;
    int num_iters = NUM_ITERS;

    init_wrapper(&argc, &argv);

    /* Option parsing: -n <iterations>; -h (or any unknown flag) prints usage
     * and skips the quiet loop via the goto below. */
    while (1) {
        int c;
        c = getopt(argc, argv, "n:h");
        if (c == -1) break;

        switch (c) {
            case 'n':
                num_iters = strtol(optarg, NULL, 0);
                break;
            default:
            case 'h':
                printf("-n [No of iterations] \n");
                goto out;
        }
    }
    nvshmem_barrier_all();
    for (int i = 0; i < num_iters; i++) {
        nvshmem_quiet();
    }

out:
    /* Final barrier keeps all PEs together before collective finalize. */
    nvshmem_barrier_all();
    finalize_wrapper();

    return status;
}
All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "nvshmem.h" 11 | #include "nvshmemx.h" 12 | #include "utils.h" 13 | 14 | #define NUM_ITERS 100 15 | 16 | int main(int argc, char *argv[]) { 17 | int status = 0; 18 | int num_iters = NUM_ITERS; 19 | 20 | init_wrapper(&argc, &argv); 21 | 22 | while (1) { 23 | int c; 24 | c = getopt(argc, argv, "n:h"); 25 | if (c == -1) break; 26 | 27 | switch (c) { 28 | case 'n': 29 | num_iters = strtol(optarg, NULL, 0); 30 | break; 31 | default: 32 | case 'h': 33 | printf("-n [No of iterations] \n"); 34 | goto out; 35 | } 36 | } 37 | 38 | nvshmem_barrier_all(); 39 | for (int i = 0; i < num_iters; i++) { 40 | nvshmem_fence(); 41 | } 42 | 43 | out: 44 | nvshmem_barrier_all(); 45 | finalize_wrapper(); 46 | 47 | return status; 48 | } 49 | -------------------------------------------------------------------------------- /src/include/nvshmem.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #ifndef _NVSHMEM_H_ 14 | #define _NVSHMEM_H_ 15 | 16 | #include "non_abi/nvshmem_build_options.h" 17 | /* NVRTC only compiles device code. 
Leave out host headers */ 18 | #if !defined __CUDACC_RTC__ && !defined __clang_llvm_bitcode_lib__ && \ 19 | !defined __NVSHMEM_NUMBA_SUPPORT__ 20 | #include "nvshmem_host.h" 21 | #endif 22 | /* NVSHMEM4PY hostlib can't parse device headers */ 23 | #if !defined NVSHMEM_HOSTLIB_ONLY 24 | #include "device/nvshmem_defines.h" 25 | #include "device/nvshmem_coll_defines.cuh" 26 | #include "device/nvshmemx_defines.h" 27 | #include "device/nvshmemx_coll_defines.cuh" 28 | #endif 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/nvshmemx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "non_abi/nvshmem_build_options.h" 14 | 15 | #ifndef _NVSHMEMX_H_ 16 | #define _NVSHMEMX_H_ 17 | 18 | /* NVRTC only compiles device code. 
Leave out host headers */ 19 | #if !defined __CUDACC_RTC__ && !defined __clang_llvm_bitcode_lib__ && \ 20 | !defined __NVSHMEM_NUMBA_SUPPORT__ 21 | #include "host/nvshmemx_api.h" 22 | #include "device/tile/nvshmemx_tile_api.hpp" 23 | #include "device/nvshmemx_collective_launch_apis.h" 24 | #endif 25 | #if !defined NVSHMEM_HOSTLIB_ONLY 26 | #include "device/nvshmemx_defines.h" 27 | #include "device/nvshmemx_coll_defines.cuh" 28 | #include "device/tile/nvshmemx_tile_api_defines.cuh" 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NVSHMEM Overview 2 | **************** 3 | 4 | NVSHMEM™ is a parallel programming interface based on OpenSHMEM that provides efficient and 5 | scalable communication for NVIDIA GPU clusters. NVSHMEM creates a global address space for 6 | data that spans the memory of multiple GPUs and can be accessed with fine-grained 7 | GPU-initiated operations, CPU-initiated operations, and operations on CUDA® streams. 
/*
 * Device-side test of nvshmem_global_exit(): PE 0 launches a kernel that
 * calls nvshmem_global_exit(0), which must terminate ALL PEs. Any PE that
 * survives long enough to print and return indicates a failure.
 */
int main(int c, char *v[]) {
    int mype, status = 0;

    init_wrapper(&c, &v);

    /* Bug fix: mype was previously declared inside the _NVSHMEM_DEBUG block
     * but is read unconditionally below (if (mype == 0)), so the non-debug
     * build did not compile. Only npes is debug-only. */
    mype = nvshmem_my_pe();
#ifdef _NVSHMEM_DEBUG
    int npes = nvshmem_n_pes();
#endif

    DEBUG_PRINT("[%d of %d] hello world! \n", mype, npes);

    nvshmem_barrier_all();

    if (mype == 0) {
        test_kernel<<<1, 1, 0>>>();
        CUDA_CHECK(cudaDeviceSynchronize());
        /* Note, this should be unreachable. return a unique error code if we reach here. */
        status = 2;
    } else {
        sleep(60); /* This is added to allow the PE0's global_exit to abort the program before PE1+
                      finalize themselves */
        fprintf(stderr, "Was able to get to the end of the test.\n");
        finalize_wrapper();
        return 1;
    }

    return status;
}
def test_get_version():
    """Launch a 1x1 kernel that queries the NVSHMEM vendor version and name
    through the numba device bindings, then print both on the host.

    The kernel writes three int32 version components (major/minor/patch order
    assumed from the print below -- TODO confirm against the binding docs)
    into ``arr`` and a NUL-padded vendor name into ``name``.
    """
    ffi = cffi.FFI()

    @cuda.jit(lto=True)
    def kernel(arr, name):
        # Three pointers into consecutive elements of arr: one per version
        # component expected by vendor_get_version_info.
        ptr = ffi.from_buffer(arr)
        ptr2 = ffi.from_buffer(arr[1:])
        ptr3 = ffi.from_buffer(arr[2:])
        vendor_get_version_info(ptr, ptr2, ptr3)

        nameptr = ffi.from_buffer(name)
        info_get_name(nameptr)

    arr = np.zeros(3, dtype=np.int32)
    # assumes 100 bytes is enough for the vendor name -- TODO confirm
    name = np.zeros(100, dtype=np.int8)

    kernel[1, 1](arr, name)
    print(f"ver: {arr[0]}.{arr[1]}.{arr[2]}")
    # NOTE(review): unwritten trailing bytes are zeros, so this prints NUL
    # characters after the name rather than stopping at the terminator.
    print("".join(chr(i) for i in name))
def test_ring(dev: Device):
    """One-hop ring exchange: each PE puts its rank into the next PE's
    symmetric buffer, then every PE prints the rank it received.

    Args:
        dev: the active cuda.core Device, used only to synchronize after the
            barrier before reading the result.
    """
    ffi = cffi.FFI()

    @cuda.jit(lto=True)
    def app_kernel(dest):
        ptr = ffi.from_buffer(dest)
        mype = my_pe()
        npes = n_pes()
        # Neighbor in the ring; wraps the last PE back to PE 0.
        peer = int32((mype + 1) % npes)

        int_p(ptr, mype, peer)

    # Symmetric one-element int32 buffer allocated on every PE.
    dest = nvshmem.core.array((1,), dtype="int32")

    app_kernel[1, 1, 0](dest)

    # Barrier so every PE's put has landed before anyone reads dest.
    barrier_all()
    dev.sync()

    print(f"{h_my_pe()}: received message {dest[0]}")

    nvshmem.core.free_array(dest)
    nvshmem.core.finalize()
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _TRANSPORT_MLX5_COMMON_H 8 | #define _TRANSPORT_MLX5_COMMON_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | 13 | bool nvshmemt_ib_common_query_mlx5_caps(struct ibv_context *context); 14 | int nvshmemt_ib_common_query_endianness_conversion_size(uint32_t *endianness_mode, 15 | struct ibv_context *context); 16 | int nvshmemt_ib_common_check_nic_ext_atomic_support(struct ibv_context *context); 17 | 18 | /* These values are not defined on all systems. 19 | * However, they can be traced back to a kernel enum with 20 | * these values. 21 | */ 22 | #ifndef MLX5DV_UAR_ALLOC_TYPE_BF 23 | #define MLX5DV_UAR_ALLOC_TYPE_BF 0x0 24 | #endif 25 | 26 | #ifndef MLX5DV_UAR_ALLOC_TYPE_NC 27 | #define MLX5DV_UAR_ALLOC_TYPE_NC 0x1 28 | #endif 29 | 30 | enum { 31 | MLX5_ATOMIC_CAP_OP_SUPPORT_CAS = 0x1, 32 | MLX5_ATOMIC_CAP_OP_SUPPORT_FA = 0x2, 33 | MLX5_ATOMIC_CAP_OP_SUPPORT_MASKED_CAS = 0x4, 34 | MLX5_ATOMIC_CAP_OP_SUPPORT_MASKED_FA = 0x8, 35 | }; 36 | 37 | enum { 38 | MLX5_ATOMIC_CAP_SIZE_SUPPORT_4B = 0x4, 39 | MLX5_ATOMIC_CAP_SIZE_SUPPORT_8B = 0x8, 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /src/include/internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
#!/bin/bash

# This script downloads hydra from a static link
# and installs it at the user-specified location.

set -e

if [ "$#" -ne 2 ]; then
    echo "Usage: ./install_hydra.sh src_dir builddir"
    echo " src_dir: location where hydra source will be downloaded"
    echo " builddir: installation directory"
    exit 1
fi

srcdir=$1
builddir=$2

# Idempotence: nothing to do if a previous run already installed the binary.
if test -f "$builddir/bin/nvshmrun.hydra"; then
    echo "hydra already installed"
    exit 0
fi

mkdir -p "$srcdir"
cd "$srcdir"
# Download and unpack hydra-4.3.2 source (extract straight from the .gz so
# no intermediate .tar file is left behind)
wget https://www.mpich.org/static/downloads/4.3.2/hydra-4.3.2.tar.gz
tar -xzf hydra-4.3.2.tar.gz

# Install hydra
cd hydra-4.3.2
# Touch generated autotools files so the build does not try to regenerate them.
touch aclocal.m4
touch Makefile.am
touch Makefile.in
touch ./modules/mpl/aclocal.m4
touch ./modules/mpl/Makefile.am
touch ./modules/mpl/Makefile.in

./configure --prefix="$builddir" --enable-cuda=no --enable-nvml=no
make
make install
rm -f -- "$builddir"/include/mpl*
# Rename the launcher to nvshmrun.hydra and remove MPI-flavored entry points.
mv "$builddir/bin/mpiexec.hydra" "$builddir/bin/nvshmrun.hydra"
# create a soft link with name nvshmrun
ln -s nvshmrun.hydra "$builddir/bin/nvshmrun"
rm -f "$builddir/bin/mpiexec" "$builddir/bin/mpirun"

echo "Hydra binaries have been installed in $builddir/bin"
#cmakedefine NVSHMEM_NVTX 16 | #cmakedefine NVSHMEM_PMIX_SUPPORT 17 | #cmakedefine NVSHMEM_SHMEM_SUPPORT 18 | #cmakedefine NVSHMEM_TIMEOUT_DEVICE_POLLING 19 | #cmakedefine NVSHMEM_UCX_SUPPORT 20 | #cmakedefine NVSHMEM_USE_DLMALLOC 21 | #cmakedefine NVSHMEM_USE_NCCL 22 | #cmakedefine NVSHMEM_USE_GDRCOPY 23 | #cmakedefine NVSHMEM_USE_MLX5DV 24 | #cmakedefine NVSHMEM_VERBOSE 25 | #cmakedefine NVSHMEM_BUILD_TESTS 26 | #cmakedefine NVSHMEM_BUILD_EXAMPLES 27 | #cmakedefine NVSHMEM_IBGDA_SUPPORT 28 | #cmakedefine NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY 29 | #cmakedefine NVSHMEM_ENABLE_ALL_DEVICE_INLINING 30 | #cmakedefine NVSHMEM_HOSTLIB_ONLY 31 | 32 | #if defined NVSHMEM_HOSTLIB_ONLY 33 | #undef NVSHMEM_IBGDA_SUPPORT 34 | #undef NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY 35 | #define NVSHMEM_ENABLE_ALL_DEVICE_INLINING 36 | #endif 37 | 38 | #if defined __clang_llvm_bitcode_lib__ 39 | #define NVSHMEM_ENABLE_ALL_DEVICE_INLINING 40 | #endif -------------------------------------------------------------------------------- /perftest/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(nvshmem_perftest_helper STATIC utils.cu) 2 | 3 | set_target_properties(nvshmem_perftest_helper PROPERTIES 4 | POSITION_INDEPENDENT_CODE ON 5 | CXX_STANDARD_REQUIRED ON 6 | CUDA_STANDARD_REQUIRED ON 7 | CXX_STANDARD ${PERFTEST_CXX_STANDARD} 8 | CUDA_STANDARD ${PERFTEST_CXX_STANDARD} 9 | CUDA_SEPARABLE_COMPILATION ON 10 | ) 11 | 12 | target_include_directories(nvshmem_perftest_helper 13 | PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 14 | target_link_libraries(nvshmem_perftest_helper PUBLIC CUDA::cudart CUDA::cuda_driver) 15 | target_link_libraries(nvshmem_perftest_helper PUBLIC nvshmem_host nvshmem_device) 16 | 17 | if(NVSHMEM_SHMEM_SUPPORT) 18 | target_compile_definitions(nvshmem_perftest_helper PUBLIC NVSHMEMTEST_SHMEM_SUPPORT) 19 | target_include_directories(nvshmem_perftest_helper PUBLIC SHMEM_INCLUDE) 20 | endif() 21 | 22 | if(NVSHMEM_MPI_SUPPORT) 23 | 
target_compile_definitions(nvshmem_perftest_helper PUBLIC NVSHMEMTEST_MPI_SUPPORT) 24 | target_include_directories(nvshmem_perftest_helper PUBLIC $) 25 | endif() 26 | 27 | target_compile_options(nvshmem_perftest_helper 28 | PRIVATE $<$:-O0;-g;> 29 | $<$,$>:-Xptxas -v> 30 | $<$,$>:-O0;-g;-G> 31 | $<$,$>:-t4> 32 | ) 33 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Contributing to NVSHMEM tests 2 | 3 | ## What is an unit test ? 4 | A unit-test under `test/unit` is limited to testing 1 top-level `nvshmem` internal API and mocking rest of the code/framework to bootstrap/teardown the aforementioned API to run either on bare-metal env or in a namespaced env (docker, VM, etc) with installed dependencies. Typically, these are rarely to never ran on GPU/NIC device. The test could include or depend directly on any nvshmem internal header files. 5 | 6 | ## What is a functional test ? 7 | A functional-test under `test/functional` is limited to testing N top-level `nvshmem` external APIs of a given library. Typically, this should rarely to never demand mocking rest of the code/framework to bootstrap/teardown the aforementioned APIs and would run on a bare-metal env on one or multiple CPU/GPU/NIC devices (single or multi-node). The test must not include or depend directly on any nvshmem internal header file or sources. 8 | 9 | ## What is an integration test ? 10 | A integration-test under `test/integration` is limited to testing N x M top-level `nvshmem` and other consumer libraries API/interfaces. Typically, this should rarely to never demand mocking rest of the code in its neighbourhood and would run on a bare-metal env on one or multiple CPU/GPU/NIC devices (single or multi-node). Similar to functional test, it must not include or depend directly on any nvshmem internal header file or sources. 
11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_team.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | /* This is a special kernel that is launched only with 11 | one thread and is used during team creation in nvshmemi_team_plit_strided fn */ 12 | template 13 | __global__ void nvshmemi_reduce_kernel(int start, int stride, int size, TYPE *dst, 14 | const TYPE *source, size_t nreduce, TYPE *pWrk, 15 | volatile long *pSync, volatile long *sync_counter) { 16 | #ifdef __CUDA_ARCH__ 17 | gpu_rdxn_on_demand_2(start, stride, size, dst, source, nreduce, pWrk, pSync, 18 | sync_counter); 19 | #endif 20 | } 21 | 22 | template __global__ void nvshmemi_reduce_kernel( 23 | int, int, int, unsigned char *, unsigned char const *, unsigned long, unsigned char *, 24 | long volatile *, long volatile *); 25 | template __global__ void nvshmemi_reduce_kernel(int, int, int, int *, int const *, 26 | unsigned long, int *, 27 | long volatile *, long volatile *); 28 | -------------------------------------------------------------------------------- /test/host/init/mpi_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | int main(int c, char *v[]) { 14 | int rank, nranks; 15 | int mype_node, npes_node; 16 | MPI_Comm mpi_comm; 17 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 18 | int dev_count; 19 | MPI_Init(&c, &v); 20 | 21 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 22 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 23 | 24 | DEBUG_PRINT("MPI: [%d of %d] hello MPI world! \n", rank, nranks); 25 | 26 | mpi_comm = MPI_COMM_WORLD; 27 | attr.mpi_comm = &mpi_comm; 28 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); 29 | 30 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 31 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 32 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 33 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 34 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 35 | 36 | #ifdef _NVSHMEM_DEBUG 37 | int mype, npes; 38 | mype = nvshmem_my_pe(); 39 | npes = nvshmem_n_pes(); 40 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 41 | #endif 42 | 43 | MPI_Barrier(MPI_COMM_WORLD); 44 | 45 | nvshmem_finalize(); 46 | 47 | MPI_Finalize(); 48 | 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /test/host/init/nvshmemx_init_status.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype_node, npes_node; 14 | int dev_count; 15 | 16 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_NOT_INITIALIZED); 17 | nvshmem_init(); 18 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_IS_BOOTSTRAPPED); 19 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 20 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 21 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 22 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 23 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 24 | 25 | #ifdef _NVSHMEM_DEBUG 26 | int mype = nvshmem_my_pe(); 27 | int npes = nvshmem_n_pes(); 28 | #endif 29 | DEBUG_PRINT("[%d of %d] hello shmem world! \n", mype, npes); 30 | 31 | nvshmem_barrier_all(); 32 | assert(nvshmemx_init_status() >= NVSHMEM_STATUS_IS_INITIALIZED); 33 | 34 | if (npes_per_gpu > 1) assert(nvshmemx_init_status() >= NVSHMEM_STATUS_LIMITED_MPG); 35 | 36 | nvshmem_finalize(); 37 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_IS_BOOTSTRAPPED); 38 | 39 | nvshmem_init(); 40 | assert(nvshmemx_init_status() >= NVSHMEM_STATUS_IS_INITIALIZED); 41 | nvshmem_finalize(); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /nvshmem4py/build_assets/numbast/templates/config_nvshmem.yml.j2: -------------------------------------------------------------------------------- 1 | Name: NVSHMEM Device Bindings 2 | Version: {{CONFIG_VERSION}} 3 | Entry Point: {{ENTRY_POINT_PATH}} 4 | # The list of files, from which the APIs is allow-listed. 
5 | File List: 6 | - {{NVSHMEM_HOME}}/src/include/device/nvshmem_coll_defines.cuh 7 | - {{NVSHMEM_HOME}}/src/include/device/nvshmem_defines.h 8 | - {{NVSHMEM_HOME}}/src/include/device/nvshmemx_coll_defines.cuh 9 | - {{NVSHMEM_HOME}}/src/include/device/nvshmemx_defines.h 10 | 11 | GPU Arch: 12 | # sm_70 is used to parse the declarations of nvshmem API. When code is jitted 13 | # at runtime, Numba will use the runtime CC to generate code. 14 | - sm_70 15 | 16 | Clang Include Paths: 17 | - {{NVSHMEM_HOME}}/src/include 18 | {{CUDA13_CCCL_INCLUDE_PATH}} 19 | 20 | Macro-expanded Function Prefixes: 21 | - nvshmem_ 22 | - nvshmemx_ 23 | 24 | Predefined Macros: 25 | - __NVSHMEM_NUMBA_SUPPORT__ 26 | 27 | Output Name: {{OUTPUT_NAME}} 28 | 29 | Cooperative Launch Required Functions Regex: 30 | {{COOPERATIVE_LAUNCH_REQUIRED_FUNCTIONS}} 31 | 32 | API Prefix Removal: 33 | Function: 34 | - "nvshmem_" 35 | - "nvshmemx_" 36 | 37 | Additional Import: 38 | - "nvshmem.bindings" 39 | 40 | Module Callbacks: 41 | setup: "lambda x: nvshmem.bindings.cumodule_init(int(x))" 42 | teardown: "lambda x: nvshmem.bindings.cumodule_finalize(int(x))" 43 | 44 | Shim Include Override: "\"entry_point.h\"" 45 | 46 | Exclude: 47 | Function: 48 | - "nvshmem_ptr" 49 | - "nvshmem_mc_ptr" -------------------------------------------------------------------------------- /test/host/init/shmem_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #ifdef NVSHMEMTEST_SHMEM_SUPPORT 12 | #include "shmem.h" 13 | #include "shmemx.h" 14 | #endif 15 | #include "utils.h" 16 | 17 | int main(int c, char *v[]) { 18 | int nv_npes_node, nv_mype_node; 19 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 20 | int dev_count; 21 | shmem_init(); 22 | DEBUG_PRINT("shmem_init done\n"); 23 | 24 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_SHMEM, &attr); 25 | #ifdef _NVSHMEM_DEBUG 26 | int nv_mype, nv_npes; 27 | nv_mype = nvshmem_my_pe(); 28 | nv_npes = nvshmem_n_pes(); 29 | DEBUG_PRINT("NVSHMEM: [%d of %d] hello nvshmem world! \n", nv_mype, nv_npes); 30 | #endif 31 | nv_mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 32 | nv_npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 33 | DEBUG_PRINT("NVSHMEM TEAM NODE: [%d of %d] hello nvshmem team node world! \n", nv_mype_node, 34 | nv_npes_node); 35 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 36 | int npes_per_gpu = (nv_npes_node + dev_count - 1) / dev_count; 37 | CUDA_CHECK(cudaSetDevice(nv_mype_node / npes_per_gpu)); 38 | 39 | nvshmem_barrier_all(); 40 | shmem_barrier_all(); 41 | 42 | nvshmem_finalize(); 43 | 44 | shmem_finalize(); 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/_internal_tracking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | # 9 | # See License.txt for license information 10 | 11 | """ 12 | Internal tracking for NVShmem 13 | 14 | This file contains things like buffer management, status, etc. 15 | """ 16 | from enum import IntEnum 17 | 18 | """ 19 | Map of Device IDs from cuda.core to MemoryResource (NvshmemResource) objects 20 | Used to avoid re-creating NvshmemResources every time someone calls nvshmem.core.allocate() 21 | """ 22 | _mr_references = {} 23 | 24 | """ 25 | class for Internal Init Status 26 | """ 27 | class InternalInitStatus(IntEnum): 28 | UNINITIALIZED = 0 29 | INITIALIZED = 1 30 | DE_INITIALIZED = 2 # Keeps bootstrap 31 | 32 | """ 33 | Set to True after initializing. Used for safety checks before functions 34 | """ 35 | _is_initialized = {"status": InternalInitStatus.UNINITIALIZED} 36 | 37 | """ 38 | Each NVSHMEM process needs to be assocaited with a device. We cache that here. 39 | """ 40 | _cached_device = {"device": None} 41 | 42 | """ 43 | Debug mode is used to avoid redundant calls to Device() 44 | """ 45 | _debug_mode = False 46 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_collective.py: -------------------------------------------------------------------------------- 1 | import cffi 2 | import argparse 3 | 4 | from cuda.core.experimental import Device 5 | 6 | from numba import cuda, int32 7 | 8 | import nvshmem 9 | from nvshmem.bindings import my_pe, n_pes 10 | from nvshmem.bindings.device.numba import int_p, barrier_all 11 | 12 | from utils import uid_init, mpi_init 13 | 14 | def test_collective(dev: Device): 15 | 16 | ffi = cffi.FFI() 17 | 18 | 19 | @cuda.jit(lto=True) 20 | def reduce_ring(dest, mype, npes): 21 | target = ffi.from_buffer(dest) 22 | peer = int32((mype + 1) % npes) 23 | lvalue = mype 24 | 25 | for i in range(npes): 26 | int_p(target, lvalue, peer) 27 | barrier_all() 28 | lvalue = target[0] + mype 29 | barrier_all() 30 | 31 | mype = my_pe() 32 | npes = n_pes() 33 | 34 | dest = 
nvshmem.core.array((1,), dtype="int32") 35 | 36 | reduce_ring[1, 1, 0](dest, mype, npes) 37 | 38 | dev.sync() 39 | 40 | print(f"{my_pe()}: received message {dest[0]}") 41 | 42 | nvshmem.core.free_array(dest) 43 | nvshmem.core.finalize() 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 49 | args = parser.parse_args() 50 | if args.init_type == "uid": 51 | dev = uid_init() 52 | elif args.init_type == "mpi": 53 | dev = mpi_init() 54 | 55 | test_collective(dev) -------------------------------------------------------------------------------- /test/host/init/init_loop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef NVSHMEMTEST_MPI_SUPPORT 6 | #include 7 | #endif 8 | #include 9 | #include 10 | #include "utils.h" 11 | 12 | #define INIT_DEFAULT_ITERS 150 13 | 14 | int main(int argc, char *argv[]) { 15 | int devices, num_iters = 0; 16 | const char *test_num_iter = getenv("NVSHMEMTEST_INIT_NUM_ITERS"); 17 | read_args(argc, argv); 18 | init_wrapper(&argc, &argv); 19 | nvshmem_barrier_all(); 20 | nvshmem_finalize(); 21 | 22 | if (test_num_iter) { 23 | num_iters = atoi(test_num_iter); 24 | } 25 | 26 | if (num_iters <= 0) { 27 | num_iters = INIT_DEFAULT_ITERS; 28 | } 29 | 30 | for (int i = 0; i < num_iters; i++) { 31 | printf("Step %d\n", i); 32 | nvshmem_init(); 33 | int *destination = NULL; 34 | if (use_mmap) { 35 | destination = (int *)allocate_mmap_buffer(sizeof(int), _mem_handle_type, use_egm); 36 | free_mmap_buffer(destination); 37 | } else { 38 | destination = (int *)nvshmem_malloc(sizeof(int)); 39 | nvshmem_free(destination); 40 | } 41 | nvshmem_finalize(); 42 | printf("Step %d done\n", i); 43 | } 44 | nvshmem_init(); /* finalize_wrapper will call nvshmem_finalize(); 45 | this is the corresponding init for it */ 46 | 
finalize_wrapper(); /* should finalize bootstrap stuff as well */
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | __global__ void simple_shift(int *destination) { 19 | int mype = nvshmem_my_pe(); 20 | int npes = nvshmem_n_pes(); 21 | int peer = (mype + 1) % npes; 22 | 23 | nvshmem_int_p(destination, mype, peer); 24 | } 25 | 26 | int main(void) { 27 | int mype_node, msg; 28 | cudaStream_t stream; 29 | 30 | nvshmem_init(); 31 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 32 | cudaSetDevice(mype_node); 33 | cudaStreamCreate(&stream); 34 | 35 | int *destination = (int *)nvshmem_malloc(sizeof(int)); 36 | 37 | simple_shift<<<1, 1, 0, stream>>>(destination); 38 | nvshmemx_barrier_all_on_stream(stream); 39 | cudaMemcpyAsync(&msg, destination, sizeof(int), cudaMemcpyDeviceToHost, stream); 40 | 41 | cudaStreamSynchronize(stream); 42 | printf("%d: received message %d\n", nvshmem_my_pe(), msg); 43 | 44 | nvshmem_free(destination); 45 | nvshmem_finalize(); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /test/host/mem/calloc.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "utils.h" 12 | 13 | #define NELEM (1 << 20) 14 | 15 | int main(int argc, char **argv) { 16 | long *dev_buf, *host_buf; 17 | size_t i, err = 0; 18 | 19 | init_wrapper(&argc, &argv); 20 | 21 | /* Check count == 0 */ 22 | dev_buf = (long *)nvshmem_calloc(0, sizeof(long)); 23 | if (dev_buf != NULL) { 24 | printf("Error, zero element calloc did not return NULL\n"); 25 | ++err; 26 | } 27 | nvshmem_free(dev_buf); 28 | 29 | /* Check size == 0 */ 30 | dev_buf = (long *)nvshmem_calloc(NELEM, 0); 31 | if (dev_buf != NULL) { 32 | printf("Error, zero size calloc did not return NULL\n"); 33 | ++err; 34 | } 35 | nvshmem_free(dev_buf); 36 | 37 | /* Check that memory is cleared: calloc, set, free, calloc */ 38 | dev_buf = (long *)nvshmem_calloc(NELEM, sizeof(long)); 39 | cudaMemset(dev_buf, 0xAA, NELEM * sizeof(long)); 40 | nvshmem_free(dev_buf); 41 | 42 | host_buf = (long *)calloc(NELEM, sizeof(long)); 43 | dev_buf = (long *)nvshmem_calloc(NELEM, sizeof(long)); 44 | cudaMemcpy(host_buf, dev_buf, NELEM * sizeof(long), cudaMemcpyDeviceToHost); 45 | 46 | for (i = 0; i < NELEM; i++) 47 | if (host_buf[i]) ++err; 48 | 49 | free(host_buf); 50 | nvshmem_free(dev_buf); 51 | finalize_wrapper(); 52 | 53 | return err != 0; 54 | } 55 | -------------------------------------------------------------------------------- /src/modules/transport/common/transport_gdr_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _TRANSPORT_GDR_COMMON_H 8 | #define _TRANSPORT_GDR_COMMON_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | #include 13 | 14 | #include "gdrapi.h" 15 | 16 | struct gdrcopy_function_table { 17 | gdr_t (*open)(); 18 | int (*close)(gdr_t g); 19 | int (*pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, 20 | uint32_t va_space, gdr_mh_t *handle); 21 | int (*unpin_buffer)(gdr_t g, gdr_mh_t handle); 22 | int (*get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info); 23 | int (*map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); 24 | int (*unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); 25 | int (*copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); 26 | int (*copy_to_mapping)(gdr_mh_t handle, const void *map_d_ptr, void *h_ptr, size_t size); 27 | void (*runtime_get_version)(int *major, int *minor); 28 | int (*driver_get_version)(gdr_t g, int *major, int *minor); 29 | }; 30 | 31 | bool nvshmemt_gdrcopy_ftable_init(struct gdrcopy_function_table *gdrcopy_ftable, gdr_t *gdr_desc, 32 | void **gdrcopy_handle, int log_level); 33 | void nvshmemt_gdrcopy_ftable_fini(struct gdrcopy_function_table *gdrcopy_ftable, gdr_t *gdr_desc, 34 | void **gdrcopy_handle); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmi/pmi-2/COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | COPYRIGHT 3 | 4 | The following is a notice of limited availability of the code, and disclaimer 5 | which must be included in the prologue of the code and in all source listings 6 | of the code. 7 | 8 | Copyright Notice 9 | + 2002 University of Chicago 10 | 11 | Permission is hereby granted to use, reproduce, prepare derivative works, and 12 | to redistribute to others. 
This software was authored by: 13 | 14 | Mathematics and Computer Science Division 15 | Argonne National Laboratory, Argonne IL 60439 16 | 17 | (and) 18 | 19 | Department of Computer Science 20 | University of Illinois at Urbana-Champaign 21 | 22 | 23 | GOVERNMENT LICENSE 24 | 25 | Portions of this material resulted from work developed under a U.S. 26 | Government Contract and are subject to the following license: the Government 27 | is granted for itself and others acting on its behalf a paid-up, nonexclusive, 28 | irrevocable worldwide license in this computer software to reproduce, prepare 29 | derivative works, and perform publicly and display publicly. 30 | 31 | DISCLAIMER 32 | 33 | This computer code material was prepared, in part, as an account of work 34 | sponsored by an agency of the United States Government. Neither the United 35 | States, nor the University of Chicago, nor any of their employees, makes any 36 | warranty express or implied, or assumes any legal liability or responsibility 37 | for the accuracy, completeness, or usefulness of any information, apparatus, 38 | product, or process disclosed, or represents that its use would not infringe 39 | privately owned rights. 
40 | -------------------------------------------------------------------------------- /src/modules/transport/common/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | macro(add_helper_library LIBRARY_NAME NEED_CUDA SOURCE_LIST) 2 | add_library(${LIBRARY_NAME} STATIC ${LINK_REQUIREMENTS} ${SOURCE_LIST}) 3 | 4 | set_target_properties(${LIBRARY_NAME} PROPERTIES 5 | POSITION_INDEPENDENT_CODE ON 6 | CXX_STANDARD_REQUIRED ON 7 | CUDA_STANDARD_REQUIRED ON 8 | CXX_STANDARD 11 9 | CUDA_STANDARD 11 10 | CUDA_SEPARABLE_COMPILATION ON 11 | ) 12 | 13 | target_include_directories(${LIBRARY_NAME} PUBLIC 14 | ${CMAKE_CURRENT_SOURCE_DIR} 15 | ) 16 | 17 | target_include_directories(${LIBRARY_NAME} PRIVATE 18 | ${CMAKE_SOURCE_DIR}/include 19 | ${CUDAToolkit_INCLUDE_DIRS} 20 | ) 21 | 22 | if(NEED_CUDA) 23 | target_link_libraries(${LIBRARY_NAME} PRIVATE CUDA::cudart_static) 24 | endif() 25 | endmacro() 26 | 27 | add_helper_library(nvshmem_transport_common OFF transport_common.cpp) 28 | 29 | if(NVSHMEM_USE_GDRCOPY) 30 | add_helper_library(nvshmem_transport_gdr_common OFF transport_gdr_common.cpp) 31 | target_include_directories(nvshmem_transport_gdr_common PUBLIC ${GDRCOPY_INCLUDE}) 32 | endif() 33 | 34 | if(NVSHMEM_BUILD_IBDEVX_TRANSPORT OR NVSHMEM_BUILD_IBGDA_TRANSPORT OR NVSHMEM_BUILD_IBRC_TRANSPORT) 35 | add_helper_library(nvshmem_transport_ib_common ON transport_ib_common.cpp) 36 | 37 | if(NVSHMEM_BUILD_IBDEVX_TRANSPORT OR NVSHMEM_BUILD_IBGDA_TRANSPORT) 38 | add_helper_library(nvshmem_transport_mlx5_common OFF transport_mlx5_common.cpp) 39 | target_link_libraries(nvshmem_transport_mlx5_common PRIVATE MLX5_lib) 40 | endif() 41 | endif() 42 | -------------------------------------------------------------------------------- /test/host/team/shmem_team_reuse_teams.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * * 4 | * * See License.txt for license information 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "utils.h" 11 | 12 | int main(int argc, char **argv) { 13 | int i, me, npes; 14 | int ret = 0, errors = 0; 15 | 16 | init_wrapper(&argc, &argv); 17 | 18 | me = nvshmem_my_pe(); 19 | npes = nvshmem_n_pes(); 20 | 21 | if (me == 0) printf("Reuse teams test\n"); 22 | 23 | nvshmem_team_t old_team, new_team; 24 | ret = nvshmem_team_split_strided(NVSHMEM_TEAM_WORLD, 0, 1, npes, NULL, 0, &old_team); 25 | if (ret) ++errors; 26 | 27 | /* A total of npes-1 iterations are performed, where the active set in iteration i 28 | * includes PEs i..npes-1. The size of the team decreases by 1 each iteration. */ 29 | for (i = 1; i < npes; i++) { 30 | if (me == i) { 31 | printf("%3d: creating new team (start, stride, size): %3d, %3d, %3d\n", me, 32 | nvshmem_team_translate_pe(old_team, 1, NVSHMEM_TEAM_WORLD), 1, 33 | nvshmem_team_n_pes(old_team) - 1); 34 | } 35 | 36 | ret = nvshmem_team_split_strided(old_team, 1, 1, nvshmem_team_n_pes(old_team) - 1, NULL, 0, 37 | &new_team); 38 | if (old_team != NVSHMEM_TEAM_INVALID && ret) ++errors; 39 | 40 | nvshmem_team_destroy(old_team); 41 | old_team = new_team; 42 | } 43 | nvshmem_team_destroy(old_team); 44 | finalize_wrapper(); 45 | 46 | return errors != 0; 47 | } 48 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(shmem_atomic_bw.cu) 2 | nvshmem_add_perftest(shmem_atomic_latency.cu) 3 | nvshmem_add_perftest(shmem_atomic_ping_pong_latency.cu) 4 | nvshmem_add_perftest(shmem_g_bw.cu) 5 | nvshmem_add_perftest(shmem_g_latency.cu) 6 | nvshmem_add_perftest(shmem_get_bw.cu) 7 | nvshmem_add_perftest(shmem_get_latency.cu) 8 | nvshmem_add_perftest(shmem_p_bw.cu) 9 | nvshmem_add_perftest(shmem_p_latency.cu) 10 | 
nvshmem_add_perftest(shmem_p_ping_pong_latency.cu) 11 | nvshmem_add_perftest(shmem_put_atomic_ping_pong_latency.cu) 12 | nvshmem_add_perftest(shmem_put_bw.cu) 13 | nvshmem_add_perftest(shmem_put_latency.cu) 14 | nvshmem_add_perftest(shmem_put_ping_pong_latency.cu) 15 | nvshmem_add_perftest(shmem_put_signal_ping_pong_latency.cu) 16 | nvshmem_add_perftest(shmem_signal_ping_pong_latency.cu) 17 | nvshmem_add_perftest(shmem_st_bw.cu) 18 | 19 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 20 | nvshmem_add_cubin_perftest(shmem_atomic_bw.cu) 21 | nvshmem_add_cubin_perftest(shmem_atomic_latency.cu) 22 | nvshmem_add_cubin_perftest(shmem_atomic_ping_pong_latency.cu) 23 | nvshmem_add_cubin_perftest(shmem_g_latency.cu) 24 | nvshmem_add_cubin_perftest(shmem_get_latency.cu) 25 | nvshmem_add_cubin_perftest(shmem_p_latency.cu) 26 | nvshmem_add_cubin_perftest(shmem_p_ping_pong_latency.cu) 27 | nvshmem_add_cubin_perftest(shmem_put_atomic_ping_pong_latency.cu) 28 | nvshmem_add_cubin_perftest(shmem_put_latency.cu) 29 | nvshmem_add_cubin_perftest(shmem_put_ping_pong_latency.cu) 30 | nvshmem_add_cubin_perftest(shmem_put_signal_ping_pong_latency.cu) 31 | nvshmem_add_cubin_perftest(shmem_signal_ping_pong_latency.cu) 32 | endif() 33 | -------------------------------------------------------------------------------- /test/device/coll/coll_test.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef COLL_TEST_H 8 | #define COLL_TEST_H 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "nvshmem.h" 14 | #include "nvshmemx.h" 15 | 16 | #ifdef NVSHMEMTEST_MPI_SUPPORT 17 | #include "mpi.h" 18 | #endif 19 | #include "utils.h" 20 | #include 21 | #include 22 | 23 | #define ELEMS_PER_THREAD 32 24 | #define NVSHM_TEST_NUM_TPB 32 25 | #undef MAX_ELEMS 26 | #define MAX_ELEMS (ELEMS_PER_THREAD * NVSHM_TEST_NUM_TPB) 27 | #define MAX_NPES 128 28 | #define MAX_ITER 32 29 | #define LARGEST_DT uint64_t 30 | 31 | #define CUDA_RUNTIME_CHECK(stmt) \ 32 | do { \ 33 | cudaError_t result = (stmt); \ 34 | if (cudaSuccess != result) { \ 35 | fprintf(stderr, "[%s:%d] cuda failed with %s \n", __FILE__, __LINE__, \ 36 | cudaGetErrorString(result)); \ 37 | status = -1; \ 38 | goto out; \ 39 | } \ 40 | assert(cudaSuccess == result); \ 41 | } while (0) 42 | 43 | #endif /*COLL_TEST_H*/ 44 | -------------------------------------------------------------------------------- /test/device/sync/sync_test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "nvshmem.h" 8 | #include "nvshmemx.h" 9 | #include "utils.h" 10 | 11 | __device__ int error_d; 12 | 13 | __global__ void test_nvshmem_test_kernel(uint64_t *remote, int mype, int npes) { 14 | nvshmemx_signal_op(remote, (uint64_t)mype + 1, NVSHMEM_SIGNAL_SET, (mype + 1) % npes); 15 | 16 | while (!nvshmem_uint64_test(remote, NVSHMEM_CMP_NE, 0)) 17 | ; 18 | if (*remote != ((uint64_t)mype + npes - 1) % npes + 1) { 19 | printf("PE %d received incorrect value", mype); 20 | error_d = 1; 21 | } 22 | } 23 | 24 | int main(int argc, char *argv[]) { 25 | read_args(argc, argv); 26 | init_wrapper(&argc, &argv); 27 | const int mype = nvshmem_my_pe(); 28 | const int npes = nvshmem_n_pes(); 29 | 30 | int zero = 0, ret_val; 31 | cudaMemcpyToSymbol(error_d, &zero, sizeof(int), 0); 32 | cudaDeviceSynchronize(); 33 | 34 | uint64_t *remote; 35 | if (use_mmap) { 36 | remote = 37 | (uint64_t *)allocate_mmap_buffer(sizeof(uint64_t), _mem_handle_type, use_egm, true); 38 | } else { 39 | remote = (uint64_t *)nvshmem_malloc(sizeof(uint64_t)); 40 | cudaMemset(remote, 0, sizeof(uint64_t)); 41 | } 42 | nvshmem_barrier_all(); 43 | /* The kernel is designed for 1 thread */ 44 | test_nvshmem_test_kernel<<<1, 1>>>(remote, mype, npes); 45 | cudaDeviceSynchronize(); 46 | 47 | cudaMemcpyFromSymbol(&ret_val, error_d, sizeof(int), 0); 48 | 49 | finalize_wrapper(); 50 | 51 | return ret_val; 52 | } 53 | -------------------------------------------------------------------------------- /test/host/mem/malloc_simple.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "cuda_runtime.h" 12 | #include "utils.h" 13 | 14 | #define MAX_SIZE 128 * 1024 * 1024 15 | 16 | int main(int argc, char **argv) { 17 | int status = 0; 18 | int mype; 19 | size_t size; 20 | char *buffer = NULL; 21 | char size_string[100]; 22 | 23 | size = (size_t)MAX_SIZE * 2; 24 | sprintf(size_string, "%zu", size); 25 | 26 | status = setenv("NVSHMEM_SYMMETRIC_SIZE", size_string, 1); 27 | if (status) { 28 | ERROR_PRINT("setenv failed \n"); 29 | status = -1; 30 | goto out; 31 | } 32 | 33 | init_wrapper(&argc, &argv); 34 | 35 | mype = nvshmem_my_pe(); 36 | #ifdef _NVSHMEM_DEBUG 37 | npes = nvshmem_n_pes(); 38 | #endif 39 | 40 | for (size = 1; size <= MAX_SIZE; size *= 2) { 41 | buffer = (char *)nvshmem_malloc(size); 42 | if (!buffer) { 43 | ERROR_PRINT("shmem_malloc failed \n"); 44 | status = -1; 45 | goto out; 46 | } 47 | 48 | cudaMemset(buffer, 0, size); 49 | 50 | if (!mype) 51 | DEBUG_PRINT("[%d of %d] allocated symmetric object: %p size: %zu bytes \n", mype, npes, 52 | buffer, size); 53 | 54 | nvshmem_free(buffer); 55 | 56 | if (!mype) DEBUG_PRINT("[%d of %d] free symmetric object: %p \n", mype, npes, buffer); 57 | } 58 | 59 | out: 60 | finalize_wrapper(); 61 | return status; 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | *.dirstamp 35 | *.deps 36 | *.log 37 
| *.status 38 | 39 | #emacs backup files 40 | #++++ 41 | *.*~ 42 | 43 | # Vim backup files 44 | *.*.swp 45 | 46 | # IDE trash folder 47 | .idea 48 | .vscode 49 | 50 | # Generated sources 51 | src/device/comm/transfer_device.cu 52 | src/include/non_abi/nvshmem_build_options.h 53 | src/include/non_abi/nvshmem_version.h 54 | test/common/include/non_abi/device/pt-to-pt/transfer_device.cuh 55 | 56 | # gtest artifacts 57 | _deps 58 | 59 | # tarballs 60 | *.tar 61 | 62 | # Build and install targets 63 | build/ 64 | bin/ 65 | install/ 66 | perftest_install/ 67 | examples_install/ 68 | test_install/ 69 | git_commit.txt 70 | version.txt 71 | 72 | # local debug for coverity 73 | .gitlab-ci-4-coverity*.yml 74 | .gitlab-ci-4-coverage*.yml 75 | .Makefile* 76 | .version.mk 77 | .common.mk 78 | 79 | # CMake artifacts 80 | *CMakeFiles* 81 | CMakeCache.txt 82 | *Makefile 83 | CPack*Config.cmake 84 | NVSHMEM*.cmake 85 | *CTest*.cmake 86 | cmake_install.cmake 87 | 88 | # nvshmem4py objects 89 | nvshmem4py/CMakeFiles/ 90 | nvshmem4py/*.cmake 91 | nvshmem4py/nvshmem/bindings/*.cpp 92 | nvshmem4py/nvshmem/bindings/_internal/*.cpp 93 | nvshmem4py/*.egg-info 94 | nvshmem4py/pyproject.toml 95 | 96 | # Filesystem objects 97 | .DS_STORE 98 | .nfs* 99 | -------------------------------------------------------------------------------- /src/include/bootstrap_device_host/nvshmem_uniqueid.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _NVSHMEM_UNIQUEID_H_ 3 | #define _NVSHMEM_UNIQUEID_H_ 4 | 5 | #define UNIQUEID_PADDING 124 6 | #define UNIQUEID_ARGS_INVALID -1 7 | #if !defined __CUDACC_RTC__ 8 | #include // for NULL 9 | #define NVSHMEMX_UNIQUEID_INITIALIZER \ 10 | { \ 11 | (1 << 16) + sizeof(nvshmemx_uniqueid_t), /* version */ \ 12 | { \ 13 | 0 \ 14 | } \ 15 | } 16 | 17 | #define NVSHMEMX_UNIQUEID_ARGS_INITIALIZER \ 18 | { \ 19 | (1 << 16) + sizeof(nvshmemx_uniqueid_args_t), /* version */ \ 20 | NULL, /* id */ \ 21 | UNIQUEID_ARGS_INVALID, /* 
myrank */ \ 22 | UNIQUEID_ARGS_INVALID /* nranks */ \ 23 | } 24 | #endif 25 | typedef struct { 26 | int version; 27 | char internal[UNIQUEID_PADDING]; 28 | } nvshmemx_uniqueid_v1; 29 | static_assert(sizeof(nvshmemx_uniqueid_v1) == 128, "uniqueid_v1 must be 128 bytes."); 30 | 31 | typedef nvshmemx_uniqueid_v1 nvshmemx_uniqueid_t; 32 | 33 | typedef struct { 34 | int version; 35 | nvshmemx_uniqueid_v1 *id; 36 | int myrank; 37 | int nranks; 38 | } nvshmemx_uniqueid_args_v1; 39 | static_assert(sizeof(nvshmemx_uniqueid_args_v1) == 24, "uniqueid_args_v1 must be 24 bytes."); 40 | 41 | typedef nvshmemx_uniqueid_args_v1 nvshmemx_uniqueid_args_t; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/host/coll/rdxn/rdxn.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_RDXN_COMMON_CPU_H 8 | #define NVSHMEMI_RDXN_COMMON_CPU_H 9 | #include 10 | #include 11 | 12 | #include "cpu_coll.h" 13 | #include "non_abi/nvshmem_build_options.h" 14 | #include "device_host/nvshmem_common.cuh" 15 | #include "internal/host/nvshmem_internal.h" 16 | #include "device_host/nvshmem_types.h" 17 | #include "internal/host/util.h" 18 | #ifdef NVSHMEM_USE_NCCL 19 | #include "nccl.h" 20 | #endif 21 | 22 | template 23 | void nvshmemi_call_rdxn_on_stream_kernel(nvshmem_team_t team, TYPE *dest, const TYPE *source, 24 | size_t nreduce, cudaStream_t stream); 25 | 26 | template 27 | int nvshmemi_reduce_on_stream(nvshmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce, 28 | cudaStream_t stream) { 29 | #ifdef NVSHMEM_USE_NCCL 30 | nvshmemi_team_t *teami = nvshmemi_team_pool[team]; 31 | if (teami->nvls_rsc_base_ptr == NULL && nvshmemi_use_nccl && 32 | nvshmemi_get_nccl_op() != ncclNumOps && nvshmemi_get_nccl_dt() != ncclNumTypes) { 33 | 
NCCL_CHECK(nccl_ftable.AllReduce(source, dest, nreduce, nvshmemi_get_nccl_dt(), 34 | nvshmemi_get_nccl_op(), (ncclComm_t)teami->nccl_comm, 35 | stream)); 36 | } else 37 | #endif /* NVSHMEM_USE_NCCL */ 38 | { 39 | nvshmemi_call_rdxn_on_stream_kernel(team, dest, source, nreduce, stream); 40 | } 41 | return 0; 42 | } 43 | 44 | #endif /* NVSHMEMI_RDXN_COMMON_CPU_H */ 45 | -------------------------------------------------------------------------------- /src/host/comm/fence.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "host/nvshmem_api.h" // IWYU pragma: keep 8 | #include 9 | #include 10 | 11 | #include "internal/host/nvshmem_internal.h" 12 | #include "internal/host/nvshmemi_types.h" 13 | #include "internal/host/nvshmem_nvtx.hpp" 14 | #include "non_abi/nvshmemx_error.h" 15 | #include "internal/host_transport/transport.h" 16 | #include "internal/host/util.h" 17 | 18 | void nvshmem_fence(void) { 19 | NVTX_FUNC_RANGE_IN_GROUP(MEMORDER); 20 | NVSHMEMI_CHECK_INIT_STATUS(); 21 | 22 | int status; 23 | int tbitmap = nvshmemi_state->transport_bitmap; 24 | for (int j = 0; j < nvshmemi_state->num_initialized_transports; j++) { 25 | if (tbitmap & 1) { 26 | struct nvshmem_transport *tcurr = 27 | ((nvshmem_transport_t *)nvshmemi_state->transports)[j]; 28 | if ((tcurr->attr & NVSHMEM_TRANSPORT_ATTR_NO_ENDPOINTS)) { 29 | for (int s = 0; s < nvshmemi_options.MAX_PEER_STREAMS; s++) { 30 | cudaStream_t custrm = nvshmemi_state->custreams[s]; 31 | CUDA_RUNTIME_CHECK_GOTO(cudaStreamSynchronize(custrm), status, out); 32 | } 33 | } else if (tcurr->host_ops.fence) { 34 | for (int k = 0; k < nvshmemi_state->npes; k++) { 35 | status = tcurr->host_ops.fence(tcurr, k, 0, NVSHMEMX_QP_HOST); 36 | NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, 37 | "nvshmem_fence() failed \n"); 38 | } 39 | } 40 | 
} 41 | tbitmap >>= 1; 42 | } 43 | out: 44 | return; 45 | } 46 | -------------------------------------------------------------------------------- /src/include/internal/bootstrap_host/nvshmemi_bootstrap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | #ifndef NVSHMEMI_BOOTSTRAP_H 7 | #define NVSHMEMI_BOOTSTRAP_H 8 | 9 | #include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h" 10 | #include "non_abi/nvshmem_version.h" 11 | /* Version = major * 10000 + minor * 100 + patch*/ 12 | /* ABI Introduced in NVSHMEM 2.8.0 */ 13 | #define NVSHMEMI_BOOTSTRAP_ABI_VERSION \ 14 | (NVSHMEM_BOOTSTRAP_PLUGIN_MAJOR_VERSION * 10000 + \ 15 | NVSHMEM_BOOTSTRAP_PLUGIN_MINOR_VERSION * 100 + NVSHMEM_BOOTSTRAP_PLUGIN_PATCH_VERSION) 16 | 17 | #define NVSHMEM_BOOTSTRAP_MAJOR_VERSION(ver) (ver / 10000) 18 | #define NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(ver) (ver / 100) 19 | 20 | static bool nvshmemi_is_bootstrap_compatible(int bootstrap_version, int nvshmem_version, 21 | bool boot_backward_compatible) { 22 | if (NVSHMEM_BOOTSTRAP_MAJOR_VERSION(bootstrap_version) != 23 | NVSHMEM_BOOTSTRAP_MAJOR_VERSION(nvshmem_version)) { 24 | return false; 25 | } 26 | 27 | if (NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(nvshmem_version) < 28 | NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(bootstrap_version)) { 29 | if (boot_backward_compatible) { 30 | return true; 31 | } 32 | return false; 33 | } 34 | return true; 35 | } 36 | 37 | #if __cplusplus 38 | extern "C" { 39 | #endif 40 | int nvshmemi_bootstrap_plugin_init(void *mpi_comm, bootstrap_handle_t *handle, 41 | const int nvshmem_version); 42 | int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t *handle, const int nvshmem_version); 43 | #if __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- 
/test/host/interop/app.cu: -------------------------------------------------------------------------------- 1 | #include "simplelib1.h" 2 | #include "simplelib2.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | __device__ int num_errors_d; 11 | __global__ void app_nvshmem_kernel(int *array) { 12 | int my_pe = nvshmem_my_pe(); 13 | int n_pes = nvshmem_n_pes(); 14 | int next_pe = (my_pe + 1) % n_pes; 15 | int prev_pe = (my_pe - 1 + n_pes) % n_pes; 16 | nvshmem_int_p(array, my_pe, next_pe); 17 | nvshmem_barrier_all(); 18 | 19 | if (array[0] != prev_pe) { 20 | printf("app: incorrect value found, expected = %d, found = %d\n", prev_pe, array[0]); 21 | num_errors_d = 1; 22 | } 23 | } 24 | 25 | int app_dowork() { 26 | int *array = (int *)nvshmem_calloc(1, sizeof(int)); 27 | int num_errors = 0; 28 | app_nvshmem_kernel<<<1, 1>>>(array); 29 | cudaDeviceSynchronize(); 30 | cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int)); 31 | nvshmem_free(array); 32 | return num_errors; 33 | } 34 | 35 | int main(int argc, char **argv) { 36 | nvshmem_init(); 37 | int mype = nvshmem_my_pe(); 38 | int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 39 | int npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 40 | int dev_count; 41 | cudaGetDeviceCount(&dev_count); 42 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 43 | cudaSetDevice(mype_node / npes_per_gpu); 44 | nvshmem_barrier_all(); 45 | 46 | simplelib1_init(); 47 | simplelib2_init(); 48 | 49 | int num_errors = app_dowork(); 50 | num_errors += simplelib1_dowork(); 51 | num_errors += simplelib2_dowork(); 52 | 53 | nvshmem_finalize(); 54 | simplelib1_finalize(); 55 | simplelib2_finalize(); 56 | 57 | return num_errors; 58 | } 59 | -------------------------------------------------------------------------------- /src/host/init/query_host.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include // for size_t 8 | #include "device_host/nvshmem_types.h" // for nvshmem_team_t 9 | #include "device_host_transport/nvshmem_constants.h" // for NVSHMEM_MAJOR... 10 | #include "host/nvshmem_api.h" // for nvshmem_team_... 11 | #include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h" // for bootstrap_han... 12 | #include "non_abi/nvshmem_version.h" // for NVSHMEM_VENDO... 13 | #include "internal/host/nvshmemi_types.h" 14 | 15 | int nvshmem_my_pe(void) { return nvshmemi_boot_handle.pg_rank; } 16 | 17 | int nvshmem_n_pes(void) { return nvshmemi_boot_handle.pg_size; } 18 | 19 | void nvshmem_info_get_name(char *name) { 20 | size_t i; 21 | const char *str = NVSHMEM_VENDOR_STRING; 22 | 23 | /* Copy up to NVSHMEM_MAX_NAME_LEN-1 chars, then add NULL terminator */ 24 | for (i = 0; i < NVSHMEM_MAX_NAME_LEN - 1 && str[i] != '\0'; i++) name[i] = str[i]; 25 | 26 | name[i] = '\0'; 27 | } 28 | 29 | void nvshmem_info_get_version(int *major, int *minor) { 30 | *major = NVSHMEM_MAJOR_VERSION; 31 | *minor = NVSHMEM_MINOR_VERSION; 32 | } 33 | 34 | void nvshmemx_vendor_get_version_info(int *major, int *minor, int *patch) { 35 | *major = NVSHMEM_VENDOR_MAJOR_VERSION; 36 | *minor = NVSHMEM_VENDOR_MINOR_VERSION; 37 | *patch = NVSHMEM_VENDOR_PATCH_VERSION; 38 | } 39 | 40 | int nvshmemx_my_pe(nvshmemx_team_t team) { return nvshmem_team_my_pe((nvshmem_team_t)team); } 41 | 42 | int nvshmemx_n_pes(nvshmemx_team_t team) { return nvshmem_team_n_pes((nvshmem_team_t)team); } 43 | -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for NVSHMEM4Py unit tests 3 | 4 | TODO: this is vended from top level utils.py, let's find a way to centralize these functions 5 | """ 6 | 7 | from mpi4py import MPI 8 | import numpy as np 9 | 
10 | import nvshmem.core 11 | 12 | from cuda.core.experimental import Device, system 13 | 14 | import os 15 | 16 | def get_local_rank_per_node(): 17 | comm = MPI.COMM_WORLD 18 | rank = comm.Get_rank() 19 | size = comm.Get_size() 20 | 21 | # Split COMM_WORLD into sub-communicators of processes on the same node 22 | node_comm = comm.Split_type(MPI.COMM_TYPE_SHARED) 23 | 24 | local_rank = node_comm.Get_rank() 25 | local_size = node_comm.Get_size() 26 | 27 | return local_rank 28 | 29 | def uid_init(): 30 | # This will use mpi4py to perform a UID based init with bcast. 31 | comm = MPI.COMM_WORLD 32 | rank = comm.Get_rank() 33 | nranks = comm.Get_size() 34 | 35 | local_rank_per_node = get_local_rank_per_node() 36 | dev = Device(local_rank_per_node) 37 | dev.set_current() 38 | 39 | # Create an empty uniqueid for all ranks 40 | uniqueid = nvshmem.core.get_unique_id(empty=True) 41 | if rank == 0: 42 | # Rank 0 gets a real uniqueid 43 | uniqueid = nvshmem.core.get_unique_id() 44 | 45 | # Broadcast UID to all ranks 46 | comm.Bcast(uniqueid._data.view(np.int8), root=0) 47 | 48 | nvshmem.core.init(device=dev, uid=uniqueid, rank=rank, nranks=nranks, 49 | mpi_comm=None, initializer_method="uid") 50 | 51 | return dev 52 | 53 | def mpi_init(): 54 | local_rank_per_node = get_local_rank_per_node() 55 | dev = Device(local_rank_per_node) 56 | dev.set_current() 57 | nvshmem.core.init(device=dev, uid=None, rank=None, nranks=None, 58 | mpi_comm=MPI.COMM_WORLD, initializer_method="mpi") 59 | 60 | return dev 61 | -------------------------------------------------------------------------------- /test/host/init/uid_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | int main(int c, char *v[]) { 14 | int rank, nranks; 15 | int mype_node, npes_node; 16 | MPI_Comm mpi_comm; 17 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 18 | nvshmemx_uniqueid_t id = NVSHMEMX_UNIQUEID_INITIALIZER; 19 | int dev_count; 20 | MPI_Init(&c, &v); 21 | 22 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 23 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 24 | 25 | DEBUG_PRINT("MPI: [%d of %d] hello MPI world! \n", rank, nranks); 26 | if (rank == 0) { 27 | nvshmemx_get_uniqueid(&id); 28 | } 29 | 30 | MPI_Bcast(&id, sizeof(nvshmemx_uniqueid_t), MPI_UINT8_T, 0, MPI_COMM_WORLD); 31 | nvshmemx_set_attr_uniqueid_args(rank, nranks, &id, &attr); 32 | /* Verify if structure is set correctly */ 33 | assert(attr.args.uid_args.id == &id); 34 | assert(attr.args.uid_args.myrank == rank); 35 | assert(attr.args.uid_args.nranks == nranks); 36 | 37 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr); 38 | 39 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 40 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 41 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 42 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 43 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 44 | 45 | #ifdef _NVSHMEM_DEBUG 46 | int mype, npes; 47 | mype = nvshmem_my_pe(); 48 | npes = nvshmem_n_pes(); 49 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 50 | #endif 51 | 52 | MPI_Barrier(MPI_COMM_WORLD); 53 | 54 | nvshmem_finalize(); 55 | 56 | MPI_Finalize(); 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /src/host/coll/reducescatter/reducescatter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_REDUCESCATTER_COMMON_CPU_H 8 | #define NVSHMEMI_REDUCESCATTER_COMMON_CPU_H 9 | #include 10 | #include 11 | 12 | #include "cpu_coll.h" 13 | #include "non_abi/nvshmem_build_options.h" 14 | #include "device_host/nvshmem_common.cuh" 15 | #include "internal/host/nvshmem_internal.h" 16 | #include "device_host/nvshmem_types.h" 17 | #include "internal/host/util.h" 18 | #ifdef NVSHMEM_USE_NCCL 19 | #include "nccl.h" 20 | #endif 21 | 22 | template 23 | void nvshmemi_call_reducescatter_on_stream_kernel(nvshmem_team_t team, TYPE *dest, 24 | const TYPE *source, size_t nreduce, 25 | cudaStream_t stream); 26 | 27 | template 28 | int nvshmemi_reducescatter_on_stream(nvshmem_team_t team, TYPE *dest, const TYPE *source, 29 | size_t nreduce, cudaStream_t stream) { 30 | #ifdef NVSHMEM_USE_NCCL 31 | nvshmemi_team_t *teami = nvshmemi_team_pool[team]; 32 | if (teami->nvls_rsc_base_ptr == NULL && nvshmemi_use_nccl && 33 | nvshmemi_get_nccl_op() != ncclNumOps && nvshmemi_get_nccl_dt() != ncclNumTypes) { 34 | NCCL_CHECK(nccl_ftable.ReduceScatter(source, dest, nreduce, nvshmemi_get_nccl_dt(), 35 | nvshmemi_get_nccl_op(), 36 | (ncclComm_t)teami->nccl_comm, stream)); 37 | } else 38 | #endif /* NVSHMEM_USE_NCCL */ 39 | { 40 | nvshmemi_call_reducescatter_on_stream_kernel(team, dest, source, nreduce, stream); 41 | } 42 | return 0; 43 | } 44 | 45 | #endif /* NVSHMEMI_REDUCESCATTER_COMMON_CPU_H */ 46 | -------------------------------------------------------------------------------- /src/include/internal/host/custom_malloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | /* 8 | * mspace is an opaque type representing an independent 9 | * region of space that supports mspace_malloc, etc. 
10 | * */ 11 | #ifndef NVSHMEMI_CUSTOM_MALLOC_H 12 | #define NVSHMEMI_CUSTOM_MALLOC_H 13 | 14 | #include // for size_t 15 | #include // for map 16 | #include // for pair 17 | 18 | #define NVSHMEMI_MALLOC_ALIGNMENT ((size_t)512U) 19 | 20 | class mspace { 21 | private: 22 | /* free_chunks_start is mapping of start address of each free chunk to size of that chunk */ 23 | /* free_chunks_end is mapping of end address of each free chunk to size of that chunk */ 24 | std::map free_chunks_start, free_chunks_end; 25 | /* in_use_cunks is a mapping of each in use chunks start address to size of the chunk */ 26 | std::map inuse_chunks; 27 | size_t total_size = 0; /* size of total space managed by mspace */ 28 | public: 29 | mspace() {} 30 | mspace(void *base, size_t capacity); 31 | void print(); 32 | void add_free_chunk(char *base, size_t capacity); 33 | void add_new_chunk(void *base, size_t capacity); 34 | int track_large_chunks(int enable); 35 | void *allocate(size_t bytes); 36 | void deallocate(void *mem); 37 | void *allocate_zeroed(size_t n_elements, size_t elem_size); 38 | void *allocate_aligned(size_t alignment, size_t bytes); 39 | void *reallocate(void *ptr, size_t size); 40 | bool checkInuse(void *ptr, size_t size); 41 | void *get_startInusePtr() { 42 | if (inuse_chunks.empty()) { 43 | return NULL; 44 | } 45 | return inuse_chunks.begin()->first; 46 | } 47 | std::map *get_inuse_chunks() { return &inuse_chunks; } 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_highlevel_bindings.py: -------------------------------------------------------------------------------- 1 | import cffi 2 | import argparse 3 | 4 | from cuda.core.experimental import Device 5 | 6 | from numba import cuda, int32 7 | from numba.types import float32, Array 8 | from numba.core.extending import overload 9 | 10 | import nvshmem 11 | from nvshmem.bindings import barrier_all 12 | from nvshmem.bindings.device.numba import 
my_pe, n_pes, int_p, float_p 13 | from utils import uid_init, mpi_init 14 | 15 | 16 | def test_highlevel_bindings(dev: Device): 17 | 18 | ffi = cffi.FFI() 19 | 20 | 21 | def p(): 22 | pass 23 | 24 | 25 | @overload(p) 26 | def p_ol(arr, mype, peer): 27 | if arr == Array(dtype=int32, ndim=arr.ndim, layout=arr.layout): 28 | 29 | def impl(arr, mype, peer): 30 | ptr = ffi.from_buffer(arr) 31 | int_p(ptr, mype, peer) 32 | 33 | return impl 34 | elif arr == Array(dtype=float32, ndim=arr.ndim, layout=arr.layout): 35 | 36 | def impl(arr, mype, peer): 37 | ptr = ffi.from_buffer(arr) 38 | float_p(ptr, mype, peer) 39 | 40 | return impl 41 | 42 | 43 | @cuda.jit(lto=True) 44 | def app_kernel(dest): 45 | mype = my_pe() 46 | npes = n_pes() 47 | peer = int32((mype + 1) % npes) 48 | 49 | p(dest, mype, peer) 50 | 51 | dest = nvshmem.core.array((1,), dtype="float32") 52 | 53 | app_kernel[1, 1, 0](dest) 54 | 55 | barrier_all() 56 | dev.sync() 57 | 58 | nvshmem.core.free_array(dest) 59 | nvshmem.core.finalize() 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 65 | args = parser.parse_args() 66 | if args.init_type == "uid": 67 | dev = uid_init() 68 | elif args.init_type == "mpi": 69 | dev = mpi_init() 70 | 71 | test_highlevel_bindings(dev) -------------------------------------------------------------------------------- /test/device/sync/wait_until_all.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "nvshmem.h" 8 | #include "nvshmemx.h" 9 | #include "utils.h" 10 | 11 | __device__ int error_d; 12 | 13 | #define N 100 14 | 15 | __global__ void test_kernel(uint64_t *flags, int *status, int mype, int npes) { 16 | for (int i = 0; i < npes; i++) nvshmemx_signal_op(&flags[mype], 1, NVSHMEM_SIGNAL_SET, i); 17 | nvshmem_quiet(); 18 | nvshmem_uint64_wait_until_all(flags, npes, status, NVSHMEM_CMP_EQ, 1); 19 | 20 | /* Check the flags array */ 21 | for (int i = 0; i < npes; i++) { 22 | if (flags[i] != 1) { 23 | printf("Incorrect flag value = %lu, expected = %d\n", flags[i], 1); 24 | error_d = 1; 25 | } 26 | } 27 | } 28 | 29 | int main(int argc, char **argv) { 30 | int ret_val = 0; 31 | read_args(argc, argv); 32 | init_wrapper(&argc, &argv); 33 | int mype = nvshmem_my_pe(); 34 | int npes = nvshmem_n_pes(); 35 | 36 | int zero = 0; 37 | cudaMemcpyToSymbol(error_d, &zero, sizeof(int), 0); 38 | cudaDeviceSynchronize(); 39 | 40 | uint64_t *flags; 41 | if (use_mmap) { 42 | flags = (uint64_t *)allocate_mmap_buffer(npes * sizeof(uint64_t), _mem_handle_type, use_egm, 43 | true); 44 | } else { 45 | flags = (uint64_t *)nvshmem_malloc(npes * sizeof(uint64_t)); 46 | cudaMemset(flags, 0, npes * sizeof(uint64_t)); 47 | } 48 | int *status = NULL; 49 | nvshmem_barrier_all(); 50 | 51 | cudaDeviceSynchronize(); 52 | test_kernel<<<1, 1>>>(flags, status, mype, npes); 53 | cudaDeviceSynchronize(); 54 | 55 | cudaMemcpyFromSymbol(&ret_val, error_d, sizeof(int), 0); 56 | if (use_mmap) { 57 | free_mmap_buffer(flags); 58 | } else { 59 | nvshmem_free(flags); 60 | } 61 | finalize_wrapper(); 62 | 63 | return ret_val; 64 | } 65 | -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/test_device_sync.py: -------------------------------------------------------------------------------- 1 | from cuda.core.experimental import Device, Stream 2 | import numba.cuda as cuda 3 | 
import nvshmem.core 4 | import nvshmem.core.device.numba 5 | 6 | import pytest 7 | 8 | @pytest.mark.mpi 9 | @pytest.mark.parametrize("teams", [nvshmem.core.Teams.TEAM_NODE, nvshmem.core.Teams.TEAM_WORLD, nvshmem.core.Teams.TEAM_SHARED]) 10 | @pytest.mark.parametrize("func", [nvshmem.core.device.numba.sync, nvshmem.core.device.numba.sync_block, nvshmem.core.device.numba.sync_warp]) 11 | def test_device_sync(nvshmem_init_fini, teams, func): 12 | print(f"Testing {func.__name__} on team {teams}") 13 | 14 | nblocks = 1 15 | nthreads = 1 16 | dev = Device() 17 | dev.sync() 18 | 19 | print(f"From PE {nvshmem.core.my_pe()}") 20 | 21 | @cuda.jit 22 | def test_sync(teams): 23 | func(teams) 24 | 25 | nb_stream = cuda.stream() # WAR: Numba-CUDA takes numba stream object or int 26 | cu_stream_ref = Stream.from_handle(nb_stream.handle.value) 27 | 28 | test_sync[nblocks, nthreads, nb_stream](teams) 29 | nvshmem.core.barrier(teams, stream=cu_stream_ref) 30 | cu_stream_ref.sync() 31 | dev.sync() 32 | print("Done testing sync") 33 | 34 | 35 | @pytest.mark.mpi 36 | @pytest.mark.parametrize("func", [nvshmem.core.device.numba.sync_all, nvshmem.core.device.numba.sync_all_block, nvshmem.core.device.numba.sync_all_warp]) 37 | def test_device_sync_all(nvshmem_init_fini, func): 38 | print(f"Testing {func.__name__}") 39 | 40 | nblocks = 1 41 | nthreads = 1 42 | 43 | dev = Device() 44 | dev.sync() 45 | 46 | print(f"From PE {nvshmem.core.my_pe()}") 47 | 48 | @cuda.jit 49 | def test_sync_all(): 50 | func() 51 | 52 | nb_stream = cuda.stream() # WAR: Numba-CUDA takes numba stream object or int 53 | cu_stream_ref = Stream.from_handle(nb_stream.handle.value) 54 | 55 | test_sync_all[nblocks, nthreads, nb_stream]() 56 | 57 | cu_stream_ref.sync() 58 | dev.sync() 59 | print("Done testing sync_all") -------------------------------------------------------------------------------- /src/host/coll/fcollect/fcollect_on_stream.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include // for cudaStream_t 8 | #include // for size_t 9 | #include "device_host/nvshmem_common.cuh" // for NVSHMEMI_REPT_FOR_STAN... 10 | #include "device_host/nvshmem_types.h" // for nvshmem_team_t 11 | #include "fcollect.h" // for nvshmemi_fcollect_on_s... 12 | #include "host/nvshmemx_coll_api.h" // for nvshmemx_char_fcollect... 13 | #include "internal/host/nvshmem_internal.h" // for NVSHMEMI_CHECK_INIT_ST... 14 | #include "internal/host/nvshmem_nvtx.hpp" // for nvtx_cond_range, NVTX_... 15 | #include "internal/host/util.h" // for NVSHMEM_API_NOT_SUPPOR... 16 | 17 | #define DEFN_NVSHMEMX_TYPENAME_FCOLLECT_ON_STREAM(TYPENAME, TYPE) \ 18 | int nvshmemx_##TYPENAME##_fcollect_on_stream( \ 19 | nvshmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems, cudaStream_t stream) { \ 20 | NVTX_FUNC_RANGE_IN_GROUP(COLL); \ 21 | NVSHMEMI_CHECK_INIT_STATUS(); \ 22 | NVSHMEM_API_NOT_SUPPORTED_WITH_LIMITED_MPG_RUNS(); \ 23 | return nvshmemi_fcollect_on_stream(team, dest, source, nelems, stream); \ 24 | } 25 | 26 | NVSHMEMI_REPT_FOR_STANDARD_RMA_TYPES(DEFN_NVSHMEMX_TYPENAME_FCOLLECT_ON_STREAM) 27 | 28 | int nvshmemx_fcollectmem_on_stream(nvshmem_team_t team, void *dest, const void *source, 29 | size_t nelems, cudaStream_t stream) { 30 | return nvshmemx_char_fcollect_on_stream(team, (char *)dest, (const char *)source, nelems, 31 | stream); 32 | } 33 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See License.txt for license information

from cuda.pathfinder import find_nvidia_header_directory

from numba import cuda

import os
import warnings

from nvshmem.core.nvshmem_types import NvshmemWarning

if os.path.exists(os.path.join(os.path.dirname(__file__), "_numbast.py")):
    from ._numbast import *

    INCLUDE_PATH = find_nvidia_header_directory("nvshmem")
    # Bug fix: validate the path *before* listing it.  The original called
    # os.listdir(INCLUDE_PATH) first, which raises TypeError (when the lookup
    # returned None) or FileNotFoundError instead of the intended RuntimeError
    # when the nvshmem headers are missing.
    if INCLUDE_PATH is None or not os.path.exists(INCLUDE_PATH):
        raise RuntimeError(f"NVSHMEM headers not found at {INCLUDE_PATH}. Please confirm that nvshmem is installed correctly.")

    if "nvshmem.h" not in os.listdir(INCLUDE_PATH):
        raise RuntimeError("nvshmem.h not found, package may not be properly installed")

    CCCL_INCLUDE_PATH = find_nvidia_header_directory("cccl")

    if CCCL_INCLUDE_PATH is None or not os.path.exists(CCCL_INCLUDE_PATH):
        raise RuntimeError(f"CCCL headers not found at {CCCL_INCLUDE_PATH}. Please confirm that cccl is installed correctly.")

    # Path to this folder to look for entry point file
    this_folder = os.path.dirname(os.path.abspath(__file__))
    if not os.path.exists(os.path.join(this_folder, "entry_point.h")):
        raise RuntimeError("entry_point.h not found, package may not be properly installed")

    cuda.config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = ":".join([INCLUDE_PATH, CCCL_INCLUDE_PATH, this_folder])

else:
    warnings.warn("Numba device bindings are not enabled", NvshmemWarning)
    _numbast = None
@pytest.mark.mpi
@pytest.mark.parametrize(
    "func",
    [
        nvshmem.core.device.numba.barrier_all,
        nvshmem.core.device.numba.barrier_all_block,
        nvshmem.core.device.numba.barrier_all_warp,
    ],
)
def test_device_barrier_all(nvshmem_init_fini, func):
    """Launch a one-thread kernel that calls a device-side barrier_all variant."""
    print(f"Testing {func.__name__}")

    grid_dim = 1
    block_dim = 1

    device = Device()
    device.sync()

    print(f"From PE {nvshmem.core.my_pe()}")

    @cuda.jit
    def _kernel():
        func()

    numba_stream = cuda.stream()  # WAR: Numba-CUDA takes numba stream object or int
    core_stream = Stream.from_handle(numba_stream.handle.value)

    _kernel[grid_dim, block_dim, numba_stream]()

    core_stream.sync()
    device.sync()
    print("Done testing barrier_all")
# See License.txt for license information

"""
This file shows a minimal example of using NVSHMEM4Py to run a collective operation on CuPy arrays
"""

import cupy
import nvshmem.core
from cuda.core.experimental import Device, system
from mpi4py import MPI  # fix: MPI was used below but never imported
from numba import cuda

@cuda.jit
def simple_shift(arr, dst_pe):
    # Store the caller's value into element 0 of the (possibly remote) array.
    arr[0] = dst_pe

# Initialize NVSHMEM Using an MPI communicator
local_rank_per_node = MPI.COMM_WORLD.Get_rank() % system.num_devices
dev = Device(local_rank_per_node)
dev.set_current()
stream = dev.create_stream()
nvshmem.core.init(device=dev, mpi_comm=MPI.COMM_WORLD, initializer_method="mpi")

# Helper function to return a CuPy ArrayView backed by NVSHMEM symmetric memory
size = 1
array = nvshmem.core.array((size,), dtype="int32")

my_pe = nvshmem.core.my_pe()
# A unidirectional ring - always get the neighbor to the right
dst_pe = (my_pe + 1) % nvshmem.core.n_pes()

# This function returns an Array which can be directly load/store'd to over NVLink
# The dst_PE must be in the same NVL domain as the PE calling this function, otherwise it will raise an Exception
# fix: pass the symmetric array allocated above (was an undefined name `b`)
dev_dst = nvshmem.core.get_peer_array(array, dst_pe)


block = 1
grid = (size + block - 1) // block
# fix: Numba launch config is [griddim, blockdim, stream, sharedmem], and the
# kernel must write into the peer's view so each PE receives its neighbor's PE
simple_shift[grid, block, 0, 0](dev_dst, my_pe)
nvshmem.core.barrier(nvshmem.core.Teams.TEAM_NODE, stream)
# This should print the neighbor's PE
print(f"From PE {my_pe}, array contains {array}")

# fix: free the array that was actually allocated (arr_src/arr_dst were undefined)
nvshmem.core.free_array(array)
nvshmem.core.finalize()
Under the terms of Contract 5 | * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government 6 | * retains certain rights in this software. 7 | * 8 | * Copyright (c) 2017 Intel Corporation. All rights reserved. 9 | * This software is available to you under the BSD license. 10 | * 11 | * Portions of this file are derived from Sandia OpenSHMEM. 12 | * 13 | * See License.txt for license information 14 | ****/ 15 | 16 | /* NVSHMEMI_ENV_DEF( name, kind, default, category, short description ) 17 | * 18 | * Kinds: long, size, bool, string 19 | * Categories: NVSHMEMI_ENV_CAT_OPENSHMEM, NVSHMEMI_ENV_CAT_OTHER, 20 | * NVSHMEMI_ENV_CAT_COLLECTIVES, NVSHMEMI_ENV_CAT_TRANSPORT, 21 | * NVSHMEMI_ENV_CAT_HIDDEN 22 | */ 23 | 24 | #ifndef NVSHMEM_ENV_DEFS_INTERNAL 25 | #include "bootstrap_host_transport/env_defs_internal.h" // IWYU pragma: keep 26 | #endif 27 | 28 | #ifdef NVSHMEMI_ENV_DEF 29 | 30 | NVSHMEMI_ENV_DEF(DEBUG, string, "", NVSHMEMI_ENV_CAT_OPENSHMEM, 31 | "Set to enable debugging messages.\n" 32 | "Optional values: VERSION, WARN, INFO, ABORT, TRACE") 33 | 34 | /** Bootstrap **/ 35 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SOCK_IFNAME, string, "", NVSHMEMI_ENV_CAT_BOOTSTRAP, 36 | "Name of the UID bootstrap socket interface name") 37 | 38 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SOCK_FAMILY, string, "AF_INET", NVSHMEMI_ENV_CAT_BOOTSTRAP, 39 | "Name of the UID bootstrap socket family name") 40 | 41 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SESSION_ID, string, "", NVSHMEMI_ENV_CAT_BOOTSTRAP, 42 | "Name of the UID bootstrap session identifier") 43 | 44 | /** Debugging **/ 45 | NVSHMEMI_ENV_DEF(DEBUG_SUBSYS, string, "", NVSHMEMI_ENV_CAT_HIDDEN, 46 | "Comma separated list of debugging message sources. 
Prefix with '^' to exclude.\n" 47 | "Values: INIT, COLL, P2P, PROXY, TRANSPORT, MEM, BOOTSTRAP, TOPO, UTIL, ALL") 48 | #endif 49 | -------------------------------------------------------------------------------- /src/include/internal/host/cuda_interface_sync.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _CUDA_INTERFACE_SYNC_H_ 8 | #define _CUDA_INTERFACE_SYNC_H_ 9 | #include "device_host/nvshmem_common.cuh" 10 | 11 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL(type, TYPE) \ 12 | void call_nvshmemi_##type##_wait_until_on_stream_kernel(volatile TYPE *ivar, int cmp, \ 13 | TYPE cmp_value, cudaStream_t cstream); 14 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL) 15 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL 16 | 17 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL(type, TYPE) \ 18 | void call_nvshmemi_##type##_wait_until_all_on_stream_kernel( \ 19 | volatile TYPE *ivars, size_t nelems, const int *status, int cmp, TYPE cmp_value, \ 20 | cudaStream_t cstream); 21 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL) 22 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL 23 | 24 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL(type, TYPE) \ 25 | void call_nvshmemi_##type##_wait_until_all_vector_on_stream_kernel( \ 26 | volatile TYPE *ivars, size_t nelems, const int *status, int cmp, TYPE *cmp_value, \ 27 | cudaStream_t cstream); 28 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL) 29 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL 30 | 31 | void call_nvshmemi_signal_wait_until_on_stream_kernel(volatile uint64_t *sig_addr, int 
def calculate_modules(module):
    """Return the three Cython Extension objects derived from a dotted module name.

    For ``a.b.mod`` this produces extensions for ``a.b.mod``, ``a.b.cymod``
    and ``a.b._internal.mod``, each pointing at the matching ``.pyx`` source.
    """
    parts = module.split(".")

    def _make_ext(name_parts):
        # Dotted module name plus its on-disk .pyx source path.
        pyx_path = os.path.join(*name_parts[:-1], f"{name_parts[-1]}.pyx")
        return Extension(
            ".".join(name_parts),
            sources=[pyx_path],
            language="c++",
        )

    lowpp_parts = list(parts)
    cy_parts = parts[:-1] + [f"cy{parts[-1]}"]
    inter_parts = parts[:-1] + ["_internal", parts[-1]]

    return _make_ext(lowpp_parts), _make_ext(cy_parts), _make_ext(inter_parts)
def get_local_rank_per_node():
    """Return this process's rank among the MPI ranks running on the same node."""
    world = MPI.COMM_WORLD
    world_rank = world.Get_rank()
    world_size = world.Get_size()

    # Group the ranks that share this node to derive a node-local rank.
    node_comm = world.Split_type(MPI.COMM_TYPE_SHARED)

    node_rank = node_comm.Get_rank()
    node_size = node_comm.Get_size()
    print(f"Local rank {node_rank} global rank {world_rank} and node size {node_size} of global size {world_size} ranks")
    return node_rank
def mpi_init():
    """Initialize NVSHMEM via the MPI bootstrap and return the selected Device."""
    device = Device(get_local_rank_per_node())
    device.set_current()
    nvshmem.core.init(
        device=device,
        uid=None,
        rank=None,
        nranks=None,
        mpi_comm=MPI.COMM_WORLD,
        initializer_method="mpi",
    )
    print(f"MPI initialized on device {device.device_id}")
    return device
/* Byte-granularity alltoall on a stream: forwards to the char-typed variant,
 * treating dest/source as arrays of nelems bytes per peer.  Returns the
 * status of the underlying typed call. */
int nvshmemx_alltoallmem_on_stream(nvshmem_team_t team, void *dest, const void *source,
                                   size_t nelems, cudaStream_t stream) {
    return nvshmemx_char_alltoall_on_stream(team, (char *)dest, (const char *)source, nelems,
                                            stream);
}
/* Byte-granularity broadcast on a stream: forwards to the char-typed variant,
 * treating dest/source as arrays of nelems bytes rooted at PE_root.  Returns
 * the status of the underlying typed call. */
int nvshmemx_broadcastmem_on_stream(nvshmem_team_t team, void *dest, const void *source,
                                    size_t nelems, int PE_root, cudaStream_t stream) {
    return nvshmemx_char_broadcast_on_stream(team, (char *)dest, (const char *)source, nelems,
                                             PE_root, stream);
}
/* Operation codes carried on the transport path.  The *_QP variants are the
 * same operations offset by NVSHMEMI_OP_QP_OP_OFFSET (e.g. PUT_QP == PUT + 100). */
typedef enum {
    NVSHMEMI_OP_PUT = 1,
    NVSHMEMI_OP_P = 2,
    NVSHMEMI_OP_PUT_SIGNAL = 3,
    NVSHMEMI_OP_GET = 4,
    NVSHMEMI_OP_G = 5,
    NVSHMEMI_OP_FENCE = 6,
    NVSHMEMI_OP_AMO = 7,
    NVSHMEMI_OP_QUIET = 8,
    NVSHMEMI_OP_QP_OP_OFFSET = 100,
    NVSHMEMI_OP_PUT_QP = 101,
    NVSHMEMI_OP_P_QP = 102,
    NVSHMEMI_OP_PUT_SIGNAL_QP = 103,
    NVSHMEMI_OP_GET_QP = 104,
    NVSHMEMI_OP_G_QP = 105,
    NVSHMEMI_OP_FENCE_QP = 106,
    NVSHMEMI_OP_AMO_QP = 107,
    NVSHMEMI_OP_QUIET_QP = 108,
    /* Sentinel pins the enum's underlying type to at least int width. */
    NVSHMEMI_OP_SENTINEL = INT_MAX,
} nvshmemi_op_t;

/* Public signal operations; values 9/10 are shared with nvshmemi_amo_t below. */
typedef enum { NVSHMEM_SIGNAL_SET = 9, NVSHMEM_SIGNAL_ADD = 10 } nvshmemx_signal_op_t;

/* Atomic memory operation codes.  Values below NVSHMEMI_AMO_END_OF_NONFETCH
 * are non-fetching; the remainder return the previous value. */
typedef enum {
    NVSHMEMI_AMO_ACK = 1,
    NVSHMEMI_AMO_INC = 2,
    NVSHMEMI_AMO_SET = 3,
    NVSHMEMI_AMO_ADD = 4,
    NVSHMEMI_AMO_AND = 5,
    NVSHMEMI_AMO_OR = 6,
    NVSHMEMI_AMO_XOR = 7,
    NVSHMEMI_AMO_SIGNAL = 8,
    NVSHMEMI_AMO_SIGNAL_SET = NVSHMEM_SIGNAL_SET,  // Note - NVSHMEM_SIGNAL_SET == 9
    NVSHMEMI_AMO_SIGNAL_ADD = NVSHMEM_SIGNAL_ADD,  // Note - NVSHMEM_SIGNAL_ADD == 10
    NVSHMEMI_AMO_END_OF_NONFETCH = 11,             // end of nonfetch atomics
    NVSHMEMI_AMO_FETCH = 12,
    NVSHMEMI_AMO_FETCH_INC = 13,
    NVSHMEMI_AMO_FETCH_ADD = 14,
    NVSHMEMI_AMO_FETCH_AND = 15,
    NVSHMEMI_AMO_FETCH_OR = 16,
    NVSHMEMI_AMO_FETCH_XOR = 17,
    NVSHMEMI_AMO_SWAP = 18,
    NVSHMEMI_AMO_COMPARE_SWAP = 19,
    /* Sentinel pins the enum's underlying type to at least int width. */
    NVSHMEMI_AMO_OP_SENTINEL = INT_MAX,
} nvshmemi_amo_t;

/* Data/flag pair used for fetch ("g") completions; both fields are written
 * by another agent, hence volatile. */
typedef struct {
    volatile uint64_t data;
    volatile uint64_t flag;
} g_elem_t;
/perftest/host/coll/sync_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | 15 | int main(int argc, char *argv[]) { 16 | int status = 0; 17 | int mype; 18 | size_t size = 1; 19 | struct timeval t_start, t_stop; 20 | float ms = 0; 21 | double latency_value; 22 | cudaEvent_t start_event, stop_event; 23 | cudaStream_t stream; 24 | 25 | read_args(argc, argv); 26 | 27 | init_wrapper(&argc, &argv); 28 | 29 | mype = nvshmem_my_pe(); 30 | #ifdef _NVSHMEM_DEBUG 31 | int npes = nvshmem_n_pes(); 32 | #endif 33 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 34 | CUDA_CHECK(cudaEventCreate(&start_event)); 35 | CUDA_CHECK(cudaEventCreate(&stop_event)); 36 | 37 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! 
\n", mype, npes); 38 | 39 | for (size_t iter = 0; iter < iters + warmup_iters; iter++) { 40 | if (iter == warmup_iters) CUDA_CHECK(cudaEventRecord(start_event, stream)); 41 | 42 | nvshmemx_team_sync_on_stream(NVSHMEM_TEAM_WORLD, stream); 43 | } 44 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 45 | CUDA_CHECK(cudaStreamSynchronize(stream)); 46 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 47 | 48 | if (!mype) { 49 | latency_value = (ms / iters) * 1000; 50 | print_table_basic("sync_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 51 | &latency_value, 1); 52 | } 53 | 54 | nvshmem_barrier_all(); 55 | 56 | CUDA_CHECK(cudaStreamDestroy(stream)); 57 | CUDA_CHECK(cudaEventDestroy(start_event)); 58 | CUDA_CHECK(cudaEventDestroy(stop_event)); 59 | 60 | finalize_wrapper(); 61 | 62 | return status; 63 | } 64 | -------------------------------------------------------------------------------- /perftest/host/coll/sync_all_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | int coll_max_iters = MAX_ITERS; 15 | 16 | int main(int c, char *v[]) { 17 | int status = 0; 18 | int mype; 19 | size_t size = 1; 20 | double latency_value; 21 | int iters = MAX_ITERS; 22 | int skip = MAX_SKIP; 23 | struct timeval t_start, t_stop; 24 | float ms = 0; 25 | cudaEvent_t start_event, stop_event; 26 | cudaStream_t stream; 27 | 28 | init_wrapper(&c, &v); 29 | 30 | mype = nvshmem_my_pe(); 31 | #ifdef _NVSHMEM_DEBUG 32 | int npes = nvshmem_n_pes(); 33 | #endif 34 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 35 | CUDA_CHECK(cudaEventCreate(&start_event)); 36 | CUDA_CHECK(cudaEventCreate(&stop_event)); 37 | 38 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 39 | 40 | for (iters = 0; iters < coll_max_iters + skip; iters++) { 41 | if (iters == skip) CUDA_CHECK(cudaEventRecord(start_event, stream)); 42 | 43 | nvshmemx_sync_all_on_stream(stream); 44 | } 45 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 46 | CUDA_CHECK(cudaStreamSynchronize(stream)); 47 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 48 | if (!mype) { 49 | latency_value = (ms / coll_max_iters) * 1000; 50 | print_table_basic("sync_all_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 51 | &latency_value, 1); 52 | } 53 | 54 | nvshmem_barrier_all(); 55 | 56 | CUDA_CHECK(cudaStreamDestroy(stream)); 57 | CUDA_CHECK(cudaEventDestroy(start_event)); 58 | CUDA_CHECK(cudaEventDestroy(stop_event)); 59 | 60 | finalize_wrapper(); 61 | 62 | return status; 63 | } 64 | -------------------------------------------------------------------------------- /test/device/query/hello-team.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
#define N 1

/* Device-side error counter, copied back to the host after the kernel runs. */
__device__ int errors_d;

/* Each thread prints its world/node team identity and verifies that team
 * queries on NVSHMEM_TEAM_INVALID return -1, bumping errors_d otherwise. */
__global__ void hello_world(void) {
    int val;

    printf("Device - world PE %d of %d, node PE %d of %d\n", nvshmem_team_my_pe(NVSHMEM_TEAM_WORLD),
           nvshmem_team_n_pes(NVSHMEM_TEAM_WORLD), nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE),
           nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE));

    val = nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: device nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_d;
    }

    val = nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: device nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_d;
    }
}

/* Runs the same NVSHMEM_TEAM_INVALID checks on the host, launches the device
 * kernel, and returns the combined host + device error count as exit status. */
int main(int argc, char **argv) {
    int errors_h = 0;
    int val = 0;
    init_wrapper(&argc, &argv);

    nvshmem_barrier_all(); /* Ensure NVSHMEM device init has completed */
    cudaMemcpyToSymbol(errors_d, &val, sizeof(int), 0, cudaMemcpyHostToDevice);

    printf("Host - world PE %d of %d, node PE %d of %d\n", nvshmem_team_my_pe(NVSHMEM_TEAM_WORLD),
           nvshmem_team_n_pes(NVSHMEM_TEAM_WORLD), nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE),
           nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE));

    val = nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: host nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_h;
    }

    val = nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: host nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_h;
    }

    hello_world<<<1, N>>>();

    /* cudaMemcpyFromSymbol on the default stream also serializes behind the
     * kernel launch above, so errors_d is final when read here. */
    cudaMemcpyFromSymbol(&val, errors_d, sizeof(int), 0, cudaMemcpyDeviceToHost);
    finalize_wrapper();
    return errors_h + val;
}
-------------------------------------------------------------------------------- /test/host/interop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(simplelib1 SHARED simplelib1.cu) 2 | add_library(simplelib2 SHARED simplelib2.cu) 3 | add_executable(app app.cu) 4 | add_executable(app_multi_init app_multi_init.cu) 5 | 6 | set_target_properties(simplelib1 simplelib2 PROPERTIES POSITION_INDEPENDENT_CODE ON) 7 | 8 | set_target_properties(app simplelib1 simplelib2 PROPERTIES INSTALL_RPATH "$ORIGIN/../../../../lib" BUILD_WITH_INSTALL_RPATH TRUE) 9 | target_link_options(simplelib1 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/simplelib1.sym") 10 | target_link_options(simplelib2 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/simplelib2.sym") 11 | 12 | # Tile-granular APIs need C++ 17 13 | set(TEST_CXX_STANDARD 17) 14 | set(TEST_CXX_ARG c++17) 15 | set_target_properties(simplelib1 PROPERTIES 16 | CXX_STANDARD "${TEST_CXX_STANDARD}" 17 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 18 | ) 19 | set_target_properties(simplelib2 PROPERTIES 20 | CXX_STANDARD "${TEST_CXX_STANDARD}" 21 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 22 | ) 23 | set_target_properties(app PROPERTIES 24 | CXX_STANDARD "${TEST_CXX_STANDARD}" 25 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 26 | ) 27 | set_target_properties(app_multi_init PROPERTIES 28 | CXX_STANDARD "${TEST_CXX_STANDARD}" 29 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 30 | ) 31 | 32 | target_include_directories(simplelib1 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 33 | target_include_directories(simplelib2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 34 | target_include_directories(app PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 35 | target_include_directories(app_multi_init PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 36 | 37 | target_link_libraries(simplelib1 PRIVATE nvshmem_test_helper) 38 | target_link_libraries(simplelib2 PRIVATE nvshmem_test_helper) 39 | target_link_libraries(app PRIVATE simplelib1 
/* One "<prefix>[:<port>]" entry from a user-supplied interface list. */
struct netIf {
    char prefix[64];
    int port;
};

/* Parse a comma-separated list of "<prefix>[:<port>]" items (e.g.
 * "eth0:1,ib") into ifList, up to maxList entries.  Entries without an
 * explicit port get port == -1 (wildcard).  Returns the number of entries
 * parsed; a NULL string yields 0.
 *
 * fix: characters are now dropped once a prefix fills its 64-byte buffer —
 * previously a long prefix overflowed ifList[ifNum].prefix. */
static inline int parseStringList(const char* string, struct netIf* ifList, int maxList) {
    if (!string) return 0;

    const char* ptr = string;

    int ifNum = 0;
    int ifC = 0;
    char c;
    do {
        c = *ptr;
        if (c == ':') {
            /* End of a prefix with an explicit port. */
            if (ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port = atoi(ptr + 1);
                ifNum++;
                ifC = 0;
            }
            /* Skip the port digits up to the next separator. */
            while (c != ',' && c != '\0') c = *(++ptr);
        } else if (c == ',' || c == '\0') {
            /* End of a prefix without a port: -1 means "any port". */
            if (ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port = -1;
                ifNum++;
                ifC = 0;
            }
        } else if (ifC < (int)sizeof(ifList[ifNum].prefix) - 1) {
            /* Accumulate prefix characters, leaving room for the terminator;
             * overly long prefixes are silently truncated. */
            ifList[ifNum].prefix[ifC] = c;
            ifC++;
        }
        ptr++;
    } while (ifNum < maxList && c);
    return ifNum;
}
/* Compare an interface name against a reference prefix.
 * In exact mode the comparison also covers the terminating '\0', so the
 * two strings must be equal in full; otherwise only the reference has to
 * be a prefix of the name. */
static bool matchIf(const char* string, const char* ref, bool matchExact) {
  size_t cmpLen;
  if (matchExact) {
    cmpLen = strlen(string) + 1; /* include '\0' to force full equality */
  } else {
    cmpLen = strlen(ref);
  }
  return (strncmp(string, ref, cmpLen) == 0);
}

/* Two ports match when they are equal, or when either side is the
 * wildcard value -1. */
static bool matchPort(const int port1, const int port2) {
  const bool wildcard = (port1 == -1) || (port2 == -1);
  return wildcard || (port1 == port2);
}
lsize); 61 | if (!mype) DEBUG_PRINT("ptr: %p size: %zuB; ", (void *)buffer[i], lsize); 62 | } 63 | if (!mype) DEBUG_PRINT("\n \n"); 64 | 65 | if (!mype) DEBUG_PRINT("[%d of %d] freeing all buffers: ", r, repeat); 66 | 67 | for (int i = 0; i < iter; i++) { 68 | if (!mype) DEBUG_PRINT("ptr: %p; ", (void *)buffer[i]); 69 | nvshmem_free(buffer[i]); 70 | } 71 | if (!mype) DEBUG_PRINT("\n \n"); 72 | 73 | if (!mype) DEBUG_PRINT("[iter %d of %d] end of iter \n \n", r, repeat); 74 | } 75 | 76 | free(buffer); 77 | 78 | finalize_wrapper(); 79 | 80 | out: 81 | return status; 82 | } 83 | -------------------------------------------------------------------------------- /perftest/host/coll/barrier_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | 15 | int main(int argc, char *argv[]) { 16 | int status = 0; 17 | int mype; 18 | size_t size = 1; 19 | 20 | read_args(argc, argv); 21 | float ms; 22 | double latency_value; 23 | cudaStream_t stream; 24 | cudaEvent_t start_event, stop_event; 25 | 26 | init_wrapper(&argc, &argv); 27 | 28 | mype = nvshmem_my_pe(); 29 | #ifdef _NVSHMEM_DEBUG 30 | int npes = nvshmem_n_pes(); 31 | #endif 32 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 33 | CUDA_CHECK(cudaEventCreate(&start_event)); 34 | CUDA_CHECK(cudaEventCreate(&stop_event)); 35 | 36 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! 
\n", mype, npes); 37 | 38 | for (size_t iter = 0; iter < warmup_iters; iter++) { 39 | nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, stream); 40 | } 41 | CUDA_CHECK(cudaStreamSynchronize(stream)); 42 | nvshmem_barrier_all(); 43 | 44 | CUDA_CHECK(cudaEventRecord(start_event, stream)); 45 | for (size_t iter = 0; iter < iters; iter++) { 46 | nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, stream); 47 | } 48 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 49 | CUDA_CHECK(cudaStreamSynchronize(stream)); 50 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 51 | 52 | if (!mype) { 53 | latency_value = (ms / iters) * 1000; 54 | print_table_basic("barrier_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 55 | &latency_value, 1); 56 | } 57 | 58 | nvshmem_barrier_all(); 59 | 60 | CUDA_CHECK(cudaStreamDestroy(stream)); 61 | CUDA_CHECK(cudaEventDestroy(start_event)); 62 | CUDA_CHECK(cudaEventDestroy(stop_event)); 63 | 64 | finalize_wrapper(); 65 | 66 | return status; 67 | } 68 | -------------------------------------------------------------------------------- /src/modules/transport/common/mlx5_prm.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: BSD-3-Clause 2 | * Copyright 2016 6WIND S.A. 
 * Copyright 2016 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_PRM_H_
#define RTE_PMD_MLX5_PRM_H_

#include <stdint.h>

/* mlx5 "ifc" layout convention: every field below is declared as an array
 * of u8 whose LENGTH is the field width in BITS, so a struct spells out
 * the exact bit layout of a firmware command or context object. Do not
 * reorder or resize fields. */
#define u8 uint8_t

/* Device pages are 4KB (1 << 12). */
#define MLX5_ADAPTER_PAGE_SHIFT 12

/* Encodings for the cqe_sz (CQE size) field of the CQ context. */
enum {
    MLX5_CQE_SIZE_64B = 0x0,
    MLX5_CQE_SIZE_128B = 0x1,
};

/* Completion Queue context bit layout, embedded in the CREATE_CQ input.
 * "reserved_at_<hex>" fields pad from the bit offset named in hex (e.g.
 * reserved_at_40 begins at bit 0x40). Field semantics follow the device
 * PRM; only widths/offsets are expressed here. */
struct mlx5_ifc_cqc_bits {
    u8 status[0x4];
    u8 as_notify[0x1];
    u8 initiator_src_dct[0x1];
    u8 dbr_umem_valid[0x1];
    u8 reserved_at_7[0x1];
    u8 cqe_sz[0x3]; /* one of MLX5_CQE_SIZE_* above */
    u8 cc[0x1];
    u8 reserved_at_c[0x1];
    u8 scqe_break_moderation_en[0x1];
    u8 oi[0x1];
    u8 cq_period_mode[0x2];
    u8 cqe_comp_en[0x1];
    u8 mini_cqe_res_format[0x2];
    u8 st[0x4];
    u8 reserved_at_18[0x1];
    u8 cqe_comp_layout[0x7];
    u8 dbr_umem_id[0x20];
    u8 reserved_at_40[0x14];
    u8 page_offset[0x6];
    u8 reserved_at_5a[0x2];
    u8 mini_cqe_res_format_ext[0x2];
    u8 cq_timestamp_format[0x2];
    u8 reserved_at_60[0x3];
    u8 log_cq_size[0x5];
    u8 uar_page[0x18];
    u8 reserved_at_80[0x4];
    u8 cq_period[0xc];
    u8 cq_max_count[0x10];
    u8 reserved_at_a0[0x18];
    u8 c_eqn[0x8];
    u8 reserved_at_c0[0x3];
    u8 log_page_size[0x5];
    u8 reserved_at_c8[0x18];
    u8 reserved_at_e0[0x20];
    u8 reserved_at_100[0x8];
    u8 last_notified_index[0x18];
    u8 reserved_at_120[0x8];
    u8 last_solicit_index[0x18];
    u8 reserved_at_140[0x8];
    u8 consumer_counter[0x18];
    u8 reserved_at_160[0x8];
    u8 producer_counter[0x18];
    u8 local_partition_id[0xc];
    u8 process_id[0x14];
    u8 reserved_at_1A0[0x20];
    u8 dbr_addr[0x40];
};

/* Input layout of the CREATE_CQ command: a common command header
 * (opcode/uid/op_mod), the CQ context above, umem parameters, and a
 * trailing flexible array 'pas' -- presumably the page address entries of
 * the CQ buffer (TODO(review): confirm against the PRM). */
struct mlx5_ifc_create_cq_in_bits {
    u8 opcode[0x10];
    u8 uid[0x10];
    u8 reserved_at_20[0x10];
    u8 op_mod[0x10];
    u8 reserved_at_40[0x40];
    struct mlx5_ifc_cqc_bits cq_context;
    u8 cq_umem_offset[0x40];
    u8 cq_umem_id[0x20];
    u8 cq_umem_valid[0x1];
    u8 reserved_at_2e1[0x1f];
    u8 reserved_at_300[0x580];
    u8 pas[];
};

#endif /* RTE_PMD_MLX5_PRM_H_ */