├── perftest ├── host │ ├── pt-to-pt │ │ ├── stream_latency.args │ │ ├── bw.args │ │ ├── latency.args │ │ └── CMakeLists.txt │ ├── init │ │ └── CMakeLists.txt │ ├── coll │ │ ├── barrier_on_stream.args │ │ ├── sync_on_stream.args │ │ ├── alltoall_on_stream.args │ │ ├── broadcast_on_stream.args │ │ ├── fcollect_on_stream.args │ │ ├── reduction_on_stream.args │ │ ├── reducescatter_on_stream.args │ │ ├── CMakeLists.txt │ │ ├── sync_on_stream.cpp │ │ ├── sync_all_on_stream.cpp │ │ └── barrier_on_stream.cpp │ └── CMakeLists.txt ├── device │ ├── coll │ │ ├── sync_latency.args │ │ ├── barrier_latency.args │ │ ├── bcast_latency.args │ │ ├── alltoall_latency.args │ │ ├── fcollect_latency.args │ │ ├── reduction_latency.args │ │ ├── reducescatter_latency.args │ │ └── CMakeLists.txt │ ├── pt-to-pt │ │ ├── shmem_atomic_latency.args │ │ ├── shmem_g_latency.args │ │ ├── shmem_p_latency.args │ │ ├── shmem_signal_ping_pong_latency.args │ │ ├── shmem_st_bw.args │ │ ├── shmem_get_latency.args │ │ ├── shmem_put_latency.args │ │ ├── shmem_put_ping_pong_latency.args │ │ ├── shmem_p_ping_pong_latency.args │ │ ├── shmem_put_signal_ping_pong_latency.args │ │ ├── shmem_atomic_bw.args │ │ ├── shmem_get_bw.args │ │ ├── shmem_p_bw.args │ │ ├── shmem_g_bw.args │ │ ├── shmem_put_bw.args │ │ ├── shmem_atomic_ping_pong_latency.args │ │ └── CMakeLists.txt │ ├── tile │ │ └── CMakeLists.txt │ └── CMakeLists.txt ├── perftest-mmap-sanity.list ├── perftest-p2p-cudagraph.list ├── perfTestRunnerSlurm.py ├── perftest-mmap-full.list ├── perftest-p2p-pcie.list ├── perftest-ib.list ├── perftest-p2p-nvlink.list ├── README.md └── common │ └── CMakeLists.txt ├── nvshmem4py ├── nvshmem │ ├── bindings │ │ ├── _internal │ │ │ └── __init__.py │ │ ├── device │ │ │ ├── numba │ │ │ │ ├── entry_point.h │ │ │ │ ├── _numbast.py │ │ │ │ └── __init__.py │ │ │ └── __init__.py │ │ └── __init__.py │ ├── core │ │ ├── device │ │ │ ├── __init__.py │ │ │ └── numba │ │ │ │ └── __init__.py │ │ ├── __init__.py │ │ └── 
_internal_tracking.py │ ├── __init__.py │ └── version.py ├── build_assets │ └── numbast │ │ ├── numbast_entry_point.h │ │ └── templates │ │ └── config_nvshmem.yml.j2 ├── requirements_cuda12.txt ├── requirements_cuda13.txt ├── requirements_optional_cuda13.txt ├── requirements_build.txt ├── MANIFEST.in ├── test │ ├── device │ │ └── numba │ │ │ ├── conftest.py │ │ │ ├── utils.py │ │ │ ├── test_device_sync.py │ │ │ └── test_device_barrier.py │ ├── wheel_sanity_test.py │ ├── test_npe.py │ ├── test_get_version.py │ ├── test_ring.py │ ├── test_collective.py │ ├── test_highlevel_bindings.py │ └── utils.py ├── requirements_optional_cuda12.txt ├── perftest │ ├── alltoall_on_stream.py │ ├── fcollect_on_stream.py │ ├── reduction_on_stream.py │ ├── broadcast_on_stream.py │ └── reducescatter_on_stream.py ├── scripts │ └── find_python_versions.sh ├── README.md ├── examples │ └── simple_p2p_kernel.py └── setup.py ├── test ├── apps │ ├── dgl │ │ └── CMakeLists.txt │ ├── cufft │ │ └── CMakeLists.txt │ ├── interop │ │ └── CMakeLists.txt │ └── CMakeLists.txt ├── host │ ├── pt-to-pt │ │ ├── fence.args │ │ ├── quiet.args │ │ ├── CMakeLists.txt │ │ ├── quiet_on_stream.cu │ │ ├── quiet.cpp │ │ └── fence.cpp │ ├── mem │ │ ├── mmap_unmap_loop.args │ │ ├── CMakeLists.txt │ │ ├── calloc.cpp │ │ ├── malloc_simple.cpp │ │ └── malloc_loop.cpp │ ├── coll │ │ ├── collective_launch_choose_grid.args │ │ ├── collective_launch_user_specified_grid.args │ │ └── CMakeLists.txt │ ├── interop │ │ ├── simplelib1.sym │ │ ├── simplelib2.sym │ │ ├── simplelib1.h │ │ ├── simplelib2.h │ │ ├── simplelib1.cu │ │ ├── simplelib2.cu │ │ ├── app.cu │ │ └── CMakeLists.txt │ ├── team │ │ ├── CMakeLists.txt │ │ └── shmem_team_reuse_teams.cpp │ ├── init │ │ ├── kernel_nvshmem.cu │ │ ├── CMakeLists.txt │ │ ├── static_init.cpp │ │ ├── global_exit.cpp │ │ ├── mpi_init.cpp │ │ ├── nvshmemx_init_status.cpp │ │ ├── shmem_init.cpp │ │ ├── init_loop.cpp │ │ └── uid_init.cpp │ └── CMakeLists.txt ├── device │ ├── init │ │ ├── 
CMakeLists.txt │ │ └── global_exit.cu │ ├── query │ │ ├── CMakeLists.txt │ │ ├── hello.cu │ │ └── hello-team.cu │ ├── sync │ │ ├── CMakeLists.txt │ │ ├── sync_test.cu │ │ └── wait_until_all.cu │ ├── tile │ │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── coll │ │ └── coll_test.h ├── unit │ ├── CMakeLists.txt │ ├── mem │ │ ├── CMakeLists.txt │ │ ├── heap │ │ │ ├── internal │ │ │ │ └── host │ │ │ │ │ └── nvshmem_nvtx.hpp │ │ │ └── CMakeLists.txt │ │ └── transport │ │ │ └── CMakeLists.txt │ └── host │ │ ├── bootstrap │ │ └── CMakeLists.txt │ │ └── CMakeLists.txt ├── common │ ├── test_teams.h │ ├── data_check.h │ ├── ring_alltoall.h │ └── test-simple-pmi │ │ └── test_simple_pmiutil.h └── README.md ├── .github └── ISSUE_TEMPLATE │ ├── config.yml │ ├── QUESTION.yaml │ └── RFE.yaml ├── nvshmem_transport.sym ├── .gitattributes ├── src ├── modules │ ├── transport │ │ ├── ibgda │ │ │ └── CMakeLists.txt.in │ │ ├── ibrc │ │ │ └── CMakeLists.txt.in │ │ ├── ibdevx │ │ │ ├── CMakeLists.txt.in │ │ │ └── ibdevx.h │ │ ├── ucx │ │ │ └── CMakeLists.txt.in │ │ ├── libfabric │ │ │ └── CMakeLists.txt.in │ │ └── common │ │ │ ├── transport_mlx5_common.h │ │ │ ├── transport_gdr_common.h │ │ │ ├── CMakeLists.txt.in │ │ │ └── mlx5_prm.h │ └── bootstrap │ │ ├── mpi │ │ └── CMakeLists.txt.in │ │ ├── uid │ │ ├── CMakeLists.txt.in │ │ └── ncclSocket │ │ │ ├── commit_info.txt │ │ │ ├── ncclsocket_param.h │ │ │ ├── ncclsocket_nccl.h │ │ │ ├── ncclsocket_debug.h │ │ │ └── ncclsocket_utils.h │ │ ├── pmix │ │ └── CMakeLists.txt.in │ │ ├── shmem │ │ └── CMakeLists.txt.in │ │ ├── common │ │ ├── CMakeLists.txt.in │ │ └── env_defs.h │ │ └── pmi │ │ ├── CMakeLists.txt.in │ │ ├── simple-pmi │ │ └── simple_pmiutil.h │ │ └── pmi-2 │ │ └── COPYRIGHT ├── include │ ├── nvshmem_host.h │ ├── internal │ │ ├── host │ │ │ ├── shared_memory.h │ │ │ ├── sockets.h │ │ │ ├── nvshmemi_coll.h │ │ │ ├── error_codes_internal.h │ │ │ ├── nvshmemi_bootstrap_library.h │ │ │ ├── custom_malloc.h │ │ │ └── cuda_interface_sync.h │ │ 
├── host_transport │ │ │ └── nvshmemi_transport_defines.h │ │ ├── common │ │ │ └── error_codes_internal.h │ │ ├── device │ │ │ └── nvshmemi_device.h │ │ ├── bootstrap_host_transport │ │ │ └── nvshmemi_bootstrap_defines.h │ │ └── bootstrap_host │ │ │ └── nvshmemi_bootstrap.h │ ├── non_abi │ │ ├── device │ │ │ └── coll │ │ │ │ └── defines.cuh │ │ ├── nvshmem_version.h.in │ │ └── nvshmem_build_options.h.in │ ├── host │ │ └── nvshmem_macros.h │ ├── device │ │ └── nvshmemx_collective_launch_apis.h │ ├── nvshmem.h │ ├── nvshmemx.h │ ├── bootstrap_device_host │ │ └── nvshmem_uniqueid.h │ └── device_host_transport │ │ └── nvshmem_common_transport.h └── host │ ├── stream │ ├── coll │ │ ├── rdxn │ │ │ ├── reduce_and.cu │ │ │ ├── reduce_or.cu │ │ │ ├── reduce_xor.cu │ │ │ ├── reduce_max.cu │ │ │ ├── reduce_min.cu │ │ │ ├── reduce_sum.cu │ │ │ ├── reduce_prod.cu │ │ │ └── reduce_team.cu │ │ └── reducescatter │ │ │ ├── reducescatter_or.cu │ │ │ ├── reducescatter_and.cu │ │ │ ├── reducescatter_xor.cu │ │ │ ├── reducescatter_max.cu │ │ │ ├── reducescatter_min.cu │ │ │ ├── reducescatter_sum.cu │ │ │ └── reducescatter_prod.cu │ └── comm │ │ └── quiet_on_stream.cu │ ├── coll │ ├── fcollect │ │ ├── fcollect_on_stream.h │ │ └── fcollect_on_stream.cpp │ ├── barrier │ │ └── barrier.h │ ├── rdxn │ │ └── rdxn.h │ ├── reducescatter │ │ └── reducescatter.h │ ├── alltoall │ │ └── alltoall_on_stream.cpp │ └── broadcast │ │ └── broadcast_on_stream.cpp │ ├── topo │ └── topo.h │ ├── transport │ └── p2p │ │ └── p2p.h │ ├── util │ └── cs.cpp │ ├── comm │ ├── rma.cu │ └── fence.cpp │ ├── mem │ └── dlmalloc.h │ └── init │ └── query_host.cpp ├── nvshmem_host.sym ├── nvshmem_bootstrap.sym ├── scripts ├── bitcode_lib_cleanup.sh └── install_hydra.sh ├── pkg └── nvshmem_package_description.txt ├── .pre-commit-config.yaml ├── Compatibility.md ├── examples ├── shmem-based-init.cu ├── hello.cpp ├── gemm_allreduce │ └── nvshmemAlloc.hpp └── dev-guide-ring.cu ├── README.md └── .gitignore 
/perftest/host/pt-to-pt/stream_latency.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/_internal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /perftest/device/coll/sync_latency.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_latency.args: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /perftest/device/coll/barrier_latency.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -------------------------------------------------------------------------------- /test/apps/dgl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(dgl.cu) 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /perftest/device/coll/bcast_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /test/host/pt-to-pt/fence.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -n 100 3 | 
-------------------------------------------------------------------------------- /test/host/pt-to-pt/quiet.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -n 100 3 | -------------------------------------------------------------------------------- /perftest/device/coll/alltoall_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /perftest/device/coll/fcollect_latency.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /perftest/device/coll/reduction_latency.args: -------------------------------------------------------------------------------- 1 | -n 50 -w 10 -b 1 -e 4M -------------------------------------------------------------------------------- /test/device/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(global_exit.cu) 2 | -------------------------------------------------------------------------------- /test/host/mem/mmap_unmap_loop.args: -------------------------------------------------------------------------------- 1 | 2 | -i 3 -b 1G -e 2G -r 1 3 | -------------------------------------------------------------------------------- /perftest/device/coll/reducescatter_latency.args: -------------------------------------------------------------------------------- 1 | -n 50 -w 10 -b 1 -e 4k -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_g_latency.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -t 512 -e 64K -------------------------------------------------------------------------------- 
/perftest/device/pt-to-pt/shmem_p_latency.args: -------------------------------------------------------------------------------- 1 | -t 512 -e 64K -n 50 -w 5 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_signal_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -n 500 -w 50 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_st_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -t 1024 -c 4 -e 32M -------------------------------------------------------------------------------- /perftest/host/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(malloc.cpp) 2 | -------------------------------------------------------------------------------- /test/apps/cufft/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(cufft_smoke_test.cu) 2 | -------------------------------------------------------------------------------- /test/unit/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(host) 2 | add_subdirectory(mem) -------------------------------------------------------------------------------- /nvshmem_transport.sym: -------------------------------------------------------------------------------- 1 | { 2 | global: nvshmemt_init; 3 | local: *; 4 | }; -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_get_latency.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -t 1024 -e 64K -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_latency.args: 
-------------------------------------------------------------------------------- 1 | -e 64K -t 1024 -n 200 -w 20 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -e 1M -n 500 -w 50 -------------------------------------------------------------------------------- /test/apps/interop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(nccl_nvshmem_interop.cu) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_p_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -t 512 -e 16K -n 500 -w 50 -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_signal_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | -e 1M -n 500 -w 50 -------------------------------------------------------------------------------- /test/host/coll/collective_launch_choose_grid.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -t 32 3 | -------------------------------------------------------------------------------- /test/unit/mem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(transport) 2 | add_subdirectory(heap) -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -c 4 -e 65536 -t 1024 -a inc -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_get_bw.args: 
-------------------------------------------------------------------------------- 1 | -n 200 -w 20 -b 1024 -e 32M -c 4 -t 1024 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_p_bw.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -t 1024 -c 4 -b 1024 -e 64K -s 1 -------------------------------------------------------------------------------- /perftest/host/coll/barrier_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 2 | -n 1000 -w 10 --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/sync_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 2 | -n 1000 -w 10 --cudagraph 3 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/bw.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 4 -e 128M --dir write --issue host 2 | -------------------------------------------------------------------------------- /test/unit/host/bootstrap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_unit_test(bootstrap_coll.cpp) 2 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/latency.args: -------------------------------------------------------------------------------- 1 | -n 10 -w 10 -b 1 -e 128M --dir write --issue host 2 | -------------------------------------------------------------------------------- /test/host/coll/collective_launch_user_specified_grid.args: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 2 | -c 1 -t 32 3 | 
-------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_g_bw.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -t 1024 -c 8 -b 1024 -e 65536 -d double 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | nvshmem4py/nvshmem/bindings/device/numba/_numbast.py filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /perftest/host/coll/alltoall_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/broadcast_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/fcollect_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 1000 -w 10 -b 1 -e 4M 2 | -n 1000 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/reduction_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4M 2 | -n 100 -w 10 -b 1 -e 4M --cudagraph -------------------------------------------------------------------------------- /perftest/host/coll/reducescatter_on_stream.args: -------------------------------------------------------------------------------- 1 | -n 100 -w 10 -b 1 -e 4k 2 | -n 100 -w 10 -b 1 -e 4k --cudagraph 
-------------------------------------------------------------------------------- /src/modules/transport/ibgda/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibgda ibgda.cpp ON ON ON ON) 2 | -------------------------------------------------------------------------------- /src/modules/transport/ibrc/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibrc ibrc.cpp ON ON ON OFF) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_put_bw.args: -------------------------------------------------------------------------------- 1 | -n 200 -w 20 -c 4 -t 1024 -e 32M 2 | --bidir -n 200 -w 20 -c 4 -t 1024 -e 32M -------------------------------------------------------------------------------- /perftest/perftest-mmap-sanity.list: -------------------------------------------------------------------------------- 1 | /host/coll/broadcast_on_stream 2 | /host/pt-to-pt/bw 3 | /device/pt-to-pt/shmem_put_bw -------------------------------------------------------------------------------- /src/modules/transport/ibdevx/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ibdevx ibdevx.cpp ON OFF ON ON) 2 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/shmem_atomic_ping_pong_latency.args: -------------------------------------------------------------------------------- 1 | add 2 | and 3 | compare_swap 4 | inc 5 | or 6 | set 7 | swap 8 | xor -------------------------------------------------------------------------------- /nvshmem_host.sym: -------------------------------------------------------------------------------- 1 | NVSHMEM { 2 | global: nvshmem_*; 3 | nvshmemid_*; 4 | nvshmemx_*; 5 | 
local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /nvshmem4py/build_assets/numbast/numbast_entry_point.h: -------------------------------------------------------------------------------- 1 | #define __NVSHMEM_NUMBA_SUPPORT__ 2 | #include 3 | #include 4 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/entry_point.h: -------------------------------------------------------------------------------- 1 | #define __NVSHMEM_NUMBA_SUPPORT__ 2 | #include 3 | #include 4 | -------------------------------------------------------------------------------- /perftest/device/tile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(tile_allreduce_latency.cu) 2 | nvshmem_add_perftest(tile_allgather_latency.cu) 3 | -------------------------------------------------------------------------------- /nvshmem_bootstrap.sym: -------------------------------------------------------------------------------- 1 | { 2 | global: nvshmemi_bootstrap_plugin_init; 3 | nvshmemi_bootstrap_plugin_pre_init; 4 | local: *; 5 | }; 6 | -------------------------------------------------------------------------------- /perftest/host/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(bw.cpp) 2 | nvshmem_add_perftest(latency.cpp) 3 | nvshmem_add_perftest(stream_latency.cu) 4 | -------------------------------------------------------------------------------- /test/host/interop/simplelib1.sym: -------------------------------------------------------------------------------- 1 | CUFFT { 2 | global: simplelib1_init; 3 | simplelib1_finalize; 4 | simplelib1_dowork; 5 | local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /test/host/interop/simplelib2.sym: 
-------------------------------------------------------------------------------- 1 | CUFFT { 2 | global: simplelib2_init; 3 | simplelib2_finalize; 4 | simplelib2_dowork; 5 | local: *; 6 | }; 7 | -------------------------------------------------------------------------------- /test/unit/mem/heap/internal/host/nvshmem_nvtx.hpp: -------------------------------------------------------------------------------- 1 | #ifndef _NVSHMEM_NVTX_HPP_ 2 | #define _NVSHMEM_NVTX_HPP_ 3 | 4 | #define NVTX_FUNC_RANGE_IN_GROUP(G) ; 5 | 6 | #endif -------------------------------------------------------------------------------- /src/modules/transport/ucx/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_ucx ucx.cpp OFF ON OFF OFF) 2 | target_link_libraries(nvshmem_transport_ucx PRIVATE ucx::ucs ucx::ucp) 3 | -------------------------------------------------------------------------------- /test/device/query/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(hello.cu) 2 | nvshmem_add_test(hello-team.cu) 3 | nvshmem_add_test(info.cu) 4 | nvshmem_add_test(ptr.cu) 5 | nvshmem_add_test(mc_ptr.cu) 6 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/_numbast.py: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b4a5b3944fdd88661742e667444c2cd82fcc2cddb20409dbfd718b4fd1ddfc4a 3 | size 5738544 4 | -------------------------------------------------------------------------------- /scripts/bitcode_lib_cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | myVar="$(cat $1 | grep -E '!([0-9]+) = !\{[^"]+"nvvm-reflect-ftz"' | cut -d ' ' -f 1)" 4 | awk '!/nvvm-reflect-ftz/' $1 | sed "/^\!llvm\.module\.flags = 
/s/$myVar, //" > $2 -------------------------------------------------------------------------------- /test/unit/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_UNIT_TEST_PREFIX "host_") 2 | macro(nvshmem_add_unit_test SOURCE) 3 | nvshmem_add_unit_test_prefix(${SOURCE} ${NVSHMEM_UNIT_TEST_PREFIX}) 4 | endmacro() 5 | 6 | add_subdirectory(bootstrap) 7 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_cuda12.txt: -------------------------------------------------------------------------------- 1 | # These are the run-time dependencies of nvshmem4py 2 | # They are non-negotiables 3 | nvidia-nvshmem-cu12 4 | cuda-python>=12.0,<=12.9 5 | cuda.core==0.4 6 | cuda.pathfinder==1.2.3 7 | numpy 8 | Cython>=0.29.24 9 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_cuda13.txt: -------------------------------------------------------------------------------- 1 | # These are the run-time dependencies of nvshmem4py 2 | # They are non-negotiables 3 | nvidia-nvshmem-cu13 4 | cuda-python>=13.0,<14.0 5 | cuda.core==0.4 6 | cuda.pathfinder==1.2.3 7 | numpy 8 | Cython>=0.29.24 9 | -------------------------------------------------------------------------------- /src/modules/bootstrap/mpi/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_mpi.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_mpi ${SOURCE_LIST}) 4 | 5 | find_package(MPI REQUIRED) 6 | 7 | target_link_libraries(nvshmem_bootstrap_mpi PRIVATE MPI::MPI_C) 8 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_uid.cpp ncclSocket/ncclsocket_socket.cpp) 2 | 3 | 
nvshmem_add_bootstrap(nvshmem_bootstrap_uid ${SOURCE_LIST}) 4 | 5 | target_include_directories(nvshmem_bootstrap_uid ncclSocket) 6 | -------------------------------------------------------------------------------- /test/host/interop/simplelib1.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "nvshmem.h" 3 | #include "cuda.h" 4 | #include "cuda_runtime.h" 5 | 6 | extern "C" { 7 | void simplelib1_init(); 8 | int simplelib1_dowork(); 9 | void simplelib1_finalize(); 10 | } 11 | -------------------------------------------------------------------------------- /test/host/interop/simplelib2.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include "nvshmem.h" 3 | #include "cuda.h" 4 | #include "cuda_runtime.h" 5 | 6 | extern "C" { 7 | void simplelib2_init(); 8 | int simplelib2_dowork(); 9 | void simplelib2_finalize(); 10 | } 11 | -------------------------------------------------------------------------------- /test/host/team/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(shmem_team_max.cpp) 2 | nvshmem_add_test(shmem_team_dup.cpp) 3 | nvshmem_add_test(shmem_team_reuse_teams.cpp) 4 | nvshmem_add_test(shmem_team_split_2d.cpp) 5 | nvshmem_add_test(shmem_team_translate_2.cpp) 6 | -------------------------------------------------------------------------------- /perftest/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "perf_host_") 2 | macro(nvshmem_add_perftest SOURCE) 3 | nvshmem_add_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 4 | endmacro() 5 | add_subdirectory(coll) 6 | add_subdirectory(init) 7 | add_subdirectory(pt-to-pt) 8 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/commit_info.txt: 
-------------------------------------------------------------------------------- 1 | commit 68c00a0000fd021070b61edc14b7547325a1c6ff (HEAD -> master, tag: v2.19.4-1, origin/stable, origin/master, origin/HEAD) 2 | Author: Sylvain Jeaugey 3 | Date: Wed Nov 8 18:49:01 2023 -0800 4 | 5 | NCCL 2.19.4-1 -------------------------------------------------------------------------------- /src/modules/transport/libfabric/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | nvshmem_add_transport(nvshmem_transport_libfabric libfabric.cpp ON ON OFF OFF) 2 | target_include_directories(nvshmem_transport_libfabric PRIVATE ${LIBFABRIC_HOME}/include) 3 | target_link_libraries(nvshmem_transport_libfabric PRIVATE ${FABRIC_lib}) 4 | -------------------------------------------------------------------------------- /perftest/perftest-p2p-cudagraph.list: -------------------------------------------------------------------------------- 1 | /host/coll/barrier_all_on_stream 2 | /host/coll/barrier_on_stream 3 | /host/coll/sync_all_on_stream 4 | /host/coll/sync_on_stream 5 | /host/coll/alltoall_on_stream 6 | /host/coll/broadcast_on_stream 7 | /host/coll/fcollect_on_stream 8 | /host/coll/reduction_on_stream 9 | -------------------------------------------------------------------------------- /src/include/nvshmem_host.h: -------------------------------------------------------------------------------- 1 | /** 2 | * NVSHmem Host Include 3 | * This file exists so that we can cleanly include only the nvshmem host library headers 4 | */ 5 | 6 | #ifndef NVSHMEM_HOST_H 7 | #define NVSHMEM_HOST_H 8 | #include "host/nvshmem_api.h" 9 | #include "host/nvshmemx_api.h" 10 | #endif 11 | -------------------------------------------------------------------------------- /test/apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "apps_") 2 | macro(nvshmem_add_test SOURCE) 3 | 
nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | add_subdirectory(cufft) 6 | add_subdirectory(dgl) 7 | if(NVSHMEM_USE_NCCL) 8 | add_subdirectory(interop) 9 | endif() 10 | -------------------------------------------------------------------------------- /test/unit/mem/transport/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(GTest_FOUND) 2 | nvshmem_add_gtest_unit_test_prefix(P2P_unit_tests.cpp "transport_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_transport.cpp") 3 | nvshmem_add_gtest_unit_test_prefix(remote_unit_tests.cpp "transport_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_transport.cpp") 4 | endif() -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_and.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, AND) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_or.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, OR) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_xor.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, XOR) 11 | -------------------------------------------------------------------------------- /src/host/coll/fcollect/fcollect_on_stream.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H 8 | #define NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H 9 | #include "fcollect_common.h" 10 | 11 | #endif /* NVSHMEMI_FCOLLECT_ON_STREAM_CPU_H */ 12 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmix/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_pmix.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmix ${SOURCE_LIST}) 4 | 5 | find_library(PMIX_lib NAMES pmix HINTS "${PMIX_HOME}/lib") 6 | 7 | target_link_libraries(nvshmem_bootstrap_pmix PRIVATE ${PMIX_lib}) 8 | target_include_directories(nvshmem_bootstrap_pmix PRIVATE ${PMIX_HOME}/include) 9 | -------------------------------------------------------------------------------- /test/host/init/kernel_nvshmem.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | __global__ void kernel_nvshmem(int* destination) { 5 | int mype = nvshmem_my_pe(); 6 | int npes = nvshmem_n_pes(); 7 | assert(npes > 0); 8 | int peer = (mype + 1) % npes; 9 | nvshmem_int_p(destination, 3 * peer + 14, peer); 10 | nvshmem_barrier_all(); 11 | } 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_or.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, OR) 11 | -------------------------------------------------------------------------------- /test/host/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(alltoall.cu) 2 | nvshmem_add_test(barrier.cu) 3 | nvshmem_add_test(barrier_all.cu) 4 | nvshmem_add_test(broadcast.cu) 5 | nvshmem_add_test(fcollect.cu) 6 | nvshmem_add_test(collective_launch_choose_grid.cu) 7 | nvshmem_add_test(collective_launch_user_specified_grid.cu) 8 | nvshmem_add_test(reduce.cu) 9 | nvshmem_add_test(reducescatter.cu) 10 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_and.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, AND) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_xor.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, XOR) 11 | -------------------------------------------------------------------------------- /pkg/nvshmem_package_description.txt: -------------------------------------------------------------------------------- 1 | 2 | NVSHMEM is a parallel programming interface based on OpenSHMEM 3 | that provides efficient and scalable communication for NVIDIA GPU 4 | clusters. NVSHMEM creates a global address space for data that spans 5 | the memory of multiple GPUs and can be accessed with fine-grained 6 | GPU-initiated operations, CPU-initiated operations, and operations 7 | on CUDA(R) streams. 8 | -------------------------------------------------------------------------------- /test/device/sync/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(sync_test.cu) 2 | nvshmem_add_test(test_all_any_some.cu) 3 | nvshmem_add_test(test_any.cu) 4 | nvshmem_add_test(test_some.cu) 5 | nvshmem_add_test(test_vector.cu) 6 | nvshmem_add_test(wait_until.cu) 7 | nvshmem_add_test(wait_until_all.cu) 8 | nvshmem_add_test(wait_until_any.cu) 9 | nvshmem_add_test(wait_until_some.cu) 10 | nvshmem_add_test(wait_until_vector.cu) 11 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_optional_cuda13.txt: -------------------------------------------------------------------------------- 1 | # These are optional dependencies 2 | # nvshmem4py is aware of and interoperable with them 3 | # but doesn't depend on them 4 | mpi4py 5 | --index-url https://download.pytorch.org/whl/cu130 6 | --extra-index-url https://pypi.org/simple 7 | torch==2.9.0 8 | --index-url https://pypi.org/simple 9 | ml-dtypes 10 | # These are cu13 packages 11 | 
nvidia-cuda-nvcc 12 | nvidia-nvjitlink -------------------------------------------------------------------------------- /test/host/mem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(align_bins.cpp) 2 | nvshmem_add_test(align_free_reorder.cpp) 3 | nvshmem_add_test(calloc.cpp) 4 | nvshmem_add_test(malloc_bins.cpp) 5 | nvshmem_add_test(malloc_free_loop.cpp) 6 | nvshmem_add_test(mmap_unmap_loop.cu) 7 | nvshmem_add_test(malloc_free_reorder.cpp) 8 | nvshmem_add_test(malloc_loop.cpp) 9 | nvshmem_add_test(malloc_simple.cpp) 10 | nvshmem_add_test(register_buffer.cu) 11 | -------------------------------------------------------------------------------- /test/unit/mem/heap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(GTest_FOUND) 2 | nvshmem_add_gtest_unit_test_prefix(nvshmemi_symmetric_heap_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 3 | nvshmem_add_gtest_unit_test_prefix(static_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 4 | nvshmem_add_gtest_unit_test_prefix(dynamic_unit_tests.cpp "heap_" "${NVSHMEM_TEST_TLD}/../src/host/mem/mem_heap.cpp") 5 | endif() -------------------------------------------------------------------------------- /test/device/tile/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(tile_put.cu) 2 | nvshmem_add_test(tile_get.cu) 3 | nvshmem_add_test(tile_bcast_pred.cu) 4 | nvshmem_add_test(tile_reduce.cu) 5 | nvshmem_add_test(tile_allreduce.cu) 6 | nvshmem_add_test(tile_allreduce_pred.cu) 7 | nvshmem_add_test(tile_allgather.cu) 8 | nvshmem_add_test(tile_allgather_pred.cu) 9 | nvshmem_add_test(tile_allreduce_1D.cu) 10 | nvshmem_add_test(tile_allgather_1D.cu) 11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_max.cu: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MAX) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MAX) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_min.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MIN) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, MIN) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_sum.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, SUM) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, SUM) 12 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_build.txt: -------------------------------------------------------------------------------- 1 | # These are the build-time python/pip requirements for nvshmem4py 2 | # They are mostly required to run Cybind and build wheels 3 | networkx 4 | numpy 5 | pycparser 6 | build 7 | Cython>=0.29.24 8 | setuptools==68 9 | setuptools_scm 10 | testresources 11 | wheel 12 | auditwheel 13 | patchelf 14 | # The following are Numbast requirements 15 | pybind11 16 | pyyaml 17 | click 18 | scikit-build-core 19 | ninja 20 | 21 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_prod.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, PROD) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_RDXN_ON_STREAM_KERNEL, PROD) 12 | -------------------------------------------------------------------------------- /src/include/internal/host/shared_memory.h: -------------------------------------------------------------------------------- 1 | #ifndef SHARED_MEMORY_H 2 | #define SHARED_MEMORY_H 3 | 4 | #include "internal/host/nvshmem_internal.h" // for nvshmemi_shared_memory_info 5 | 6 | int shared_memory_create(const char *name, size_t sz, nvshmemi_shared_memory_info *info); 7 | int shared_memory_open(const char *name, size_t sz, nvshmemi_shared_memory_info *info); 8 | void shared_memory_close(char *shm_name, nvshmemi_shared_memory_info *info); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /perftest/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "perf_device_") 2 | 3 | macro(nvshmem_add_perftest SOURCE) 4 | nvshmem_add_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 5 | endmacro() 6 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 7 | macro(nvshmem_add_cubin_perftest SOURCE) 8 | nvshmem_add_cubin_perftest_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 9 | endmacro() 10 | endif() 11 | add_subdirectory(tile) 12 | add_subdirectory(coll) 13 | add_subdirectory(pt-to-pt) 14 | -------------------------------------------------------------------------------- /perftest/host/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(alltoall_on_stream.cpp) 2 | nvshmem_add_perftest(barrier_all_on_stream.cpp) 3 | nvshmem_add_perftest(barrier_on_stream.cpp) 4 | 
nvshmem_add_perftest(broadcast_on_stream.cpp) 5 | nvshmem_add_perftest(fcollect_on_stream.cpp) 6 | nvshmem_add_perftest(reduction_on_stream.cpp) 7 | nvshmem_add_perftest(reducescatter_on_stream.cpp) 8 | nvshmem_add_perftest(sync_all_on_stream.cpp) 9 | nvshmem_add_perftest(sync_on_stream.cpp) 10 | -------------------------------------------------------------------------------- /perftest/perfTestRunnerSlurm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from subprocess import check_call,CalledProcessError #2.6 3 | 4 | check_call(["sbatch", "-N", "1", "--qos=short", "-p", "dgx-1p", "perfTestRunner.py"]) 5 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "dgx-1v", "perfTestRunner.py"]) 6 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "hsw_p100", "perfTestRunner.py"]) 7 | #check_call(["sbatch", "-N", "1", "--qos=short", "-p", "hsw_v100", "perfTestRunner.py"]) 8 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_max.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MAX) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MAX) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_min.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MIN) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, MIN) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_sum.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, SUM) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, SUM) 12 | -------------------------------------------------------------------------------- /src/host/stream/coll/reducescatter/reducescatter_prod.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reducescatter_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | REPT_FOR_BITWISE_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, PROD) 11 | REPT_FOR_FLOATING_TYPES(INSTANTIATE_NVSHMEMI_CALL_REDUCESCATTER_ON_STREAM_KERNEL, PROD) 12 | -------------------------------------------------------------------------------- /test/host/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "host_") 2 | macro(nvshmem_add_test SOURCE) 3 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | 6 | macro(nvshmem_add_test_no_device SOURCE) 7 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} OFF) 8 | endmacro() 9 | 10 | add_subdirectory(coll) 11 | add_subdirectory(init) 12 | add_subdirectory(interop) 13 | add_subdirectory(mem) 14 | add_subdirectory(pt-to-pt) 15 | add_subdirectory(team) 16 | -------------------------------------------------------------------------------- /nvshmem4py/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include setup.py 3 | include README.md 4 | include License.txt 5 | include requirements_cuda11.txt 6 | include requirements_cuda12.txt 7 | include requirements_cuda13.txt 8 | include nvshmem/bindings/_internal/*.pyx 9 | include nvshmem/bindings/_internal/*.pxd 10 | include nvshmem/bindings/*.pxd 11 | include nvshmem/bindings/*.pyx 12 | include nvshmem/bindings/_internal/*.cpp 13 | include nvshmem/core/*.py 14 | recursive-include nvshmem * 15 | -------------------------------------------------------------------------------- /test/host/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(fence.cpp) 2 | nvshmem_add_test(g.cpp) 3 | nvshmem_add_test(get.cpp) 4 | nvshmem_add_test(iget.cpp) 5 
| nvshmem_add_test(iput.cpp) 6 | nvshmem_add_test(p.cpp) 7 | nvshmem_add_test(put.cpp) 8 | nvshmem_add_test(quiet_on_stream.cu) 9 | nvshmem_add_test(quiet.cpp) 10 | nvshmem_add_test(signal_on_stream.cpp) 11 | nvshmem_add_test(wait_until_all_on_stream.cpp) 12 | nvshmem_add_test(wait_until_all_vector_on_stream.cpp) 13 | nvshmem_add_test(wait.cpp) 14 | -------------------------------------------------------------------------------- /test/common/test_teams.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_TEAMS_H 2 | #define TEST_TEAMS_H 3 | 4 | #include "nvshmem.h" 5 | #include "nvshmemx.h" 6 | #include 7 | #include 8 | #include 9 | 10 | extern std::unordered_map map_team_to_string; 11 | extern std::unordered_map map_string_to_team; 12 | 13 | bool get_next_team(nvshmem_team_t *team); 14 | void init_test_teams(); 15 | void finalize_test_teams(); 16 | 17 | #endif /* TEST_TEAMS_H */ 18 | -------------------------------------------------------------------------------- /test/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(NVSHMEM_TEST_PREFIX "device_") 2 | macro(nvshmem_add_test SOURCE) 3 | nvshmem_add_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX} ON) 4 | endmacro() 5 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 6 | macro(nvshmem_add_cubin_test SOURCE) 7 | nvshmem_add_cubin_test_prefix(${SOURCE} ${NVSHMEM_TEST_PREFIX}) 8 | endmacro() 9 | endif() 10 | add_subdirectory(tile) 11 | add_subdirectory(coll) 12 | add_subdirectory(init) 13 | add_subdirectory(pt-to-pt) 14 | add_subdirectory(query) 15 | add_subdirectory(sync) 16 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_param.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_PARAM_H_ 8 | #define NCCL_SOCKET_PARAM_H_ 9 | 10 | #include // for getenv 11 | 12 | static inline const char *ncclGetEnv(const char *name) { 13 | return getenv(name); 14 | } 15 | 16 | #endif -------------------------------------------------------------------------------- /src/include/non_abi/device/coll/defines.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _NVSHMEMI_DEVICE_COLL_DEFINES_H_ 8 | #define _NVSHMEMI_DEVICE_COLL_DEFINES_H_ 9 | 10 | #include "alltoall.cuh" 11 | #include "barrier.cuh" 12 | #include "broadcast.cuh" 13 | #include "fcollect.cuh" 14 | #include "reduce.cuh" 15 | #include "reducescatter.cuh" 16 | #include "broadcast.cuh" 17 | #include "fcollect.cuh" 18 | 19 | #endif /* NVSHMEMI_DEVICE_COLL_DEFINES_H */ 20 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/device/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | __all__ = ["numba"] 12 | -------------------------------------------------------------------------------- /src/host/topo/topo.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef __TOPO_H 8 | #define __TOPO_H 9 | #include "internal/host/nvshmemi_types.h" // for nvshmemi_state_t 10 | 11 | int nvshmemi_get_devices_by_distance(int *device_arr, int max_dev_per_pe, 12 | struct nvshmem_transport *tcurr); 13 | int nvshmemi_detect_same_device(nvshmemi_state_t *state); 14 | int nvshmemi_build_transport_map(nvshmemi_state_t *state); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/modules/bootstrap/shmem/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_shmem.c) 2 | 3 | nvshmem_add_bootstrap(nvshmem_bootstrap_shmem ${SOURCE_LIST}) 4 | 5 | find_library( 6 | SHMEM_LIB 7 | NAMES oshmem 8 | HINTS ${SHMEM_HOME} 9 | PATH_SUFFIXES lib lib64) 10 | find_path(SHMEM_INCLUDE NAME shmem.h HINTS ${SHMEM_HOME} 11 | PATH_SUFFIXES include 12 | ) 13 | add_library(shmem IMPORTED INTERFACE) 14 | target_link_libraries(shmem INTERFACE ${SHMEM_LIB}) 15 | target_include_directories(shmem INTERFACE ${SHMEM_INCLUDE}) 16 | 17 | target_link_libraries(nvshmem_bootstrap_shmem PRIVATE shmem) 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/QUESTION.yaml: -------------------------------------------------------------------------------- 1 | name: NVSHMEM question 2 | description: Ask the NVSHMEM team a question 3 | title: "[Question]: " 4 | labels: ["question"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for reaching out! To solve your problem, feel free to check out the [user guide](https://docs.nvidia.com/nvshmem/api/using.html), in particular the FAQ section, and the [release notes](https://docs.nvidia.com/nvshmem/release-notes-install-guide/release-notes/index.html). 
11 | --- 12 | - type: textarea 13 | id: question 14 | attributes: 15 | label: Question -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from utils import uid_init, mpi_init 4 | from nvshmem.core import finalize 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption("--init-type", action="store", default="uid", help="Method to initialize NVSHMEM", choices=["uid", "mpi"]) 8 | 9 | @pytest.fixture(scope="session", autouse=True) 10 | def nvshmem_init_fini(request): 11 | init_type = request.config.getoption("--init-type") 12 | if init_type == "uid": 13 | uid_init() 14 | elif init_type == "mpi": 15 | mpi_init() 16 | 17 | yield 18 | 19 | finalize() 20 | 21 | -------------------------------------------------------------------------------- /src/include/host/nvshmem_macros.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _NVSHMEM_MACROS_H_ 3 | #define _NVSHMEM_MACROS_H_ 4 | 5 | #include 6 | #include "non_abi/nvshmem_build_options.h" 7 | 8 | #ifdef __CUDA_ARCH__ 9 | #ifdef NVSHMEMI_HOST_ONLY 10 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ 11 | #else 12 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ __device__ 13 | #endif 14 | #else 15 | #define NVSHMEMI_HOSTDEVICE_PREFIX 16 | #endif 17 | 18 | #if defined NVSHMEM_HOSTLIB_ONLY 19 | #undef NVSHMEMI_HOSTDEVICE_PREFIX 20 | #define NVSHMEMI_HOSTDEVICE_PREFIX __host__ __device__ __attribute__((always_inline)) 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/modules/bootstrap/common/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | add_library(nvshmem_bootstrap_common STATIC bootstrap_util.cpp) 2 | 3 | set_target_properties(nvshmem_bootstrap_common PROPERTIES 4 | 
POSITION_INDEPENDENT_CODE ON 5 | CXX_STANDARD_REQUIRED ON 6 | CUDA_STANDARD_REQUIRED ON 7 | CXX_STANDARD 11 8 | CUDA_STANDARD 11 9 | CUDA_SEPARABLE_COMPILATION ON 10 | ) 11 | 12 | target_include_directories(nvshmem_bootstrap_common INTERFACE 13 | ${CMAKE_CURRENT_SOURCE_DIR} 14 | ) 15 | 16 | target_include_directories(nvshmem_bootstrap_common PRIVATE 17 | ${CMAKE_SOURCE_DIR}/include 18 | ) 19 | -------------------------------------------------------------------------------- /nvshmem4py/requirements_optional_cuda12.txt: -------------------------------------------------------------------------------- 1 | # These are optional dependencies 2 | # nvshmem4py is aware of and interoperable with them 3 | # but doesn't depend on them 4 | mpi4py 5 | --index-url https://download.pytorch.org/whl/cu129 6 | --extra-index-url https://pypi.org/simple 7 | torch==2.8.0 8 | --index-url https://pypi.org/simple 9 | ml-dtypes 10 | # NOTE! If you are installing PyTorch, you should install these requirements AFTER PyTorch. 11 | # Pytorch has a hard pin to these libraries. Because of CUDA-compatibility, these versions work 12 | # But older ones that pytorch pulls in will not. 13 | nvidia-cuda-nvcc-cu12 14 | nvidia-nvjitlink-cu12 15 | -------------------------------------------------------------------------------- /src/include/internal/host_transport/nvshmemi_transport_defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef __NVSHMEMI_TRANSPORT_DEFINES_H 8 | #define __NVSHMEMI_TRANSPORT_DEFINES_H 9 | 10 | #define NVSHMEM_MEM_HANDLE_SIZE 512 11 | 12 | #define NVSHMEM_PCIE_BDF_BUFFER_LEN 50 13 | 14 | typedef struct pcie_identifier { 15 | int dev_id; 16 | int bus_id; 17 | int domain_id; 18 | } pcie_id_t; 19 | 20 | typedef struct nvshmem_mem_handle { 21 | char reserved[NVSHMEM_MEM_HANDLE_SIZE]; 22 | } nvshmem_mem_handle_t; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/internal/host/sockets.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef SOCKETS_H 8 | #define SOCKETS_H 9 | 10 | #include 11 | 12 | typedef struct ipcHandle_st { 13 | int socket; 14 | char *socketName; 15 | } ipcHandle; 16 | 17 | int ipcOpenSocket(ipcHandle *&handle, pid_t, pid_t); 18 | 19 | int ipcCloseSocket(ipcHandle *handle); 20 | 21 | int ipcRecvFd(ipcHandle *handle, int *fd); 22 | 23 | int ipcSendFd(ipcHandle *handle, const int fd, pid_t process, pid_t); 24 | int ipcCloseFd(int fd); 25 | 26 | #endif /* SOCKETS_H */ 27 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # This is config file for git pre commit hooks 2 | # It requires the pre-commit package from https://pre-commit.com/ 3 | # pre-commit is used to run git pre-commit hooks with 4 | # a lot of pre-built hooks available on the website 5 | # 6 | # Install the pre-commit package and run pre-commit install 7 | # from the git repository 8 | 9 | # The following hook would install clang-format hook and 10 | # run it for every commit. 
It would apply clang-format fixes 11 | # and leave them as unstaged. 12 | repos: 13 | - repo: https://github.com/pre-commit/mirrors-clang-format 14 | rev: v13.0.0 15 | hooks: 16 | - id: clang-format 17 | types: [c, c++, cuda] 18 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 8 | # 9 | # See License.txt for license information 10 | 11 | 12 | from .nvshmem import * 13 | 14 | # Define what gets exposed when users do `import nvshmem.bindings` 15 | __all__ = [name for name in dir() if not name.startswith("_")] 16 | -------------------------------------------------------------------------------- /src/host/coll/barrier/barrier.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_BARRIER_CPU_H 8 | #define NVSHMEMI_BARRIER_CPU_H 9 | #include 10 | #include "device_host/nvshmem_types.h" 11 | 12 | int nvshmemi_call_barrier_on_stream_kernel(nvshmem_team_t team, cudaStream_t stream); 13 | int nvshmemi_call_sync_on_stream_kernel(nvshmem_team_t team, cudaStream_t stream); 14 | void nvshmemxi_barrier_all_on_stream(cudaStream_t); 15 | void nvshmemxi_barrier_on_stream(nvshmem_team_t team, cudaStream_t stream); 16 | void nvshmemxi_sync_all_on_stream(cudaStream_t); 17 | 18 | #endif /* NVSHMEMI_BARRIER_CPU_H */ 19 | -------------------------------------------------------------------------------- /src/host/transport/p2p/p2p.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _P2P_H 8 | #define _P2P_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | #include 13 | #include "internal/host_transport/nvshmemi_transport_defines.h" 14 | 15 | typedef struct { 16 | int ndev; 17 | CUdevice *cudev; 18 | int *devid; 19 | CUdeviceptr *curetval; 20 | CUdevice cudevice; 21 | int device_id; 22 | uint64_t hostHash; 23 | pcie_id_t *pcie_ids; 24 | char pcie_bdf[NVSHMEM_PCIE_BDF_BUFFER_LEN]; 25 | } transport_p2p_state_t; 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/alltoall_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `alltoall_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "alltoall") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/fcollect_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `fcollect_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "fcollect") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/reduction_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `reduction_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "reduce") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/broadcast_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `broadcast_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "broadcast") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /nvshmem4py/perftest/reducescatter_on_stream.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a Python implementation of the `reducescatter_on_stream` NVSHMEM Perftest 3 | 4 | The options are identical, although CUDA graph-based kernel launches are not yet supported. 
5 | """ 6 | import argparse 7 | 8 | from cuda.core.experimental._event import Event 9 | from cuda.core.experimental import Device, system 10 | import cuda.core 11 | 12 | import nvshmem.core 13 | 14 | from utils import build_parser, print_runtime_options, uid_init, print_header, print_result, run_coll_benchmark 15 | 16 | if __name__ == '__main__': 17 | args = build_parser() 18 | print_runtime_options(args) 19 | uid_init() 20 | run_coll_benchmark(args, "reducescatter") 21 | nvshmem.core.finalize() 22 | -------------------------------------------------------------------------------- /Compatibility.md: -------------------------------------------------------------------------------- 1 | # Compatibility with NVSHMEM 2 | 3 | NVSHMEM follows semantic versioning for its releases and packages per commit i.e `MAJOR.MINOR.PATCH.TWEAK`. 4 | - Each component of the version is monotonically increasing number. So, if the author makes non-source change e.g. `test`, `perftest`, etc, it would require updating `TWEAK` component of the version. 5 | - If the author makes a change to the source file, but not the ABI or API, it is PATCH change by 1 and `TWEAK` resets. 6 | - If the author makes a change to the API/ABI definition in a backward compat way, it is MINOR change by 1 and TWEAK/PATCH reset to 0. 7 | - If the author makes a change to the ABI/API definition in the non-backward compat way, it is MAJOR change by 1 and TWEAK/PATCH/MINOR resets to 0. 
8 | -------------------------------------------------------------------------------- /test/host/init/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_test(cuobject_init.cpp) 2 | nvshmem_add_test(nvrtc_api.cpp) 3 | if(NOT CUDA_VERSION VERSION_LESS 12000) 4 | # This test depends on NVJitLink which is not available in CUDA 11.x 5 | nvshmem_add_test(nvrtc_numba_ltoir.cpp) 6 | endif() 7 | nvshmem_add_test(nvshmemx_init_status.cpp) 8 | nvshmem_add_test(static_init.cpp) 9 | nvshmem_add_test(init_loop.cpp) 10 | nvshmem_add_test(global_exit.cpp) 11 | 12 | if(NVSHMEM_MPI_SUPPORT) 13 | nvshmem_add_test(mpi_init.cpp) 14 | nvshmem_add_test(uid_init.cpp) 15 | nvshmem_add_test(nvshmemx_init_with_device.cpp) 16 | nvshmem_add_test_no_device(nvshmemx_hostlib_init_attr.cpp) 17 | endif() 18 | 19 | if(NVSHMEM_SHMEM_SUPPORT) 20 | nvshmem_add_test(shmem_init.cpp) 21 | endif() 22 | -------------------------------------------------------------------------------- /nvshmem4py/test/wheel_sanity_test.py: -------------------------------------------------------------------------------- 1 | import nvshmem 2 | 3 | import os 4 | 5 | def test_import_modules(): 6 | print("Testing import modules") 7 | # Import modules 8 | import nvshmem 9 | import nvshmem.core 10 | import nvshmem.bindings 11 | 12 | # Import core APIs 13 | from nvshmem.core import init, finalize 14 | 15 | # Import bindings 16 | print("Testing import bindings") 17 | attr = nvshmem.bindings.InitAttr() 18 | 19 | # Can't run these because it assumes stuff about nvshmem state 20 | from nvshmem.bindings import hostlib_finalize, hostlib_init_attr, uniqueid, check_status 21 | 22 | # Get version info 23 | print(nvshmem.core.get_version()) 24 | 25 | if __name__ == '__main__': 26 | test_import_modules() 27 | -------------------------------------------------------------------------------- /test/common/data_check.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _DATA_CHECK_H_ 8 | #define _DATA_CHECK_H_ 9 | 10 | #include "cuda_runtime.h" 11 | template 12 | int init_data_ring(T *buf, size_t size, int disp, int iters, int mype, int npes, int *nextpe, 13 | int *prevpe, int seed, cudaStream_t cstrm); 14 | template 15 | int init_data_alltoall(T *buf, size_t size, int disp, int iters, int mype, int npes, int seed, 16 | cudaStream_t cstrm); 17 | template 18 | int check_data_ring(T *buf, cudaStream_t); 19 | template 20 | int check_data_alltoall(T *buf, cudaStream_t); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | # 9 | # See License.txt for license information 10 | 11 | """ 12 | __init__.py for nvshmem Python package 13 | """ 14 | import os 15 | 16 | __package_name__ = os.getenv("PACKAGE_NAME", "nvshmem4py") 17 | 18 | # Version is autogenerated by setuptools_scm 19 | from .version import version, __version__, __version_tuple__ 20 | -------------------------------------------------------------------------------- /perftest/device/coll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(alltoall_latency.cu) 2 | nvshmem_add_perftest(barrier_latency.cu) 3 | nvshmem_add_perftest(bcast_latency.cu) 4 | nvshmem_add_perftest(fcollect_latency.cu) 5 | nvshmem_add_perftest(reducescatter_latency.cu) 6 | nvshmem_add_perftest(reduction_latency.cu) 7 | nvshmem_add_perftest(sync_latency.cu) 8 | 9 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 10 | nvshmem_add_cubin_perftest(alltoall_latency.cu) 11 | nvshmem_add_cubin_perftest(barrier_latency.cu) 12 | nvshmem_add_cubin_perftest(bcast_latency.cu) 13 | nvshmem_add_cubin_perftest(fcollect_latency.cu) 14 | nvshmem_add_cubin_perftest(reducescatter_latency.cu) 15 | nvshmem_add_cubin_perftest(reduction_latency.cu) 16 | nvshmem_add_cubin_perftest(sync_latency.cu) 17 | endif() 18 | -------------------------------------------------------------------------------- /test/common/ring_alltoall.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _RING_ALL_TO_ALL_H_ 8 | #define _RING_ALL_TO_ALL_H_ 9 | 10 | #include 11 | 12 | #define MAX_MSG_SIZE 65536 13 | #define ITER 100 14 | 15 | typedef void (*launch_alltoall_ptr_t)(void *, void *, size_t, int, int, cudaStream_t); 16 | typedef void (*launch_ring_ptr_t)(void *, void *, size_t, int, int, cudaStream_t); 17 | 18 | int setup(bool is_scalar, int disp, size_t max_size = MAX_MSG_SIZE, uint64_t max_iter = ITER, 19 | bool local_dest = false, int *argc = NULL, char ***argv = NULL); 20 | void cleanup(); 21 | template 22 | int test(launch_alltoall_ptr_t, launch_ring_ptr_t); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/include/internal/host/nvshmemi_coll.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "host/nvshmem_macros.h" 14 | #include "device_host/nvshmem_types.h" 15 | 16 | #ifndef NVSHMEMI_COLL_H 17 | #define NVSHMEMI_COLL_H 18 | 19 | NVSHMEMI_HOSTDEVICE_PREFIX void nvshmemi_barrier(nvshmem_team_t team); 20 | 21 | #endif /* NVSHMEMI_COLL_H */ 22 | -------------------------------------------------------------------------------- /test/device/query/hello.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | #define N 4 14 | 15 | __global__ void hello_world(void) { 16 | printf("Hello World from device PE %d <%d> of %d\n", nvshmem_my_pe(), threadIdx.x, 17 | nvshmem_n_pes()); 18 | } 19 | 20 | int main(int argc, char **argv) { 21 | init_wrapper(&argc, &argv); 22 | 23 | nvshmem_barrier_all(); /* Ensure NVSHMEM device init has completed */ 24 | 25 | printf("Hello World from host PE %d of %d\n", nvshmem_my_pe(), nvshmem_n_pes()); 26 | 27 | hello_world<<<1, N>>>(); 28 | 29 | finalize_wrapper(); 30 | return 0; 31 | } 32 | -------------------------------------------------------------------------------- /test/host/pt-to-pt/quiet_on_stream.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | 11 | #include "utils.h" 12 | 13 | #define NUM_ITERS 100 14 | 15 | int main(int argc, char **argv) { 16 | int status = 0; 17 | int num_iters = NUM_ITERS; 18 | cudaStream_t cstrm; 19 | 20 | init_wrapper(&argc, &argv); 21 | 22 | CUDA_CHECK(cudaStreamCreate(&cstrm)); 23 | 24 | for (int i = 0; i < num_iters; i++) { 25 | nvshmemx_quiet_on_stream(cstrm); 26 | CUDA_CHECK(cudaStreamSynchronize(cstrm)); 27 | } 28 | 29 | CUDA_CHECK(cudaStreamDestroy(cstrm)); 30 | 31 | nvshmem_barrier_all(); 32 | finalize_wrapper(); 33 | 34 | return status; 35 | } 36 | -------------------------------------------------------------------------------- /src/include/internal/common/error_codes_internal.h: -------------------------------------------------------------------------------- 1 | /**** 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | ****/ 6 | 7 | #ifndef NVSHMEM_ERROR_CODES_INTERNAL_H_ 8 | #define NVSHMEM_ERROR_CODES_INTERNAL_H_ 9 | 10 | typedef enum { 11 | NVSHMEMI_SUCCESS = 0, 12 | NVSHMEMI_UNHANDLED_CUDA_ERROR = 1, 13 | NVSHMEMI_SYSTEM_ERROR = 2, 14 | NVSHMEMI_INTERNAL_ERROR = 3, 15 | NVSHMEMI_INVALID_ARGUMENT = 4, 16 | NVSHMEMI_INVALID_USAGE = 5, 17 | NVSHMEMI_GET_CUCTX_FAILED = 6, 18 | NVSHMEMI_NOT_BOOTSTRAPPED = 7, 19 | NVSHMEMI_NOT_INITIALIZED = 8, 20 | NVSHMEMI_CUDA_GET_DEVICE_FAILED = 9, 21 | NVSHMEMI_INIT_DEVICE_ONLY_STATE_FAILED = 10, 22 | NVSHMEMI_ERROR_SKIPPED = 11, 23 | NVSHMEMI_NUM_RESULTS = 12 24 | } nvshmemResult_t; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/include/internal/host/error_codes_internal.h: -------------------------------------------------------------------------------- 1 | /**** 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ****/ 6 | 7 | #ifndef NVSHMEM_ERROR_CODES_INTERNAL_H_ 8 | #define NVSHMEM_ERROR_CODES_INTERNAL_H_ 9 | 10 | typedef enum { 11 | NVSHMEMI_SUCCESS = 0, 12 | NVSHMEMI_UNHANDLED_CUDA_ERROR = 1, 13 | NVSHMEMI_SYSTEM_ERROR = 2, 14 | NVSHMEMI_INTERNAL_ERROR = 3, 15 | NVSHMEMI_INVALID_ARGUMENT = 4, 16 | NVSHMEMI_INVALID_USAGE = 5, 17 | NVSHMEMI_GET_CUCTX_FAILED = 6, 18 | NVSHMEMI_NOT_BOOTSTRAPPED = 7, 19 | NVSHMEMI_NOT_INITIALIZED = 8, 20 | NVSHMEMI_CUDA_GET_DEVICE_FAILED = 9, 21 | NVSHMEMI_INIT_DEVICE_ONLY_STATE_FAILED = 10, 22 | NVSHMEMI_ERROR_SKIPPED = 11, 23 | NVSHMEMI_NUM_RESULTS = 12 24 | } nvshmemResult_t; 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/version.py: -------------------------------------------------------------------------------- 1 | # file generated by setuptools-scm 2 | # don't change, don't track in version control 3 | 4 | __all__ = [ 5 | "__version__", 6 
| "__version_tuple__", 7 | "version", 8 | "version_tuple", 9 | "__commit_id__", 10 | "commit_id", 11 | ] 12 | 13 | TYPE_CHECKING = False 14 | if TYPE_CHECKING: 15 | from typing import Tuple 16 | from typing import Union 17 | 18 | VERSION_TUPLE = Tuple[Union[int, str], ...] 19 | COMMIT_ID = Union[str, None] 20 | else: 21 | VERSION_TUPLE = object 22 | COMMIT_ID = object 23 | 24 | version: str 25 | __version__: str 26 | __version_tuple__: VERSION_TUPLE 27 | version_tuple: VERSION_TUPLE 28 | commit_id: COMMIT_ID 29 | __commit_id__: COMMIT_ID 30 | 31 | __version__ = version = '0.1.2' 32 | __version_tuple__ = version_tuple = (0, 1, 2) 33 | 34 | __commit_id__ = commit_id = None 35 | -------------------------------------------------------------------------------- /examples/shmem-based-init.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include 14 | #include "shmem.h" 15 | #include "nvshmem.h" 16 | 17 | int main(int c, char *v[]) { 18 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 19 | 20 | shmem_init(); 21 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_SHMEM, &attr); 22 | 23 | nvshmem_finalize(); 24 | shmem_finalize(); 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_npe.py: -------------------------------------------------------------------------------- 1 | from numba import cuda 2 | import cupy as cp 3 | import argparse 4 | 5 | from utils import uid_init, mpi_init 6 | 7 | from nvshmem.bindings.device.numba import n_pes, sync_all 8 | 9 | def test_npe(): 10 | 11 | @cuda.jit() 12 | def kernel_nvshmem(destination): 13 | npes = n_pes() 14 | sync_all() 15 | destination[0] = npes 16 | 17 | npes = cp.zeros(1, dtype="int32") 18 | kernel_nvshmem[1, 1](npes) 19 | 20 | assert npes[0] > 0 21 | print(f"{npes[0]=}") 22 | 23 | 24 | if __name__ == "__main__": 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 27 | args = parser.parse_args() 28 | if args.init_type == "uid": 29 | uid_init() 30 | elif args.init_type == "mpi": 31 | mpi_init() 32 | 33 | test_npe() -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_nccl.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_NCCL_H_ 8 | #define NCCL_SOCKET_NCCL_H_ 9 | 10 | /* Error type */ 11 | typedef enum { ncclSuccess = 0, 12 | ncclUnhandledCudaError = 1, 13 | ncclSystemError = 2, 14 | ncclInternalError = 3, 15 | ncclInvalidArgument = 4, 16 | ncclInvalidUsage = 5, 17 | ncclRemoteError = 6, 18 | ncclInProgress = 7, 19 | ncclNumResults = 8 } ncclResult_t; 20 | 21 | 22 | #endif -------------------------------------------------------------------------------- /examples/hello.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | 11 | int main(int argc, char **argv) { 12 | char hostname[256]; 13 | 14 | int ret = gethostname(hostname, 256); 15 | if (ret < 0) { 16 | printf("Failed to get hostname\n"); 17 | return 1; 18 | } 19 | 20 | printf("[%s][%ld] Starting up...\n", hostname, (long)getpid()); 21 | 22 | nvshmem_init(); 23 | 24 | int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 25 | cudaSetDevice(mype_node); 26 | void *ptr = nvshmem_malloc(1); // initialize NVSHMEM after device is set 27 | 28 | printf("[%s][%ld] Hello from PE %d of %d\n", hostname, (long)getpid(), nvshmem_my_pe(), 29 | nvshmem_n_pes()); 30 | 31 | nvshmem_finalize(); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /src/host/stream/comm/quiet_on_stream.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "internal/host/util.h" 8 | #include "internal/non_abi/nvshmemi_h_to_d_sync_defs.cuh" 9 | 10 | static int nvshmemi_quiet_maxblocksize = -1; 11 | 12 | void nvshmemi_call_proxy_quiet_entrypoint(cudaStream_t cstrm) { 13 | if (nvshmemi_quiet_maxblocksize == -1) { 14 | int tmp; 15 | CUDA_RUNTIME_CHECK(cudaOccupancyMaxPotentialBlockSize( 16 | &tmp, (int *)&nvshmemi_quiet_maxblocksize, nvshmemi_proxy_quiet_entrypoint)); 17 | } 18 | int status = cudaLaunchKernel((const void *)nvshmemi_proxy_quiet_entrypoint, 1, 19 | nvshmemi_quiet_maxblocksize, NULL, 0, cstrm); 20 | if (status) { 21 | NVSHMEMI_ERROR_PRINT("cudaLaunchKernel() failed in nvshmem_quiet_on_stream \n"); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmi/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | set(SOURCE_LIST bootstrap_pmi.cpp) 2 | 3 | if(NVSHMEM_BUILD_PMI_BOOTSTRAP) 4 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmi ${SOURCE_LIST}) 5 | 6 | find_library(PMI_lib NAMES pmi HINTS "${PMI_HOME}/lib") 7 | 8 | target_link_libraries(nvshmem_bootstrap_pmi PRIVATE ${PMI_lib}) 9 | target_compile_definitions(nvshmem_bootstrap_pmi PRIVATE NVSHMEM_CUSTOM_BOOTSTRAP_BUILD) 10 | target_include_directories(nvshmem_bootstrap_pmi PRIVATE ${PMI_HOME}/include) 11 | endif() 12 | 13 | if(NVSHMEM_BUILD_PMI2_BOOTSTRAP) 14 | nvshmem_add_bootstrap(nvshmem_bootstrap_pmi2 ${SOURCE_LIST}) 15 | 16 | find_library(PMI2_lib NAMES pmi2 HINTS "${PM2I_HOME}/lib") 17 | 18 | target_compile_definitions(nvshmem_bootstrap_pmi2 PRIVATE NVSHMEM_BUILD_PMI2) 19 | target_link_libraries(nvshmem_bootstrap_pmi2 PRIVATE ${PMI2_lib}) 20 | target_include_directories(nvshmem_bootstrap_pmi2 PRIVATE ${PMI2_HOME}/include) 21 | endif() 22 | -------------------------------------------------------------------------------- /perftest/perftest-mmap-full.list: 
-------------------------------------------------------------------------------- 1 | /host/coll/broadcast_on_stream 2 | /host/coll/reducescatter_on_stream 3 | /host/coll/fcollect_on_stream 4 | /host/coll/reduction_on_stream 5 | /host/coll/alltoall_on_stream 6 | /host/pt-to-pt/latency 7 | /host/pt-to-pt/bw 8 | /device/coll/bcast_latency 9 | /device/coll/fcollect_latency 10 | /device/coll/alltoall_latency 11 | /device/pt-to-pt/shmem_p_latency 12 | /device/pt-to-pt/shmem_put_bw 13 | /device/pt-to-pt/shmem_st_bw 14 | /device/pt-to-pt/shmem_p_ping_pong_latency 15 | /device/pt-to-pt/shmem_atomic_bw 16 | /device/pt-to-pt/shmem_g_bw 17 | /device/pt-to-pt/shmem_p_bw 18 | /device/pt-to-pt/shmem_get_bw 19 | /device/pt-to-pt/shmem_put_atomic_ping_pong_latency 20 | /device/pt-to-pt/shmem_get_latency 21 | /device/pt-to-pt/shmem_put_ping_pong_latency 22 | /device/pt-to-pt/shmem_g_latency 23 | /device/pt-to-pt/shmem_put_latency 24 | /device/pt-to-pt/shmem_signal_ping_pong_latency 25 | /device/pt-to-pt/shmem_put_signal_ping_pong_latency 26 | -------------------------------------------------------------------------------- /src/include/non_abi/nvshmem_version.h.in: -------------------------------------------------------------------------------- 1 | /* Note - For packaging reasons, do not move this file from src/include/non_abi */ 2 | 3 | #pragma once 4 | 5 | // clang-format off 6 | 7 | #define NVSHMEM_VENDOR_MAJOR_VERSION @PROJECT_VERSION_MAJOR@ 8 | #define NVSHMEM_VENDOR_MINOR_VERSION @PROJECT_VERSION_MINOR@ 9 | #define NVSHMEM_VENDOR_PATCH_VERSION @PROJECT_VERSION_PATCH@ 10 | #define NVSHMEM_VENDOR_PACKAGE_VERSION @PROJECT_VERSION_TWEAK@ 11 | #define NVSHMEM_TRANSPORT_PLUGIN_MAJOR_VERSION @TRANSPORT_VERSION_MAJOR@ 12 | #define NVSHMEM_TRANSPORT_PLUGIN_MINOR_VERSION @TRANSPORT_VERSION_MINOR@ 13 | #define NVSHMEM_TRANSPORT_PLUGIN_PATCH_VERSION @TRANSPORT_VERSION_PATCH@ 14 | #define NVSHMEM_BOOTSTRAP_PLUGIN_MAJOR_VERSION @BOOTSTRAP_VERSION_MAJOR@ 15 | #define 
NVSHMEM_BOOTSTRAP_PLUGIN_MINOR_VERSION @BOOTSTRAP_VERSION_MINOR@ 16 | #define NVSHMEM_BOOTSTRAP_PLUGIN_PATCH_VERSION @BOOTSTRAP_VERSION_PATCH@ 17 | #define NVSHMEM_BUILD_VARS @INFO_BUILD_VARS@ 18 | 19 | // clang-format on 20 | -------------------------------------------------------------------------------- /examples/gemm_allreduce/nvshmemAlloc.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | class nvshmemAllocation { 11 | public: 12 | nvshmemAllocation() = default; 13 | ~nvshmemAllocation() { dealloc(); } 14 | 15 | void reset(size_t capacity) { 16 | dealloc(); 17 | alloc(capacity * sizeof(T)); 18 | _capacity = capacity; 19 | } 20 | 21 | T* get() { return _data; } 22 | size_t size() { return _capacity; } 23 | void free() { dealloc(); } 24 | 25 | private: 26 | void dealloc() { 27 | if (_capacity) { 28 | nvshmem_free((void*)_data); 29 | } 30 | _capacity = 0; 31 | } 32 | 33 | void alloc(size_t size) { 34 | _data = (T*)nvshmem_malloc(size); 35 | assert(_data); 36 | } 37 | 38 | T* _data = NULL; 39 | size_t _capacity = 0; 40 | }; 41 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/RFE.yaml: -------------------------------------------------------------------------------- 1 | name: NVSHMEM request for enhancement 2 | description: Request for enhancement 3 | title: "[RFE]: " 4 | labels: ["enhancement"] 5 | body: 6 | - type: markdown 7 | attributes: 8 | value: | 9 | 10 | Thanks for your feedback! Before reporting a new RFE you could quickly check if this already exists in our [existing requests](https://github.com/NVIDIA/nvshmem/issues?q=is%3Aissue%20state%3Aopen%20label%3Aenhancement). 
11 | 12 | --- 13 | - type: textarea 14 | id: rfe-description 15 | attributes: 16 | label: Please provide the below details to ensure we understand your needs 17 | description: | 18 | * What is the goal of this request? 19 | * Who will benefit from this feature? 20 | * Is this request for a specific GPU architecture or network infrastructure? 21 | * How will this feature improve current workflows or processes? 22 | * What is the priority level of this request? -------------------------------------------------------------------------------- /test/host/init/static_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype_node, npes_node; 14 | int dev_count; 15 | 16 | nvshmem_init(); 17 | 18 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 19 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 20 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 21 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 22 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 23 | 24 | #ifdef _NVSHMEM_DEBUG 25 | int mype = nvshmem_my_pe(); 26 | int npes = nvshmem_n_pes(); 27 | #endif 28 | DEBUG_PRINT("[%d of %d] hello shmem world! \n", mype, npes); 29 | 30 | nvshmem_barrier_all(); 31 | 32 | nvshmem_finalize(); 33 | 34 | return 0; 35 | } 36 | -------------------------------------------------------------------------------- /test/host/init/global_exit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype, status = 0; 14 | 15 | init_wrapper(&c, &v); 16 | 17 | mype = nvshmem_my_pe(); 18 | #ifdef _NVSHMEM_DEBUG 19 | int npes = nvshmem_n_pes(); 20 | #endif 21 | 22 | DEBUG_PRINT("[%d of %d] hello world! \n", mype, npes); 23 | 24 | nvshmem_barrier_all(); 25 | 26 | if (mype == 0) { 27 | nvshmem_global_exit(0); 28 | /* Note, this should be unreachable. return a unique error code if we reach here. */ 29 | status = 2; 30 | } else { 31 | sleep(60); 32 | fprintf(stderr, "Was able to get to the end of the test.\n"); 33 | finalize_wrapper(); 34 | return 1; 35 | } 36 | 37 | return status; 38 | } 39 | -------------------------------------------------------------------------------- /perftest/perftest-p2p-pcie.list: -------------------------------------------------------------------------------- 1 | /device/pt-to-pt/shmem_p_latency 2 | /device/pt-to-pt/shmem_p_bw 3 | /device/pt-to-pt/shmem_g_latency 4 | /device/pt-to-pt/shmem_g_bw 5 | /device/pt-to-pt/shmem_st_bw 6 | /device/pt-to-pt/shmem_p_ping_pong_latency 7 | /device/pt-to-pt/shmem_put_latency 8 | /device/pt-to-pt/shmem_put_ping_pong_latency 9 | /device/pt-to-pt/shmem_signal_ping_pong_latency 10 | /device/pt-to-pt/shmem_put_bw 11 | /device/pt-to-pt/shmem_get_bw 12 | /device/coll/barrier_latency 13 | /device/coll/bcast_latency 14 | /device/coll/fcollect_latency 15 | /device/coll/alltoall_latency 16 | /device/coll/reduction_latency 17 | /device/coll/sync_latency 18 | /host/pt-to-pt/bw 19 | /host/pt-to-pt/latency 20 | /host/pt-to-pt/stream_latency 21 | /host/coll/barrier_all_on_stream 22 | /host/coll/barrier_on_stream 23 | /host/coll/sync_all_on_stream 24 | /host/coll/sync_on_stream 25 | /host/coll/alltoall_on_stream 26 | /host/coll/broadcast_on_stream 27 | /host/coll/fcollect_on_stream 28 | /host/coll/reduction_on_stream 29 | 
-------------------------------------------------------------------------------- /src/host/util/cs.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | 10 | #include "non_abi/nvshmemx_error.h" 11 | 12 | static pthread_mutex_t global_mutex; 13 | 14 | void nvshmemu_thread_cs_init() { 15 | int status = pthread_mutex_init(&global_mutex, NULL); 16 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex init failed \n"); 17 | } 18 | 19 | void nvshmemu_thread_cs_finalize() { 20 | int status = pthread_mutex_destroy(&global_mutex); 21 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex destroy failed \n"); 22 | } 23 | 24 | void nvshmemu_thread_cs_enter() { 25 | int status = pthread_mutex_lock(&global_mutex); 26 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex lock failed \n"); 27 | } 28 | 29 | void nvshmemu_thread_cs_exit() { 30 | int status = pthread_mutex_unlock(&global_mutex); 31 | NVSHMEMI_NZ_SYSCHECK_EXIT(status, "mutex unlock failed \n"); 32 | } 33 | -------------------------------------------------------------------------------- /perftest/perftest-ib.list: -------------------------------------------------------------------------------- 1 | /device/pt-to-pt/shmem_atomic_ping_pong_latency 2 | /device/pt-to-pt/shmem_put_latency 3 | /device/pt-to-pt/shmem_put_ping_pong_latency 4 | /device/pt-to-pt/shmem_put_atomic_ping_pong_latency 5 | /device/pt-to-pt/shmem_put_signal_ping_pong_latency 6 | /device/pt-to-pt/shmem_signal_ping_pong_latency 7 | /device/pt-to-pt/shmem_put_bw 8 | /device/pt-to-pt/shmem_get_bw 9 | /device/pt-to-pt/shmem_g_bw 10 | /device/pt-to-pt/shmem_g_latency 11 | /device/coll/barrier_latency 12 | /device/coll/bcast_latency 13 | /device/coll/fcollect_latency 14 | /device/coll/alltoall_latency 15 | /device/coll/reduction_latency 16 | /device/coll/sync_latency 17 | /host/pt-to-pt/bw 18 | 
/host/pt-to-pt/latency 19 | /host/pt-to-pt/stream_latency 20 | /host/coll/barrier_all_on_stream 21 | /host/coll/barrier_on_stream 22 | /host/coll/sync_all_on_stream 23 | /host/coll/sync_on_stream 24 | /host/coll/alltoall_on_stream 25 | /host/coll/broadcast_on_stream 26 | /host/coll/fcollect_on_stream 27 | /host/coll/reduction_on_stream 28 | /host/init/malloc 29 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
#!/bin/bash
# Discover every distinct CPython >= 3.9 interpreter reachable via $PATH.
# For each unique minor version, print exactly one line:  <major.minor>|<path>
# (first match on $PATH wins for a given version).
declare -A seen
IFS=':' read -ra paths <<< "$PATH"
for dir in "${paths[@]}"; do
    for p in "$dir"/python3.[0-9]*; do
        [[ -e "$p" ]] || continue # Skip if no match or broken glob
        # Ignore names that are not plain "python3.X[.Y...]" (e.g. python3.9-config)
        if [[ $(basename "$p") =~ ^python3\.[0-9]+(\.[0-9]+)*$ ]]; then
            :
        else
            continue
        fi
        if [ -x "$p" ]; then
            v=$(
                "$p" -c 'import sys; print("%d.%d" % (sys.version_info[0], sys.version_info[1]))' 2>/dev/null
            )
            # Dot escaped so "." is literal; the anchor pins the major version to 3,
            # so only the minor version still needs a range check below.
            if [[ $v =~ ^3\.[0-9]+$ ]]; then
                minor=$(echo "$v" | cut -d. -f2)
                if (( minor >= 9 )); then
                    if [[ -z ${seen[$v]} ]]; then
                        seen[$v]=1
                        echo "$v|$p"
                    fi
                fi
            fi
        fi
    done
done
/* Run the ring put/verify kernel once and report whether the received value
 * was wrong. Returns 0 on success, 1 on data mismatch. */
int simplelib1_dowork() {
    int *array = (int *)nvshmem_calloc(1, sizeof(int));
    int num_errors = 0;
    /* Reset the device-side error flag before launching: the __device__
     * global is zero-initialized only once at module load, so without this
     * a failure recorded by an earlier call would leak into this run. */
    cudaMemcpyToSymbol(num_errors_d, &num_errors, sizeof(int));
    simplelib1_nvshmem_kernel<<<1, 1>>>(array);
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int));
    nvshmem_free(array);
    return num_errors;
}
/* Run the ring put/verify kernel once and report whether the received value
 * was wrong. Returns 0 on success, 1 on data mismatch. */
int simplelib2_dowork() {
    int *array = (int *)nvshmem_calloc(1, sizeof(int));
    int num_errors = 0;
    /* Reset the device-side error flag before launching: the __device__
     * global is zero-initialized only once at module load, so without this
     * a failure recorded by an earlier call would leak into this run. */
    cudaMemcpyToSymbol(num_errors_d, &num_errors, sizeof(int));
    simplelib2_nvshmem_kernel<<<1, 1>>>(array);
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int));
    nvshmem_free(array);
    return num_errors;
}
/*
 * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
 *
 * See License.txt for license information
 */

/* Guard renamed from _IBRC_H (copy-pasted from the ibrc transport header):
 * sharing a guard with ibrc.h would silently skip whichever header is
 * included second in a translation unit that uses both transports. */
#ifndef _IBDEVX_H
#define _IBDEVX_H

#define NVSHMEMT_IBDEVX_DBSIZE 8
/* 64 bytes per WQE BB shift = log2(64) for easy multiplication. */
#define NVSHMEMT_IBDEVX_WQE_BB_SHIFT 6

/* Atomic mode for our transport */
#define NVSHMEMT_IBDEVX_MLX5_QPC_ATOMIC_MODE_UP_TO_64B 0x3

#define NVSHMEMT_IBDEVX_MLX5_SEND_WQE_DS 0x10

/* Indicates to DEVX that we should be using an SRQ. */
#define NVSHMEMT_IBDEVX_SRQ_TYPE_VALUE 0x1

/* Enables remote read/write/atomic access for a QP */
#define NVSHMEMT_IBDEVX_INIT2R2R_PARAM_MASK 0xE

/* Important byte masks. Names describe the byte(s) an AND clears:
 * MASK_UPPER_BYTE_32 clears the top byte, MASK_LOWER_3_BYTES_32 clears
 * the bottom three bytes. */
#define NVSHMEMT_IBDEVX_MASK_UPPER_BYTE_32 0x00FFFFFF
#define NVSHMEMT_IBDEVX_MASK_LOWER_3_BYTES_32 0xFF000000

/* OPMOD Constants for AMOs. */
#define NVSHMEMT_IBDEVX_4_BYTE_EXT_AMO_OPMOD 0x08000000
#define NVSHMEMT_IBDEVX_8_BYTE_EXT_AMO_OPMOD 0x09000000

#endif
/device/pt-to-pt/shmem_g_bw 7 | /device/pt-to-pt/shmem_st_bw 8 | /device/pt-to-pt/shmem_p_ping_pong_latency 9 | /device/pt-to-pt/shmem_put_latency 10 | /device/pt-to-pt/shmem_put_ping_pong_latency 11 | /device/pt-to-pt/shmem_signal_ping_pong_latency 12 | /device/pt-to-pt/shmem_put_bw 13 | /device/pt-to-pt/shmem_get_bw 14 | /device/coll/barrier_latency 15 | /device/coll/bcast_latency 16 | /device/coll/fcollect_latency 17 | /device/coll/alltoall_latency 18 | /device/coll/reduction_latency 19 | /device/coll/sync_latency 20 | /host/pt-to-pt/bw 21 | /host/pt-to-pt/latency 22 | /host/pt-to-pt/stream_latency 23 | /host/coll/barrier_all_on_stream 24 | /host/coll/barrier_on_stream 25 | /host/coll/sync_all_on_stream 26 | /host/coll/sync_on_stream 27 | /host/coll/alltoall_on_stream 28 | /host/coll/broadcast_on_stream 29 | /host/coll/fcollect_on_stream 30 | /host/coll/reduction_on_stream 31 | -------------------------------------------------------------------------------- /src/include/device/nvshmemx_collective_launch_apis.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #ifndef _NVSHMEMX_COLLECTIVE_LAUNCH_APIS_H_ 14 | #define _NVSHMEMX_COLLECTIVE_LAUNCH_APIS_H_ 15 | 16 | #include 17 | 18 | #if !defined __CUDACC_RTC__ 19 | int nvshmemx_collective_launch(const void *func, dim3 gridDims, dim3 blockDims, void **args, 20 | size_t sharedMem, cudaStream_t stream); 21 | int nvshmemx_collective_launch_query_gridsize(const void *func, dim3 blockDims, void **args, 22 | size_t sharedMem, int *gridsize); 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/modules/bootstrap/uid/ncclSocket/ncclsocket_debug.h: -------------------------------------------------------------------------------- 1 | /************************************************************************* 2 | * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | ************************************************************************/ 6 | 7 | #ifndef NCCL_SOCKET_DEBUG_H_ 8 | #define NCCL_SOCKET_DEBUG_H_ 9 | 10 | #include "bootstrap_util.h" // for BOOTSTRAP_DEBUG_PRINT, BOOTSTRAP_ERROR_P... 11 | 12 | extern thread_local int ncclDebugNoWarn; 13 | typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; 14 | typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; 15 | 16 | #define WARN(...) BOOTSTRAP_ERROR_PRINT(__VA_ARGS__) 17 | #define INFO(FLAGS, ...) BOOTSTRAP_DEBUG_PRINT(__VA_ARGS__) 18 | #define TRACE_CALL(...) BOOTSTRAP_DEBUG_PRINT(__VA_ARGS__) 19 | #define TRACE(...) 
Please see the following public links for information on building and working with NVSHMEM:
/*
 * Host-side stress test for nvshmem_quiet(): after a barrier, issue
 * nvshmem_quiet() num_iters times (default 100, overridable with -n).
 * Exits 0 unless initialization fails; correctness here is "does not hang".
 */
int main(int argc, char **argv) {
    int status = 0;
    int num_iters = NUM_ITERS;

    init_wrapper(&argc, &argv);

    /* Option parsing: -n <iterations>; -h (or any unknown flag) prints usage
     * and skips the quiet loop via the goto below. */
    while (1) {
        int c;
        c = getopt(argc, argv, "n:h");
        if (c == -1) break;

        switch (c) {
            case 'n':
                num_iters = strtol(optarg, NULL, 0);
                break;
            default:
            case 'h':
                printf("-n [No of iterations] \n");
                goto out;
        }
    }
    nvshmem_barrier_all();
    for (int i = 0; i < num_iters; i++) {
        nvshmem_quiet();
    }

out:
    /* Final barrier keeps all PEs together before collective finalize. */
    nvshmem_barrier_all();
    finalize_wrapper();

    return status;
}
All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "nvshmem.h" 11 | #include "nvshmemx.h" 12 | #include "utils.h" 13 | 14 | #define NUM_ITERS 100 15 | 16 | int main(int argc, char *argv[]) { 17 | int status = 0; 18 | int num_iters = NUM_ITERS; 19 | 20 | init_wrapper(&argc, &argv); 21 | 22 | while (1) { 23 | int c; 24 | c = getopt(argc, argv, "n:h"); 25 | if (c == -1) break; 26 | 27 | switch (c) { 28 | case 'n': 29 | num_iters = strtol(optarg, NULL, 0); 30 | break; 31 | default: 32 | case 'h': 33 | printf("-n [No of iterations] \n"); 34 | goto out; 35 | } 36 | } 37 | 38 | nvshmem_barrier_all(); 39 | for (int i = 0; i < num_iters; i++) { 40 | nvshmem_fence(); 41 | } 42 | 43 | out: 44 | nvshmem_barrier_all(); 45 | finalize_wrapper(); 46 | 47 | return status; 48 | } 49 | -------------------------------------------------------------------------------- /src/include/nvshmem.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #ifndef _NVSHMEM_H_ 14 | #define _NVSHMEM_H_ 15 | 16 | #include "non_abi/nvshmem_build_options.h" 17 | /* NVRTC only compiles device code. 
Leave out host headers */ 18 | #if !defined __CUDACC_RTC__ && !defined __clang_llvm_bitcode_lib__ && \ 19 | !defined __NVSHMEM_NUMBA_SUPPORT__ 20 | #include "nvshmem_host.h" 21 | #endif 22 | /* NVSHMEM4PY hostlib can't parse device headers */ 23 | #if !defined NVSHMEM_HOSTLIB_ONLY 24 | #include "device/nvshmem_defines.h" 25 | #include "device/nvshmem_coll_defines.cuh" 26 | #include "device/nvshmemx_defines.h" 27 | #include "device/nvshmemx_coll_defines.cuh" 28 | #endif 29 | #endif 30 | -------------------------------------------------------------------------------- /src/include/nvshmemx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "non_abi/nvshmem_build_options.h" 14 | 15 | #ifndef _NVSHMEMX_H_ 16 | #define _NVSHMEMX_H_ 17 | 18 | /* NVRTC only compiles device code. 
Leave out host headers */ 19 | #if !defined __CUDACC_RTC__ && !defined __clang_llvm_bitcode_lib__ && \ 20 | !defined __NVSHMEM_NUMBA_SUPPORT__ 21 | #include "host/nvshmemx_api.h" 22 | #include "device/tile/nvshmemx_tile_api.hpp" 23 | #include "device/nvshmemx_collective_launch_apis.h" 24 | #endif 25 | #if !defined NVSHMEM_HOSTLIB_ONLY 26 | #include "device/nvshmemx_defines.h" 27 | #include "device/nvshmemx_coll_defines.cuh" 28 | #include "device/tile/nvshmemx_tile_api_defines.cuh" 29 | #endif 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NVSHMEM Overview 2 | **************** 3 | 4 | NVSHMEM™ is a parallel programming interface based on OpenSHMEM that provides efficient and 5 | scalable communication for NVIDIA GPU clusters. NVSHMEM creates a global address space for 6 | data that spans the memory of multiple GPUs and can be accessed with fine-grained 7 | GPU-initiated operations, CPU-initiated operations, and operations on CUDA® streams. 
/*
 * Device-side test of nvshmem_global_exit(): PE 0 launches a kernel that
 * calls nvshmem_global_exit(0), which must terminate ALL PEs. Any PE that
 * survives long enough to print and return indicates a failure.
 */
int main(int c, char *v[]) {
    int mype, status = 0;

    init_wrapper(&c, &v);

    /* Bug fix: mype was previously declared inside the _NVSHMEM_DEBUG block
     * but is read unconditionally below (if (mype == 0)), so the non-debug
     * build did not compile. Only npes is debug-only. */
    mype = nvshmem_my_pe();
#ifdef _NVSHMEM_DEBUG
    int npes = nvshmem_n_pes();
#endif

    DEBUG_PRINT("[%d of %d] hello world! \n", mype, npes);

    nvshmem_barrier_all();

    if (mype == 0) {
        test_kernel<<<1, 1, 0>>>();
        CUDA_CHECK(cudaDeviceSynchronize());
        /* Note, this should be unreachable. return a unique error code if we reach here. */
        status = 2;
    } else {
        sleep(60); /* This is added to allow the PE0's global_exit to abort the program before PE1+
                      finalize themselves */
        fprintf(stderr, "Was able to get to the end of the test.\n");
        finalize_wrapper();
        return 1;
    }

    return status;
}
def test_get_version():
    """Launch a 1x1 kernel that queries the NVSHMEM vendor version and name
    through the numba device bindings, then print both on the host.

    The kernel writes three int32 version components (major/minor/patch order
    assumed from the print below -- TODO confirm against the binding docs)
    into ``arr`` and a NUL-padded vendor name into ``name``.
    """
    ffi = cffi.FFI()

    @cuda.jit(lto=True)
    def kernel(arr, name):
        # Three pointers into consecutive elements of arr: one per version
        # component expected by vendor_get_version_info.
        ptr = ffi.from_buffer(arr)
        ptr2 = ffi.from_buffer(arr[1:])
        ptr3 = ffi.from_buffer(arr[2:])
        vendor_get_version_info(ptr, ptr2, ptr3)

        nameptr = ffi.from_buffer(name)
        info_get_name(nameptr)

    arr = np.zeros(3, dtype=np.int32)
    # assumes 100 bytes is enough for the vendor name -- TODO confirm
    name = np.zeros(100, dtype=np.int8)

    kernel[1, 1](arr, name)
    print(f"ver: {arr[0]}.{arr[1]}.{arr[2]}")
    # NOTE(review): unwritten trailing bytes are zeros, so this prints NUL
    # characters after the name rather than stopping at the terminator.
    print("".join(chr(i) for i in name))
def test_ring(dev: Device):
    """One-hop ring exchange: each PE puts its rank into the next PE's
    symmetric buffer, then every PE prints the rank it received.

    Args:
        dev: the active cuda.core Device, used only to synchronize after the
            barrier before reading the result.
    """
    ffi = cffi.FFI()

    @cuda.jit(lto=True)
    def app_kernel(dest):
        ptr = ffi.from_buffer(dest)
        mype = my_pe()
        npes = n_pes()
        # Neighbor in the ring; wraps the last PE back to PE 0.
        peer = int32((mype + 1) % npes)

        int_p(ptr, mype, peer)

    # Symmetric one-element int32 buffer allocated on every PE.
    dest = nvshmem.core.array((1,), dtype="int32")

    app_kernel[1, 1, 0](dest)

    # Barrier so every PE's put has landed before anyone reads dest.
    barrier_all()
    dev.sync()

    print(f"{h_my_pe()}: received message {dest[0]}")

    nvshmem.core.free_array(dest)
    nvshmem.core.finalize()
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _TRANSPORT_MLX5_COMMON_H 8 | #define _TRANSPORT_MLX5_COMMON_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | 13 | bool nvshmemt_ib_common_query_mlx5_caps(struct ibv_context *context); 14 | int nvshmemt_ib_common_query_endianness_conversion_size(uint32_t *endianness_mode, 15 | struct ibv_context *context); 16 | int nvshmemt_ib_common_check_nic_ext_atomic_support(struct ibv_context *context); 17 | 18 | /* These values are not defined on all systems. 19 | * However, they can be traced back to a kernel enum with 20 | * these values. 21 | */ 22 | #ifndef MLX5DV_UAR_ALLOC_TYPE_BF 23 | #define MLX5DV_UAR_ALLOC_TYPE_BF 0x0 24 | #endif 25 | 26 | #ifndef MLX5DV_UAR_ALLOC_TYPE_NC 27 | #define MLX5DV_UAR_ALLOC_TYPE_NC 0x1 28 | #endif 29 | 30 | enum { 31 | MLX5_ATOMIC_CAP_OP_SUPPORT_CAS = 0x1, 32 | MLX5_ATOMIC_CAP_OP_SUPPORT_FA = 0x2, 33 | MLX5_ATOMIC_CAP_OP_SUPPORT_MASKED_CAS = 0x4, 34 | MLX5_ATOMIC_CAP_OP_SUPPORT_MASKED_FA = 0x8, 35 | }; 36 | 37 | enum { 38 | MLX5_ATOMIC_CAP_SIZE_SUPPORT_4B = 0x4, 39 | MLX5_ATOMIC_CAP_SIZE_SUPPORT_8B = 0x8, 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /src/include/internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 
#!/bin/bash

# This script downloads hydra from a static link
# and installs it at the user-specified location.

set -e

if [ "$#" -ne 2 ]; then
    echo "Usage: ./install_hydra.sh src_dir builddir"
    echo " src_dir: location where hydra source will be downloaded"
    echo " builddir: installation directory"
    exit 1
fi

srcdir=$1
builddir=$2

# Idempotence: nothing to do if a previous run already installed the binary.
if test -f "$builddir/bin/nvshmrun.hydra"; then
    echo "hydra already installed"
    exit 0
fi

mkdir -p "$srcdir"
cd "$srcdir"
# Download and unpack hydra-4.3.2 source (extract straight from the .gz so
# no intermediate .tar file is left behind)
wget https://www.mpich.org/static/downloads/4.3.2/hydra-4.3.2.tar.gz
tar -xzf hydra-4.3.2.tar.gz

# Install hydra
cd hydra-4.3.2
# Touch generated autotools files so the build does not try to regenerate them.
touch aclocal.m4
touch Makefile.am
touch Makefile.in
touch ./modules/mpl/aclocal.m4
touch ./modules/mpl/Makefile.am
touch ./modules/mpl/Makefile.in

./configure --prefix="$builddir" --enable-cuda=no --enable-nvml=no
make
make install
rm -f -- "$builddir"/include/mpl*
# Rename the launcher to nvshmrun.hydra and remove MPI-flavored entry points.
mv "$builddir/bin/mpiexec.hydra" "$builddir/bin/nvshmrun.hydra"
# create a soft link with name nvshmrun
ln -s nvshmrun.hydra "$builddir/bin/nvshmrun"
rm -f "$builddir/bin/mpiexec" "$builddir/bin/mpirun"

echo "Hydra binaries have been installed in $builddir/bin"
#cmakedefine NVSHMEM_NVTX 16 | #cmakedefine NVSHMEM_PMIX_SUPPORT 17 | #cmakedefine NVSHMEM_SHMEM_SUPPORT 18 | #cmakedefine NVSHMEM_TIMEOUT_DEVICE_POLLING 19 | #cmakedefine NVSHMEM_UCX_SUPPORT 20 | #cmakedefine NVSHMEM_USE_DLMALLOC 21 | #cmakedefine NVSHMEM_USE_NCCL 22 | #cmakedefine NVSHMEM_USE_GDRCOPY 23 | #cmakedefine NVSHMEM_USE_MLX5DV 24 | #cmakedefine NVSHMEM_VERBOSE 25 | #cmakedefine NVSHMEM_BUILD_TESTS 26 | #cmakedefine NVSHMEM_BUILD_EXAMPLES 27 | #cmakedefine NVSHMEM_IBGDA_SUPPORT 28 | #cmakedefine NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY 29 | #cmakedefine NVSHMEM_ENABLE_ALL_DEVICE_INLINING 30 | #cmakedefine NVSHMEM_HOSTLIB_ONLY 31 | 32 | #if defined NVSHMEM_HOSTLIB_ONLY 33 | #undef NVSHMEM_IBGDA_SUPPORT 34 | #undef NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY 35 | #define NVSHMEM_ENABLE_ALL_DEVICE_INLINING 36 | #endif 37 | 38 | #if defined __clang_llvm_bitcode_lib__ 39 | #define NVSHMEM_ENABLE_ALL_DEVICE_INLINING 40 | #endif -------------------------------------------------------------------------------- /perftest/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(nvshmem_perftest_helper STATIC utils.cu) 2 | 3 | set_target_properties(nvshmem_perftest_helper PROPERTIES 4 | POSITION_INDEPENDENT_CODE ON 5 | CXX_STANDARD_REQUIRED ON 6 | CUDA_STANDARD_REQUIRED ON 7 | CXX_STANDARD ${PERFTEST_CXX_STANDARD} 8 | CUDA_STANDARD ${PERFTEST_CXX_STANDARD} 9 | CUDA_SEPARABLE_COMPILATION ON 10 | ) 11 | 12 | target_include_directories(nvshmem_perftest_helper 13 | PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 14 | target_link_libraries(nvshmem_perftest_helper PUBLIC CUDA::cudart CUDA::cuda_driver) 15 | target_link_libraries(nvshmem_perftest_helper PUBLIC nvshmem_host nvshmem_device) 16 | 17 | if(NVSHMEM_SHMEM_SUPPORT) 18 | target_compile_definitions(nvshmem_perftest_helper PUBLIC NVSHMEMTEST_SHMEM_SUPPORT) 19 | target_include_directories(nvshmem_perftest_helper PUBLIC SHMEM_INCLUDE) 20 | endif() 21 | 22 | if(NVSHMEM_MPI_SUPPORT) 23 | 
target_compile_definitions(nvshmem_perftest_helper PUBLIC NVSHMEMTEST_MPI_SUPPORT) 24 | target_include_directories(nvshmem_perftest_helper PUBLIC $) 25 | endif() 26 | 27 | target_compile_options(nvshmem_perftest_helper 28 | PRIVATE $<$:-O0;-g;> 29 | $<$,$>:-Xptxas -v> 30 | $<$,$>:-O0;-g;-G> 31 | $<$,$>:-t4> 32 | ) 33 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Contributing to NVSHMEM tests 2 | 3 | ## What is an unit test ? 4 | A unit-test under `test/unit` is limited to testing 1 top-level `nvshmem` internal API and mocking rest of the code/framework to bootstrap/teardown the aforementioned API to run either on bare-metal env or in a namespaced env (docker, VM, etc) with installed dependencies. Typically, these are rarely to never ran on GPU/NIC device. The test could include or depend directly on any nvshmem internal header files. 5 | 6 | ## What is a functional test ? 7 | A functional-test under `test/functional` is limited to testing N top-level `nvshmem` external APIs of a given library. Typically, this should rarely to never demand mocking rest of the code/framework to bootstrap/teardown the aforementioned APIs and would run on a bare-metal env on one or multiple CPU/GPU/NIC devices (single or multi-node). The test must not include or depend directly on any nvshmem internal header file or sources. 8 | 9 | ## What is an integration test ? 10 | A integration-test under `test/integration` is limited to testing N x M top-level `nvshmem` and other consumer libraries API/interfaces. Typically, this should rarely to never demand mocking rest of the code in its neighbourhood and would run on a bare-metal env on one or multiple CPU/GPU/NIC devices (single or multi-node). Similar to functional test, it must not include or depend directly on any nvshmem internal header file or sources. 
11 | -------------------------------------------------------------------------------- /src/host/stream/coll/rdxn/reduce_team.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "reduce_common.cuh" 8 | #include "internal/non_abi/nvshmemi_h_to_d_coll_defs.cuh" 9 | 10 | /* This is a special kernel that is launched only with 11 | one thread and is used during team creation in nvshmemi_team_plit_strided fn */ 12 | template 13 | __global__ void nvshmemi_reduce_kernel(int start, int stride, int size, TYPE *dst, 14 | const TYPE *source, size_t nreduce, TYPE *pWrk, 15 | volatile long *pSync, volatile long *sync_counter) { 16 | #ifdef __CUDA_ARCH__ 17 | gpu_rdxn_on_demand_2(start, stride, size, dst, source, nreduce, pWrk, pSync, 18 | sync_counter); 19 | #endif 20 | } 21 | 22 | template __global__ void nvshmemi_reduce_kernel( 23 | int, int, int, unsigned char *, unsigned char const *, unsigned long, unsigned char *, 24 | long volatile *, long volatile *); 25 | template __global__ void nvshmemi_reduce_kernel(int, int, int, int *, int const *, 26 | unsigned long, int *, 27 | long volatile *, long volatile *); 28 | -------------------------------------------------------------------------------- /test/host/init/mpi_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | int main(int c, char *v[]) { 14 | int rank, nranks; 15 | int mype_node, npes_node; 16 | MPI_Comm mpi_comm; 17 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 18 | int dev_count; 19 | MPI_Init(&c, &v); 20 | 21 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 22 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 23 | 24 | DEBUG_PRINT("MPI: [%d of %d] hello MPI world! \n", rank, nranks); 25 | 26 | mpi_comm = MPI_COMM_WORLD; 27 | attr.mpi_comm = &mpi_comm; 28 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr); 29 | 30 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 31 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 32 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 33 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 34 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 35 | 36 | #ifdef _NVSHMEM_DEBUG 37 | int mype, npes; 38 | mype = nvshmem_my_pe(); 39 | npes = nvshmem_n_pes(); 40 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 41 | #endif 42 | 43 | MPI_Barrier(MPI_COMM_WORLD); 44 | 45 | nvshmem_finalize(); 46 | 47 | MPI_Finalize(); 48 | 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /test/host/init/nvshmemx_init_status.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include "nvshmem.h" 9 | #include "nvshmemx.h" 10 | #include "utils.h" 11 | 12 | int main(int c, char *v[]) { 13 | int mype_node, npes_node; 14 | int dev_count; 15 | 16 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_NOT_INITIALIZED); 17 | nvshmem_init(); 18 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_IS_BOOTSTRAPPED); 19 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 20 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 21 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 22 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 23 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 24 | 25 | #ifdef _NVSHMEM_DEBUG 26 | int mype = nvshmem_my_pe(); 27 | int npes = nvshmem_n_pes(); 28 | #endif 29 | DEBUG_PRINT("[%d of %d] hello shmem world! \n", mype, npes); 30 | 31 | nvshmem_barrier_all(); 32 | assert(nvshmemx_init_status() >= NVSHMEM_STATUS_IS_INITIALIZED); 33 | 34 | if (npes_per_gpu > 1) assert(nvshmemx_init_status() >= NVSHMEM_STATUS_LIMITED_MPG); 35 | 36 | nvshmem_finalize(); 37 | assert(nvshmemx_init_status() == NVSHMEM_STATUS_IS_BOOTSTRAPPED); 38 | 39 | nvshmem_init(); 40 | assert(nvshmemx_init_status() >= NVSHMEM_STATUS_IS_INITIALIZED); 41 | nvshmem_finalize(); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /nvshmem4py/build_assets/numbast/templates/config_nvshmem.yml.j2: -------------------------------------------------------------------------------- 1 | Name: NVSHMEM Device Bindings 2 | Version: {{CONFIG_VERSION}} 3 | Entry Point: {{ENTRY_POINT_PATH}} 4 | # The list of files, from which the APIs is allow-listed. 
5 | File List: 6 | - {{NVSHMEM_HOME}}/src/include/device/nvshmem_coll_defines.cuh 7 | - {{NVSHMEM_HOME}}/src/include/device/nvshmem_defines.h 8 | - {{NVSHMEM_HOME}}/src/include/device/nvshmemx_coll_defines.cuh 9 | - {{NVSHMEM_HOME}}/src/include/device/nvshmemx_defines.h 10 | 11 | GPU Arch: 12 | # sm_70 is used to parse the declarations of nvshmem API. When code is jitted 13 | # at runtime, Numba will use the runtime CC to generate code. 14 | - sm_70 15 | 16 | Clang Include Paths: 17 | - {{NVSHMEM_HOME}}/src/include 18 | {{CUDA13_CCCL_INCLUDE_PATH}} 19 | 20 | Macro-expanded Function Prefixes: 21 | - nvshmem_ 22 | - nvshmemx_ 23 | 24 | Predefined Macros: 25 | - __NVSHMEM_NUMBA_SUPPORT__ 26 | 27 | Output Name: {{OUTPUT_NAME}} 28 | 29 | Cooperative Launch Required Functions Regex: 30 | {{COOPERATIVE_LAUNCH_REQUIRED_FUNCTIONS}} 31 | 32 | API Prefix Removal: 33 | Function: 34 | - "nvshmem_" 35 | - "nvshmemx_" 36 | 37 | Additional Import: 38 | - "nvshmem.bindings" 39 | 40 | Module Callbacks: 41 | setup: "lambda x: nvshmem.bindings.cumodule_init(int(x))" 42 | teardown: "lambda x: nvshmem.bindings.cumodule_finalize(int(x))" 43 | 44 | Shim Include Override: "\"entry_point.h\"" 45 | 46 | Exclude: 47 | Function: 48 | - "nvshmem_ptr" 49 | - "nvshmem_mc_ptr" -------------------------------------------------------------------------------- /test/host/init/shmem_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #ifdef NVSHMEMTEST_SHMEM_SUPPORT 12 | #include "shmem.h" 13 | #include "shmemx.h" 14 | #endif 15 | #include "utils.h" 16 | 17 | int main(int c, char *v[]) { 18 | int nv_npes_node, nv_mype_node; 19 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 20 | int dev_count; 21 | shmem_init(); 22 | DEBUG_PRINT("shmem_init done\n"); 23 | 24 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_SHMEM, &attr); 25 | #ifdef _NVSHMEM_DEBUG 26 | int nv_mype, nv_npes; 27 | nv_mype = nvshmem_my_pe(); 28 | nv_npes = nvshmem_n_pes(); 29 | DEBUG_PRINT("NVSHMEM: [%d of %d] hello nvshmem world! \n", nv_mype, nv_npes); 30 | #endif 31 | nv_mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 32 | nv_npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 33 | DEBUG_PRINT("NVSHMEM TEAM NODE: [%d of %d] hello nvshmem team node world! \n", nv_mype_node, 34 | nv_npes_node); 35 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 36 | int npes_per_gpu = (nv_npes_node + dev_count - 1) / dev_count; 37 | CUDA_CHECK(cudaSetDevice(nv_mype_node / npes_per_gpu)); 38 | 39 | nvshmem_barrier_all(); 40 | shmem_barrier_all(); 41 | 42 | nvshmem_finalize(); 43 | 44 | shmem_finalize(); 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/core/_internal_tracking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # NVIDIA CORPORATION and its licensors retain all intellectual property 4 | # and proprietary rights in and to this software, related documentation 5 | # and any modifications thereto. Any use, reproduction, disclosure or 6 | # distribution of this software and related documentation without an express 7 | # license agreement from NVIDIA CORPORATION is strictly prohibited. 
8 | # 9 | # See License.txt for license information 10 | 11 | """ 12 | Internal tracking for NVShmem 13 | 14 | This file contains things like buffer management, status, etc. 15 | """ 16 | from enum import IntEnum 17 | 18 | """ 19 | Map of Device IDs from cuda.core to MemoryResource (NvshmemResource) objects 20 | Used to avoid re-creating NvshmemResources every time someone calls nvshmem.core.allocate() 21 | """ 22 | _mr_references = {} 23 | 24 | """ 25 | class for Internal Init Status 26 | """ 27 | class InternalInitStatus(IntEnum): 28 | UNINITIALIZED = 0 29 | INITIALIZED = 1 30 | DE_INITIALIZED = 2 # Keeps bootstrap 31 | 32 | """ 33 | Set to True after initializing. Used for safety checks before functions 34 | """ 35 | _is_initialized = {"status": InternalInitStatus.UNINITIALIZED} 36 | 37 | """ 38 | Each NVSHMEM process needs to be assocaited with a device. We cache that here. 39 | """ 40 | _cached_device = {"device": None} 41 | 42 | """ 43 | Debug mode is used to avoid redundant calls to Device() 44 | """ 45 | _debug_mode = False 46 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_collective.py: -------------------------------------------------------------------------------- 1 | import cffi 2 | import argparse 3 | 4 | from cuda.core.experimental import Device 5 | 6 | from numba import cuda, int32 7 | 8 | import nvshmem 9 | from nvshmem.bindings import my_pe, n_pes 10 | from nvshmem.bindings.device.numba import int_p, barrier_all 11 | 12 | from utils import uid_init, mpi_init 13 | 14 | def test_collective(dev: Device): 15 | 16 | ffi = cffi.FFI() 17 | 18 | 19 | @cuda.jit(lto=True) 20 | def reduce_ring(dest, mype, npes): 21 | target = ffi.from_buffer(dest) 22 | peer = int32((mype + 1) % npes) 23 | lvalue = mype 24 | 25 | for i in range(npes): 26 | int_p(target, lvalue, peer) 27 | barrier_all() 28 | lvalue = target[0] + mype 29 | barrier_all() 30 | 31 | mype = my_pe() 32 | npes = n_pes() 33 | 34 | dest = 
nvshmem.core.array((1,), dtype="int32") 35 | 36 | reduce_ring[1, 1, 0](dest, mype, npes) 37 | 38 | dev.sync() 39 | 40 | print(f"{my_pe()}: received message {dest[0]}") 41 | 42 | nvshmem.core.free_array(dest) 43 | nvshmem.core.finalize() 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 49 | args = parser.parse_args() 50 | if args.init_type == "uid": 51 | dev = uid_init() 52 | elif args.init_type == "mpi": 53 | dev = mpi_init() 54 | 55 | test_collective(dev) -------------------------------------------------------------------------------- /test/host/init/init_loop.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #ifdef NVSHMEMTEST_MPI_SUPPORT 6 | #include 7 | #endif 8 | #include 9 | #include 10 | #include "utils.h" 11 | 12 | #define INIT_DEFAULT_ITERS 150 13 | 14 | int main(int argc, char *argv[]) { 15 | int devices, num_iters = 0; 16 | const char *test_num_iter = getenv("NVSHMEMTEST_INIT_NUM_ITERS"); 17 | read_args(argc, argv); 18 | init_wrapper(&argc, &argv); 19 | nvshmem_barrier_all(); 20 | nvshmem_finalize(); 21 | 22 | if (test_num_iter) { 23 | num_iters = atoi(test_num_iter); 24 | } 25 | 26 | if (num_iters <= 0) { 27 | num_iters = INIT_DEFAULT_ITERS; 28 | } 29 | 30 | for (int i = 0; i < num_iters; i++) { 31 | printf("Step %d\n", i); 32 | nvshmem_init(); 33 | int *destination = NULL; 34 | if (use_mmap) { 35 | destination = (int *)allocate_mmap_buffer(sizeof(int), _mem_handle_type, use_egm); 36 | free_mmap_buffer(destination); 37 | } else { 38 | destination = (int *)nvshmem_malloc(sizeof(int)); 39 | nvshmem_free(destination); 40 | } 41 | nvshmem_finalize(); 42 | printf("Step %d done\n", i); 43 | } 44 | nvshmem_init(); /* finalize_wrapper will call nvshmem_finalize(); 45 | this is the corresponding init for it */ 46 | 
finalize_wrapper(); /* should finalize bootstrap stuff as well */
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | __global__ void simple_shift(int *destination) { 19 | int mype = nvshmem_my_pe(); 20 | int npes = nvshmem_n_pes(); 21 | int peer = (mype + 1) % npes; 22 | 23 | nvshmem_int_p(destination, mype, peer); 24 | } 25 | 26 | int main(void) { 27 | int mype_node, msg; 28 | cudaStream_t stream; 29 | 30 | nvshmem_init(); 31 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 32 | cudaSetDevice(mype_node); 33 | cudaStreamCreate(&stream); 34 | 35 | int *destination = (int *)nvshmem_malloc(sizeof(int)); 36 | 37 | simple_shift<<<1, 1, 0, stream>>>(destination); 38 | nvshmemx_barrier_all_on_stream(stream); 39 | cudaMemcpyAsync(&msg, destination, sizeof(int), cudaMemcpyDeviceToHost, stream); 40 | 41 | cudaStreamSynchronize(stream); 42 | printf("%d: received message %d\n", nvshmem_my_pe(), msg); 43 | 44 | nvshmem_free(destination); 45 | nvshmem_finalize(); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /test/host/mem/calloc.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "utils.h" 12 | 13 | #define NELEM (1 << 20) 14 | 15 | int main(int argc, char **argv) { 16 | long *dev_buf, *host_buf; 17 | size_t i, err = 0; 18 | 19 | init_wrapper(&argc, &argv); 20 | 21 | /* Check count == 0 */ 22 | dev_buf = (long *)nvshmem_calloc(0, sizeof(long)); 23 | if (dev_buf != NULL) { 24 | printf("Error, zero element calloc did not return NULL\n"); 25 | ++err; 26 | } 27 | nvshmem_free(dev_buf); 28 | 29 | /* Check size == 0 */ 30 | dev_buf = (long *)nvshmem_calloc(NELEM, 0); 31 | if (dev_buf != NULL) { 32 | printf("Error, zero size calloc did not return NULL\n"); 33 | ++err; 34 | } 35 | nvshmem_free(dev_buf); 36 | 37 | /* Check that memory is cleared: calloc, set, free, calloc */ 38 | dev_buf = (long *)nvshmem_calloc(NELEM, sizeof(long)); 39 | cudaMemset(dev_buf, 0xAA, NELEM * sizeof(long)); 40 | nvshmem_free(dev_buf); 41 | 42 | host_buf = (long *)calloc(NELEM, sizeof(long)); 43 | dev_buf = (long *)nvshmem_calloc(NELEM, sizeof(long)); 44 | cudaMemcpy(host_buf, dev_buf, NELEM * sizeof(long), cudaMemcpyDeviceToHost); 45 | 46 | for (i = 0; i < NELEM; i++) 47 | if (host_buf[i]) ++err; 48 | 49 | free(host_buf); 50 | nvshmem_free(dev_buf); 51 | finalize_wrapper(); 52 | 53 | return err != 0; 54 | } 55 | -------------------------------------------------------------------------------- /src/modules/transport/common/transport_gdr_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _TRANSPORT_GDR_COMMON_H 8 | #define _TRANSPORT_GDR_COMMON_H 9 | 10 | #include // IWYU pragma: keep 11 | // IWYU pragma: no_include 12 | #include 13 | 14 | #include "gdrapi.h" 15 | 16 | struct gdrcopy_function_table { 17 | gdr_t (*open)(); 18 | int (*close)(gdr_t g); 19 | int (*pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, 20 | uint32_t va_space, gdr_mh_t *handle); 21 | int (*unpin_buffer)(gdr_t g, gdr_mh_t handle); 22 | int (*get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info); 23 | int (*map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); 24 | int (*unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); 25 | int (*copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); 26 | int (*copy_to_mapping)(gdr_mh_t handle, const void *map_d_ptr, void *h_ptr, size_t size); 27 | void (*runtime_get_version)(int *major, int *minor); 28 | int (*driver_get_version)(gdr_t g, int *major, int *minor); 29 | }; 30 | 31 | bool nvshmemt_gdrcopy_ftable_init(struct gdrcopy_function_table *gdrcopy_ftable, gdr_t *gdr_desc, 32 | void **gdrcopy_handle, int log_level); 33 | void nvshmemt_gdrcopy_ftable_fini(struct gdrcopy_function_table *gdrcopy_ftable, gdr_t *gdr_desc, 34 | void **gdrcopy_handle); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/modules/bootstrap/pmi/pmi-2/COPYRIGHT: -------------------------------------------------------------------------------- 1 | 2 | COPYRIGHT 3 | 4 | The following is a notice of limited availability of the code, and disclaimer 5 | which must be included in the prologue of the code and in all source listings 6 | of the code. 7 | 8 | Copyright Notice 9 | + 2002 University of Chicago 10 | 11 | Permission is hereby granted to use, reproduce, prepare derivative works, and 12 | to redistribute to others. 
This software was authored by: 13 | 14 | Mathematics and Computer Science Division 15 | Argonne National Laboratory, Argonne IL 60439 16 | 17 | (and) 18 | 19 | Department of Computer Science 20 | University of Illinois at Urbana-Champaign 21 | 22 | 23 | GOVERNMENT LICENSE 24 | 25 | Portions of this material resulted from work developed under a U.S. 26 | Government Contract and are subject to the following license: the Government 27 | is granted for itself and others acting on its behalf a paid-up, nonexclusive, 28 | irrevocable worldwide license in this computer software to reproduce, prepare 29 | derivative works, and perform publicly and display publicly. 30 | 31 | DISCLAIMER 32 | 33 | This computer code material was prepared, in part, as an account of work 34 | sponsored by an agency of the United States Government. Neither the United 35 | States, nor the University of Chicago, nor any of their employees, makes any 36 | warranty express or implied, or assumes any legal liability or responsibility 37 | for the accuracy, completeness, or usefulness of any information, apparatus, 38 | product, or process disclosed, or represents that its use would not infringe 39 | privately owned rights. 
40 | -------------------------------------------------------------------------------- /src/modules/transport/common/CMakeLists.txt.in: -------------------------------------------------------------------------------- 1 | macro(add_helper_library LIBRARY_NAME NEED_CUDA SOURCE_LIST) 2 | add_library(${LIBRARY_NAME} STATIC ${LINK_REQUIREMENTS} ${SOURCE_LIST}) 3 | 4 | set_target_properties(${LIBRARY_NAME} PROPERTIES 5 | POSITION_INDEPENDENT_CODE ON 6 | CXX_STANDARD_REQUIRED ON 7 | CUDA_STANDARD_REQUIRED ON 8 | CXX_STANDARD 11 9 | CUDA_STANDARD 11 10 | CUDA_SEPARABLE_COMPILATION ON 11 | ) 12 | 13 | target_include_directories(${LIBRARY_NAME} PUBLIC 14 | ${CMAKE_CURRENT_SOURCE_DIR} 15 | ) 16 | 17 | target_include_directories(${LIBRARY_NAME} PRIVATE 18 | ${CMAKE_SOURCE_DIR}/include 19 | ${CUDAToolkit_INCLUDE_DIRS} 20 | ) 21 | 22 | if(NEED_CUDA) 23 | target_link_libraries(${LIBRARY_NAME} PRIVATE CUDA::cudart_static) 24 | endif() 25 | endmacro() 26 | 27 | add_helper_library(nvshmem_transport_common OFF transport_common.cpp) 28 | 29 | if(NVSHMEM_USE_GDRCOPY) 30 | add_helper_library(nvshmem_transport_gdr_common OFF transport_gdr_common.cpp) 31 | target_include_directories(nvshmem_transport_gdr_common PUBLIC ${GDRCOPY_INCLUDE}) 32 | endif() 33 | 34 | if(NVSHMEM_BUILD_IBDEVX_TRANSPORT OR NVSHMEM_BUILD_IBGDA_TRANSPORT OR NVSHMEM_BUILD_IBRC_TRANSPORT) 35 | add_helper_library(nvshmem_transport_ib_common ON transport_ib_common.cpp) 36 | 37 | if(NVSHMEM_BUILD_IBDEVX_TRANSPORT OR NVSHMEM_BUILD_IBGDA_TRANSPORT) 38 | add_helper_library(nvshmem_transport_mlx5_common OFF transport_mlx5_common.cpp) 39 | target_link_libraries(nvshmem_transport_mlx5_common PRIVATE MLX5_lib) 40 | endif() 41 | endif() 42 | -------------------------------------------------------------------------------- /test/host/team/shmem_team_reuse_teams.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * * 4 | * * See License.txt for license information 5 | * */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "utils.h" 11 | 12 | int main(int argc, char **argv) { 13 | int i, me, npes; 14 | int ret = 0, errors = 0; 15 | 16 | init_wrapper(&argc, &argv); 17 | 18 | me = nvshmem_my_pe(); 19 | npes = nvshmem_n_pes(); 20 | 21 | if (me == 0) printf("Reuse teams test\n"); 22 | 23 | nvshmem_team_t old_team, new_team; 24 | ret = nvshmem_team_split_strided(NVSHMEM_TEAM_WORLD, 0, 1, npes, NULL, 0, &old_team); 25 | if (ret) ++errors; 26 | 27 | /* A total of npes-1 iterations are performed, where the active set in iteration i 28 | * includes PEs i..npes-1. The size of the team decreases by 1 each iteration. */ 29 | for (i = 1; i < npes; i++) { 30 | if (me == i) { 31 | printf("%3d: creating new team (start, stride, size): %3d, %3d, %3d\n", me, 32 | nvshmem_team_translate_pe(old_team, 1, NVSHMEM_TEAM_WORLD), 1, 33 | nvshmem_team_n_pes(old_team) - 1); 34 | } 35 | 36 | ret = nvshmem_team_split_strided(old_team, 1, 1, nvshmem_team_n_pes(old_team) - 1, NULL, 0, 37 | &new_team); 38 | if (old_team != NVSHMEM_TEAM_INVALID && ret) ++errors; 39 | 40 | nvshmem_team_destroy(old_team); 41 | old_team = new_team; 42 | } 43 | nvshmem_team_destroy(old_team); 44 | finalize_wrapper(); 45 | 46 | return errors != 0; 47 | } 48 | -------------------------------------------------------------------------------- /perftest/device/pt-to-pt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | nvshmem_add_perftest(shmem_atomic_bw.cu) 2 | nvshmem_add_perftest(shmem_atomic_latency.cu) 3 | nvshmem_add_perftest(shmem_atomic_ping_pong_latency.cu) 4 | nvshmem_add_perftest(shmem_g_bw.cu) 5 | nvshmem_add_perftest(shmem_g_latency.cu) 6 | nvshmem_add_perftest(shmem_get_bw.cu) 7 | nvshmem_add_perftest(shmem_get_latency.cu) 8 | nvshmem_add_perftest(shmem_p_bw.cu) 9 | nvshmem_add_perftest(shmem_p_latency.cu) 10 | 
nvshmem_add_perftest(shmem_p_ping_pong_latency.cu) 11 | nvshmem_add_perftest(shmem_put_atomic_ping_pong_latency.cu) 12 | nvshmem_add_perftest(shmem_put_bw.cu) 13 | nvshmem_add_perftest(shmem_put_latency.cu) 14 | nvshmem_add_perftest(shmem_put_ping_pong_latency.cu) 15 | nvshmem_add_perftest(shmem_put_signal_ping_pong_latency.cu) 16 | nvshmem_add_perftest(shmem_signal_ping_pong_latency.cu) 17 | nvshmem_add_perftest(shmem_st_bw.cu) 18 | 19 | if(NVSHMEM_BUILD_BITCODE_LIBRARY) 20 | nvshmem_add_cubin_perftest(shmem_atomic_bw.cu) 21 | nvshmem_add_cubin_perftest(shmem_atomic_latency.cu) 22 | nvshmem_add_cubin_perftest(shmem_atomic_ping_pong_latency.cu) 23 | nvshmem_add_cubin_perftest(shmem_g_latency.cu) 24 | nvshmem_add_cubin_perftest(shmem_get_latency.cu) 25 | nvshmem_add_cubin_perftest(shmem_p_latency.cu) 26 | nvshmem_add_cubin_perftest(shmem_p_ping_pong_latency.cu) 27 | nvshmem_add_cubin_perftest(shmem_put_atomic_ping_pong_latency.cu) 28 | nvshmem_add_cubin_perftest(shmem_put_latency.cu) 29 | nvshmem_add_cubin_perftest(shmem_put_ping_pong_latency.cu) 30 | nvshmem_add_cubin_perftest(shmem_put_signal_ping_pong_latency.cu) 31 | nvshmem_add_cubin_perftest(shmem_signal_ping_pong_latency.cu) 32 | endif() 33 | -------------------------------------------------------------------------------- /test/device/coll/coll_test.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef COLL_TEST_H 8 | #define COLL_TEST_H 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "nvshmem.h" 14 | #include "nvshmemx.h" 15 | 16 | #ifdef NVSHMEMTEST_MPI_SUPPORT 17 | #include "mpi.h" 18 | #endif 19 | #include "utils.h" 20 | #include 21 | #include 22 | 23 | #define ELEMS_PER_THREAD 32 24 | #define NVSHM_TEST_NUM_TPB 32 25 | #undef MAX_ELEMS 26 | #define MAX_ELEMS (ELEMS_PER_THREAD * NVSHM_TEST_NUM_TPB) 27 | #define MAX_NPES 128 28 | #define MAX_ITER 32 29 | #define LARGEST_DT uint64_t 30 | 31 | #define CUDA_RUNTIME_CHECK(stmt) \ 32 | do { \ 33 | cudaError_t result = (stmt); \ 34 | if (cudaSuccess != result) { \ 35 | fprintf(stderr, "[%s:%d] cuda failed with %s \n", __FILE__, __LINE__, \ 36 | cudaGetErrorString(result)); \ 37 | status = -1; \ 38 | goto out; \ 39 | } \ 40 | assert(cudaSuccess == result); \ 41 | } while (0) 42 | 43 | #endif /*COLL_TEST_H*/ 44 | -------------------------------------------------------------------------------- /test/device/sync/sync_test.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "nvshmem.h" 8 | #include "nvshmemx.h" 9 | #include "utils.h" 10 | 11 | __device__ int error_d; 12 | 13 | __global__ void test_nvshmem_test_kernel(uint64_t *remote, int mype, int npes) { 14 | nvshmemx_signal_op(remote, (uint64_t)mype + 1, NVSHMEM_SIGNAL_SET, (mype + 1) % npes); 15 | 16 | while (!nvshmem_uint64_test(remote, NVSHMEM_CMP_NE, 0)) 17 | ; 18 | if (*remote != ((uint64_t)mype + npes - 1) % npes + 1) { 19 | printf("PE %d received incorrect value", mype); 20 | error_d = 1; 21 | } 22 | } 23 | 24 | int main(int argc, char *argv[]) { 25 | read_args(argc, argv); 26 | init_wrapper(&argc, &argv); 27 | const int mype = nvshmem_my_pe(); 28 | const int npes = nvshmem_n_pes(); 29 | 30 | int zero = 0, ret_val; 31 | cudaMemcpyToSymbol(error_d, &zero, sizeof(int), 0); 32 | cudaDeviceSynchronize(); 33 | 34 | uint64_t *remote; 35 | if (use_mmap) { 36 | remote = 37 | (uint64_t *)allocate_mmap_buffer(sizeof(uint64_t), _mem_handle_type, use_egm, true); 38 | } else { 39 | remote = (uint64_t *)nvshmem_malloc(sizeof(uint64_t)); 40 | cudaMemset(remote, 0, sizeof(uint64_t)); 41 | } 42 | nvshmem_barrier_all(); 43 | /* The kernel is designed for 1 thread */ 44 | test_nvshmem_test_kernel<<<1, 1>>>(remote, mype, npes); 45 | cudaDeviceSynchronize(); 46 | 47 | cudaMemcpyFromSymbol(&ret_val, error_d, sizeof(int), 0); 48 | 49 | finalize_wrapper(); 50 | 51 | return ret_val; 52 | } 53 | -------------------------------------------------------------------------------- /test/host/mem/malloc_simple.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "cuda_runtime.h" 12 | #include "utils.h" 13 | 14 | #define MAX_SIZE 128 * 1024 * 1024 15 | 16 | int main(int argc, char **argv) { 17 | int status = 0; 18 | int mype; 19 | size_t size; 20 | char *buffer = NULL; 21 | char size_string[100]; 22 | 23 | size = (size_t)MAX_SIZE * 2; 24 | sprintf(size_string, "%zu", size); 25 | 26 | status = setenv("NVSHMEM_SYMMETRIC_SIZE", size_string, 1); 27 | if (status) { 28 | ERROR_PRINT("setenv failed \n"); 29 | status = -1; 30 | goto out; 31 | } 32 | 33 | init_wrapper(&argc, &argv); 34 | 35 | mype = nvshmem_my_pe(); 36 | #ifdef _NVSHMEM_DEBUG 37 | npes = nvshmem_n_pes(); 38 | #endif 39 | 40 | for (size = 1; size <= MAX_SIZE; size *= 2) { 41 | buffer = (char *)nvshmem_malloc(size); 42 | if (!buffer) { 43 | ERROR_PRINT("shmem_malloc failed \n"); 44 | status = -1; 45 | goto out; 46 | } 47 | 48 | cudaMemset(buffer, 0, size); 49 | 50 | if (!mype) 51 | DEBUG_PRINT("[%d of %d] allocated symmetric object: %p size: %zu bytes \n", mype, npes, 52 | buffer, size); 53 | 54 | nvshmem_free(buffer); 55 | 56 | if (!mype) DEBUG_PRINT("[%d of %d] free symmetric object: %p \n", mype, npes, buffer); 57 | } 58 | 59 | out: 60 | finalize_wrapper(); 61 | return status; 62 | } 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | *.dirstamp 35 | *.deps 36 | *.log 37 
| *.status 38 | 39 | #emacs backup files 40 | #++++ 41 | *.*~ 42 | 43 | # Vim backup files 44 | *.*.swp 45 | 46 | # IDE trash folder 47 | .idea 48 | .vscode 49 | 50 | # Generated sources 51 | src/device/comm/transfer_device.cu 52 | src/include/non_abi/nvshmem_build_options.h 53 | src/include/non_abi/nvshmem_version.h 54 | test/common/include/non_abi/device/pt-to-pt/transfer_device.cuh 55 | 56 | # gtest artifacts 57 | _deps 58 | 59 | # tarballs 60 | *.tar 61 | 62 | # Build and install targets 63 | build/ 64 | bin/ 65 | install/ 66 | perftest_install/ 67 | examples_install/ 68 | test_install/ 69 | git_commit.txt 70 | version.txt 71 | 72 | # local debug for coverity 73 | .gitlab-ci-4-coverity*.yml 74 | .gitlab-ci-4-coverage*.yml 75 | .Makefile* 76 | .version.mk 77 | .common.mk 78 | 79 | # CMake artifacts 80 | *CMakeFiles* 81 | CMakeCache.txt 82 | *Makefile 83 | CPack*Config.cmake 84 | NVSHMEM*.cmake 85 | *CTest*.cmake 86 | cmake_install.cmake 87 | 88 | # nvshmem4py objects 89 | nvshmem4py/CMakeFiles/ 90 | nvshmem4py/*.cmake 91 | nvshmem4py/nvshmem/bindings/*.cpp 92 | nvshmem4py/nvshmem/bindings/_internal/*.cpp 93 | nvshmem4py/*.egg-info 94 | nvshmem4py/pyproject.toml 95 | 96 | # Filesystem objects 97 | .DS_STORE 98 | .nfs* 99 | -------------------------------------------------------------------------------- /src/include/bootstrap_device_host/nvshmem_uniqueid.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _NVSHMEM_UNIQUEID_H_ 3 | #define _NVSHMEM_UNIQUEID_H_ 4 | 5 | #define UNIQUEID_PADDING 124 6 | #define UNIQUEID_ARGS_INVALID -1 7 | #if !defined __CUDACC_RTC__ 8 | #include // for NULL 9 | #define NVSHMEMX_UNIQUEID_INITIALIZER \ 10 | { \ 11 | (1 << 16) + sizeof(nvshmemx_uniqueid_t), /* version */ \ 12 | { \ 13 | 0 \ 14 | } \ 15 | } 16 | 17 | #define NVSHMEMX_UNIQUEID_ARGS_INITIALIZER \ 18 | { \ 19 | (1 << 16) + sizeof(nvshmemx_uniqueid_args_t), /* version */ \ 20 | NULL, /* id */ \ 21 | UNIQUEID_ARGS_INVALID, /* 
myrank */ \ 22 | UNIQUEID_ARGS_INVALID /* nranks */ \ 23 | } 24 | #endif 25 | typedef struct { 26 | int version; 27 | char internal[UNIQUEID_PADDING]; 28 | } nvshmemx_uniqueid_v1; 29 | static_assert(sizeof(nvshmemx_uniqueid_v1) == 128, "uniqueid_v1 must be 128 bytes."); 30 | 31 | typedef nvshmemx_uniqueid_v1 nvshmemx_uniqueid_t; 32 | 33 | typedef struct { 34 | int version; 35 | nvshmemx_uniqueid_v1 *id; 36 | int myrank; 37 | int nranks; 38 | } nvshmemx_uniqueid_args_v1; 39 | static_assert(sizeof(nvshmemx_uniqueid_args_v1) == 24, "uniqueid_args_v1 must be 24 bytes."); 40 | 41 | typedef nvshmemx_uniqueid_args_v1 nvshmemx_uniqueid_args_t; 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /src/host/coll/rdxn/rdxn.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_RDXN_COMMON_CPU_H 8 | #define NVSHMEMI_RDXN_COMMON_CPU_H 9 | #include 10 | #include 11 | 12 | #include "cpu_coll.h" 13 | #include "non_abi/nvshmem_build_options.h" 14 | #include "device_host/nvshmem_common.cuh" 15 | #include "internal/host/nvshmem_internal.h" 16 | #include "device_host/nvshmem_types.h" 17 | #include "internal/host/util.h" 18 | #ifdef NVSHMEM_USE_NCCL 19 | #include "nccl.h" 20 | #endif 21 | 22 | template 23 | void nvshmemi_call_rdxn_on_stream_kernel(nvshmem_team_t team, TYPE *dest, const TYPE *source, 24 | size_t nreduce, cudaStream_t stream); 25 | 26 | template 27 | int nvshmemi_reduce_on_stream(nvshmem_team_t team, TYPE *dest, const TYPE *source, size_t nreduce, 28 | cudaStream_t stream) { 29 | #ifdef NVSHMEM_USE_NCCL 30 | nvshmemi_team_t *teami = nvshmemi_team_pool[team]; 31 | if (teami->nvls_rsc_base_ptr == NULL && nvshmemi_use_nccl && 32 | nvshmemi_get_nccl_op() != ncclNumOps && nvshmemi_get_nccl_dt() != ncclNumTypes) { 33 | 
NCCL_CHECK(nccl_ftable.AllReduce(source, dest, nreduce, nvshmemi_get_nccl_dt(), 34 | nvshmemi_get_nccl_op(), (ncclComm_t)teami->nccl_comm, 35 | stream)); 36 | } else 37 | #endif /* NVSHMEM_USE_NCCL */ 38 | { 39 | nvshmemi_call_rdxn_on_stream_kernel(team, dest, source, nreduce, stream); 40 | } 41 | return 0; 42 | } 43 | 44 | #endif /* NVSHMEMI_RDXN_COMMON_CPU_H */ 45 | -------------------------------------------------------------------------------- /src/host/comm/fence.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "host/nvshmem_api.h" // IWYU pragma: keep 8 | #include 9 | #include 10 | 11 | #include "internal/host/nvshmem_internal.h" 12 | #include "internal/host/nvshmemi_types.h" 13 | #include "internal/host/nvshmem_nvtx.hpp" 14 | #include "non_abi/nvshmemx_error.h" 15 | #include "internal/host_transport/transport.h" 16 | #include "internal/host/util.h" 17 | 18 | void nvshmem_fence(void) { 19 | NVTX_FUNC_RANGE_IN_GROUP(MEMORDER); 20 | NVSHMEMI_CHECK_INIT_STATUS(); 21 | 22 | int status; 23 | int tbitmap = nvshmemi_state->transport_bitmap; 24 | for (int j = 0; j < nvshmemi_state->num_initialized_transports; j++) { 25 | if (tbitmap & 1) { 26 | struct nvshmem_transport *tcurr = 27 | ((nvshmem_transport_t *)nvshmemi_state->transports)[j]; 28 | if ((tcurr->attr & NVSHMEM_TRANSPORT_ATTR_NO_ENDPOINTS)) { 29 | for (int s = 0; s < nvshmemi_options.MAX_PEER_STREAMS; s++) { 30 | cudaStream_t custrm = nvshmemi_state->custreams[s]; 31 | CUDA_RUNTIME_CHECK_GOTO(cudaStreamSynchronize(custrm), status, out); 32 | } 33 | } else if (tcurr->host_ops.fence) { 34 | for (int k = 0; k < nvshmemi_state->npes; k++) { 35 | status = tcurr->host_ops.fence(tcurr, k, 0, NVSHMEMX_QP_HOST); 36 | NVSHMEMI_NZ_ERROR_JMP(status, NVSHMEMX_ERROR_INTERNAL, out, 37 | "nvshmem_fence() failed \n"); 38 | } 39 | } 40 | 
} 41 | tbitmap >>= 1; 42 | } 43 | out: 44 | return; 45 | } 46 | -------------------------------------------------------------------------------- /src/include/internal/bootstrap_host/nvshmemi_bootstrap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | #ifndef NVSHMEMI_BOOTSTRAP_H 7 | #define NVSHMEMI_BOOTSTRAP_H 8 | 9 | #include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h" 10 | #include "non_abi/nvshmem_version.h" 11 | /* Version = major * 10000 + minor * 100 + patch*/ 12 | /* ABI Introduced in NVSHMEM 2.8.0 */ 13 | #define NVSHMEMI_BOOTSTRAP_ABI_VERSION \ 14 | (NVSHMEM_BOOTSTRAP_PLUGIN_MAJOR_VERSION * 10000 + \ 15 | NVSHMEM_BOOTSTRAP_PLUGIN_MINOR_VERSION * 100 + NVSHMEM_BOOTSTRAP_PLUGIN_PATCH_VERSION) 16 | 17 | #define NVSHMEM_BOOTSTRAP_MAJOR_VERSION(ver) (ver / 10000) 18 | #define NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(ver) (ver / 100) 19 | 20 | static bool nvshmemi_is_bootstrap_compatible(int bootstrap_version, int nvshmem_version, 21 | bool boot_backward_compatible) { 22 | if (NVSHMEM_BOOTSTRAP_MAJOR_VERSION(bootstrap_version) != 23 | NVSHMEM_BOOTSTRAP_MAJOR_VERSION(nvshmem_version)) { 24 | return false; 25 | } 26 | 27 | if (NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(nvshmem_version) < 28 | NVSHMEM_BOOTSTRAP_MAJOR_MINOR_VERSION(bootstrap_version)) { 29 | if (boot_backward_compatible) { 30 | return true; 31 | } 32 | return false; 33 | } 34 | return true; 35 | } 36 | 37 | #if __cplusplus 38 | extern "C" { 39 | #endif 40 | int nvshmemi_bootstrap_plugin_init(void *mpi_comm, bootstrap_handle_t *handle, 41 | const int nvshmem_version); 42 | int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t *handle, const int nvshmem_version); 43 | #if __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- 
/test/host/interop/app.cu: -------------------------------------------------------------------------------- 1 | #include "simplelib1.h" 2 | #include "simplelib2.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | __device__ int num_errors_d; 11 | __global__ void app_nvshmem_kernel(int *array) { 12 | int my_pe = nvshmem_my_pe(); 13 | int n_pes = nvshmem_n_pes(); 14 | int next_pe = (my_pe + 1) % n_pes; 15 | int prev_pe = (my_pe - 1 + n_pes) % n_pes; 16 | nvshmem_int_p(array, my_pe, next_pe); 17 | nvshmem_barrier_all(); 18 | 19 | if (array[0] != prev_pe) { 20 | printf("app: incorrect value found, expected = %d, found = %d\n", prev_pe, array[0]); 21 | num_errors_d = 1; 22 | } 23 | } 24 | 25 | int app_dowork() { 26 | int *array = (int *)nvshmem_calloc(1, sizeof(int)); 27 | int num_errors = 0; 28 | app_nvshmem_kernel<<<1, 1>>>(array); 29 | cudaDeviceSynchronize(); 30 | cudaMemcpyFromSymbol(&num_errors, num_errors_d, sizeof(int)); 31 | nvshmem_free(array); 32 | return num_errors; 33 | } 34 | 35 | int main(int argc, char **argv) { 36 | nvshmem_init(); 37 | int mype = nvshmem_my_pe(); 38 | int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 39 | int npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 40 | int dev_count; 41 | cudaGetDeviceCount(&dev_count); 42 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 43 | cudaSetDevice(mype_node / npes_per_gpu); 44 | nvshmem_barrier_all(); 45 | 46 | simplelib1_init(); 47 | simplelib2_init(); 48 | 49 | int num_errors = app_dowork(); 50 | num_errors += simplelib1_dowork(); 51 | num_errors += simplelib2_dowork(); 52 | 53 | nvshmem_finalize(); 54 | simplelib1_finalize(); 55 | simplelib2_finalize(); 56 | 57 | return num_errors; 58 | } 59 | -------------------------------------------------------------------------------- /src/host/init/query_host.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include // for size_t 8 | #include "device_host/nvshmem_types.h" // for nvshmem_team_t 9 | #include "device_host_transport/nvshmem_constants.h" // for NVSHMEM_MAJOR... 10 | #include "host/nvshmem_api.h" // for nvshmem_team_... 11 | #include "internal/bootstrap_host_transport/nvshmemi_bootstrap_defines.h" // for bootstrap_han... 12 | #include "non_abi/nvshmem_version.h" // for NVSHMEM_VENDO... 13 | #include "internal/host/nvshmemi_types.h" 14 | 15 | int nvshmem_my_pe(void) { return nvshmemi_boot_handle.pg_rank; } 16 | 17 | int nvshmem_n_pes(void) { return nvshmemi_boot_handle.pg_size; } 18 | 19 | void nvshmem_info_get_name(char *name) { 20 | size_t i; 21 | const char *str = NVSHMEM_VENDOR_STRING; 22 | 23 | /* Copy up to NVSHMEM_MAX_NAME_LEN-1 chars, then add NULL terminator */ 24 | for (i = 0; i < NVSHMEM_MAX_NAME_LEN - 1 && str[i] != '\0'; i++) name[i] = str[i]; 25 | 26 | name[i] = '\0'; 27 | } 28 | 29 | void nvshmem_info_get_version(int *major, int *minor) { 30 | *major = NVSHMEM_MAJOR_VERSION; 31 | *minor = NVSHMEM_MINOR_VERSION; 32 | } 33 | 34 | void nvshmemx_vendor_get_version_info(int *major, int *minor, int *patch) { 35 | *major = NVSHMEM_VENDOR_MAJOR_VERSION; 36 | *minor = NVSHMEM_VENDOR_MINOR_VERSION; 37 | *patch = NVSHMEM_VENDOR_PATCH_VERSION; 38 | } 39 | 40 | int nvshmemx_my_pe(nvshmemx_team_t team) { return nvshmem_team_my_pe((nvshmem_team_t)team); } 41 | 42 | int nvshmemx_n_pes(nvshmemx_team_t team) { return nvshmem_team_n_pes((nvshmem_team_t)team); } 43 | -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for NVSHMEM4Py unit tests 3 | 4 | TODO: this is vended from top level utils.py, let's find a way to centralize these functions 5 | """ 6 | 7 | from mpi4py import MPI 8 | import numpy as np 9 | 
10 | import nvshmem.core 11 | 12 | from cuda.core.experimental import Device, system 13 | 14 | import os 15 | 16 | def get_local_rank_per_node(): 17 | comm = MPI.COMM_WORLD 18 | rank = comm.Get_rank() 19 | size = comm.Get_size() 20 | 21 | # Split COMM_WORLD into sub-communicators of processes on the same node 22 | node_comm = comm.Split_type(MPI.COMM_TYPE_SHARED) 23 | 24 | local_rank = node_comm.Get_rank() 25 | local_size = node_comm.Get_size() 26 | 27 | return local_rank 28 | 29 | def uid_init(): 30 | # This will use mpi4py to perform a UID based init with bcast. 31 | comm = MPI.COMM_WORLD 32 | rank = comm.Get_rank() 33 | nranks = comm.Get_size() 34 | 35 | local_rank_per_node = get_local_rank_per_node() 36 | dev = Device(local_rank_per_node) 37 | dev.set_current() 38 | 39 | # Create an empty uniqueid for all ranks 40 | uniqueid = nvshmem.core.get_unique_id(empty=True) 41 | if rank == 0: 42 | # Rank 0 gets a real uniqueid 43 | uniqueid = nvshmem.core.get_unique_id() 44 | 45 | # Broadcast UID to all ranks 46 | comm.Bcast(uniqueid._data.view(np.int8), root=0) 47 | 48 | nvshmem.core.init(device=dev, uid=uniqueid, rank=rank, nranks=nranks, 49 | mpi_comm=None, initializer_method="uid") 50 | 51 | return dev 52 | 53 | def mpi_init(): 54 | local_rank_per_node = get_local_rank_per_node() 55 | dev = Device(local_rank_per_node) 56 | dev.set_current() 57 | nvshmem.core.init(device=dev, uid=None, rank=None, nranks=None, 58 | mpi_comm=MPI.COMM_WORLD, initializer_method="mpi") 59 | 60 | return dev 61 | -------------------------------------------------------------------------------- /test/host/init/uid_init.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include 8 | #include 9 | #include "nvshmem.h" 10 | #include "nvshmemx.h" 11 | #include "utils.h" 12 | 13 | int main(int c, char *v[]) { 14 | int rank, nranks; 15 | int mype_node, npes_node; 16 | MPI_Comm mpi_comm; 17 | nvshmemx_init_attr_t attr = NVSHMEMX_INIT_ATTR_INITIALIZER; 18 | nvshmemx_uniqueid_t id = NVSHMEMX_UNIQUEID_INITIALIZER; 19 | int dev_count; 20 | MPI_Init(&c, &v); 21 | 22 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 23 | MPI_Comm_size(MPI_COMM_WORLD, &nranks); 24 | 25 | DEBUG_PRINT("MPI: [%d of %d] hello MPI world! \n", rank, nranks); 26 | if (rank == 0) { 27 | nvshmemx_get_uniqueid(&id); 28 | } 29 | 30 | MPI_Bcast(&id, sizeof(nvshmemx_uniqueid_t), MPI_UINT8_T, 0, MPI_COMM_WORLD); 31 | nvshmemx_set_attr_uniqueid_args(rank, nranks, &id, &attr); 32 | /* Verify if structure is set correctly */ 33 | assert(attr.args.uid_args.id == &id); 34 | assert(attr.args.uid_args.myrank == rank); 35 | assert(attr.args.uid_args.nranks == nranks); 36 | 37 | nvshmemx_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID, &attr); 38 | 39 | mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE); 40 | npes_node = nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE); 41 | CUDA_CHECK(cudaGetDeviceCount(&dev_count)); 42 | int npes_per_gpu = (npes_node + dev_count - 1) / dev_count; 43 | CUDA_CHECK(cudaSetDevice(mype_node / npes_per_gpu)); 44 | 45 | #ifdef _NVSHMEM_DEBUG 46 | int mype, npes; 47 | mype = nvshmem_my_pe(); 48 | npes = nvshmem_n_pes(); 49 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 50 | #endif 51 | 52 | MPI_Barrier(MPI_COMM_WORLD); 53 | 54 | nvshmem_finalize(); 55 | 56 | MPI_Finalize(); 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /src/host/coll/reducescatter/reducescatter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef NVSHMEMI_REDUCESCATTER_COMMON_CPU_H 8 | #define NVSHMEMI_REDUCESCATTER_COMMON_CPU_H 9 | #include 10 | #include 11 | 12 | #include "cpu_coll.h" 13 | #include "non_abi/nvshmem_build_options.h" 14 | #include "device_host/nvshmem_common.cuh" 15 | #include "internal/host/nvshmem_internal.h" 16 | #include "device_host/nvshmem_types.h" 17 | #include "internal/host/util.h" 18 | #ifdef NVSHMEM_USE_NCCL 19 | #include "nccl.h" 20 | #endif 21 | 22 | template 23 | void nvshmemi_call_reducescatter_on_stream_kernel(nvshmem_team_t team, TYPE *dest, 24 | const TYPE *source, size_t nreduce, 25 | cudaStream_t stream); 26 | 27 | template 28 | int nvshmemi_reducescatter_on_stream(nvshmem_team_t team, TYPE *dest, const TYPE *source, 29 | size_t nreduce, cudaStream_t stream) { 30 | #ifdef NVSHMEM_USE_NCCL 31 | nvshmemi_team_t *teami = nvshmemi_team_pool[team]; 32 | if (teami->nvls_rsc_base_ptr == NULL && nvshmemi_use_nccl && 33 | nvshmemi_get_nccl_op() != ncclNumOps && nvshmemi_get_nccl_dt() != ncclNumTypes) { 34 | NCCL_CHECK(nccl_ftable.ReduceScatter(source, dest, nreduce, nvshmemi_get_nccl_dt(), 35 | nvshmemi_get_nccl_op(), 36 | (ncclComm_t)teami->nccl_comm, stream)); 37 | } else 38 | #endif /* NVSHMEM_USE_NCCL */ 39 | { 40 | nvshmemi_call_reducescatter_on_stream_kernel(team, dest, source, nreduce, stream); 41 | } 42 | return 0; 43 | } 44 | 45 | #endif /* NVSHMEMI_REDUCESCATTER_COMMON_CPU_H */ 46 | -------------------------------------------------------------------------------- /src/include/internal/host/custom_malloc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | /* 8 | * mspace is an opaque type representing an independent 9 | * region of space that supports mspace_malloc, etc. 
10 | * */ 11 | #ifndef NVSHMEMI_CUSTOM_MALLOC_H 12 | #define NVSHMEMI_CUSTOM_MALLOC_H 13 | 14 | #include // for size_t 15 | #include // for map 16 | #include // for pair 17 | 18 | #define NVSHMEMI_MALLOC_ALIGNMENT ((size_t)512U) 19 | 20 | class mspace { 21 | private: 22 | /* free_chunks_start is mapping of start address of each free chunk to size of that chunk */ 23 | /* free_chunks_end is mapping of end address of each free chunk to size of that chunk */ 24 | std::map free_chunks_start, free_chunks_end; 25 | /* in_use_cunks is a mapping of each in use chunks start address to size of the chunk */ 26 | std::map inuse_chunks; 27 | size_t total_size = 0; /* size of total space managed by mspace */ 28 | public: 29 | mspace() {} 30 | mspace(void *base, size_t capacity); 31 | void print(); 32 | void add_free_chunk(char *base, size_t capacity); 33 | void add_new_chunk(void *base, size_t capacity); 34 | int track_large_chunks(int enable); 35 | void *allocate(size_t bytes); 36 | void deallocate(void *mem); 37 | void *allocate_zeroed(size_t n_elements, size_t elem_size); 38 | void *allocate_aligned(size_t alignment, size_t bytes); 39 | void *reallocate(void *ptr, size_t size); 40 | bool checkInuse(void *ptr, size_t size); 41 | void *get_startInusePtr() { 42 | if (inuse_chunks.empty()) { 43 | return NULL; 44 | } 45 | return inuse_chunks.begin()->first; 46 | } 47 | std::map *get_inuse_chunks() { return &inuse_chunks; } 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /nvshmem4py/test/test_highlevel_bindings.py: -------------------------------------------------------------------------------- 1 | import cffi 2 | import argparse 3 | 4 | from cuda.core.experimental import Device 5 | 6 | from numba import cuda, int32 7 | from numba.types import float32, Array 8 | from numba.core.extending import overload 9 | 10 | import nvshmem 11 | from nvshmem.bindings import barrier_all 12 | from nvshmem.bindings.device.numba import 
my_pe, n_pes, int_p, float_p 13 | from utils import uid_init, mpi_init 14 | 15 | 16 | def test_highlevel_bindings(dev: Device): 17 | 18 | ffi = cffi.FFI() 19 | 20 | 21 | def p(): 22 | pass 23 | 24 | 25 | @overload(p) 26 | def p_ol(arr, mype, peer): 27 | if arr == Array(dtype=int32, ndim=arr.ndim, layout=arr.layout): 28 | 29 | def impl(arr, mype, peer): 30 | ptr = ffi.from_buffer(arr) 31 | int_p(ptr, mype, peer) 32 | 33 | return impl 34 | elif arr == Array(dtype=float32, ndim=arr.ndim, layout=arr.layout): 35 | 36 | def impl(arr, mype, peer): 37 | ptr = ffi.from_buffer(arr) 38 | float_p(ptr, mype, peer) 39 | 40 | return impl 41 | 42 | 43 | @cuda.jit(lto=True) 44 | def app_kernel(dest): 45 | mype = my_pe() 46 | npes = n_pes() 47 | peer = int32((mype + 1) % npes) 48 | 49 | p(dest, mype, peer) 50 | 51 | dest = nvshmem.core.array((1,), dtype="float32") 52 | 53 | app_kernel[1, 1, 0](dest) 54 | 55 | barrier_all() 56 | dev.sync() 57 | 58 | nvshmem.core.free_array(dest) 59 | nvshmem.core.finalize() 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--init-type", "-i", type=str, help="Init type to use", choices=["mpi", "uid"], default="uid") 65 | args = parser.parse_args() 66 | if args.init_type == "uid": 67 | dev = uid_init() 68 | elif args.init_type == "mpi": 69 | dev = mpi_init() 70 | 71 | test_highlevel_bindings(dev) -------------------------------------------------------------------------------- /test/device/sync/wait_until_all.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include "nvshmem.h" 8 | #include "nvshmemx.h" 9 | #include "utils.h" 10 | 11 | __device__ int error_d; 12 | 13 | #define N 100 14 | 15 | __global__ void test_kernel(uint64_t *flags, int *status, int mype, int npes) { 16 | for (int i = 0; i < npes; i++) nvshmemx_signal_op(&flags[mype], 1, NVSHMEM_SIGNAL_SET, i); 17 | nvshmem_quiet(); 18 | nvshmem_uint64_wait_until_all(flags, npes, status, NVSHMEM_CMP_EQ, 1); 19 | 20 | /* Check the flags array */ 21 | for (int i = 0; i < npes; i++) { 22 | if (flags[i] != 1) { 23 | printf("Incorrect flag value = %lu, expected = %d\n", flags[i], 1); 24 | error_d = 1; 25 | } 26 | } 27 | } 28 | 29 | int main(int argc, char **argv) { 30 | int ret_val = 0; 31 | read_args(argc, argv); 32 | init_wrapper(&argc, &argv); 33 | int mype = nvshmem_my_pe(); 34 | int npes = nvshmem_n_pes(); 35 | 36 | int zero = 0; 37 | cudaMemcpyToSymbol(error_d, &zero, sizeof(int), 0); 38 | cudaDeviceSynchronize(); 39 | 40 | uint64_t *flags; 41 | if (use_mmap) { 42 | flags = (uint64_t *)allocate_mmap_buffer(npes * sizeof(uint64_t), _mem_handle_type, use_egm, 43 | true); 44 | } else { 45 | flags = (uint64_t *)nvshmem_malloc(npes * sizeof(uint64_t)); 46 | cudaMemset(flags, 0, npes * sizeof(uint64_t)); 47 | } 48 | int *status = NULL; 49 | nvshmem_barrier_all(); 50 | 51 | cudaDeviceSynchronize(); 52 | test_kernel<<<1, 1>>>(flags, status, mype, npes); 53 | cudaDeviceSynchronize(); 54 | 55 | cudaMemcpyFromSymbol(&ret_val, error_d, sizeof(int), 0); 56 | if (use_mmap) { 57 | free_mmap_buffer(flags); 58 | } else { 59 | nvshmem_free(flags); 60 | } 61 | finalize_wrapper(); 62 | 63 | return ret_val; 64 | } 65 | -------------------------------------------------------------------------------- /nvshmem4py/test/device/numba/test_device_sync.py: -------------------------------------------------------------------------------- 1 | from cuda.core.experimental import Device, Stream 2 | import numba.cuda as cuda 3 | 
import nvshmem.core 4 | import nvshmem.core.device.numba 5 | 6 | import pytest 7 | 8 | @pytest.mark.mpi 9 | @pytest.mark.parametrize("teams", [nvshmem.core.Teams.TEAM_NODE, nvshmem.core.Teams.TEAM_WORLD, nvshmem.core.Teams.TEAM_SHARED]) 10 | @pytest.mark.parametrize("func", [nvshmem.core.device.numba.sync, nvshmem.core.device.numba.sync_block, nvshmem.core.device.numba.sync_warp]) 11 | def test_device_sync(nvshmem_init_fini, teams, func): 12 | print(f"Testing {func.__name__} on team {teams}") 13 | 14 | nblocks = 1 15 | nthreads = 1 16 | dev = Device() 17 | dev.sync() 18 | 19 | print(f"From PE {nvshmem.core.my_pe()}") 20 | 21 | @cuda.jit 22 | def test_sync(teams): 23 | func(teams) 24 | 25 | nb_stream = cuda.stream() # WAR: Numba-CUDA takes numba stream object or int 26 | cu_stream_ref = Stream.from_handle(nb_stream.handle.value) 27 | 28 | test_sync[nblocks, nthreads, nb_stream](teams) 29 | nvshmem.core.barrier(teams, stream=cu_stream_ref) 30 | cu_stream_ref.sync() 31 | dev.sync() 32 | print("Done testing sync") 33 | 34 | 35 | @pytest.mark.mpi 36 | @pytest.mark.parametrize("func", [nvshmem.core.device.numba.sync_all, nvshmem.core.device.numba.sync_all_block, nvshmem.core.device.numba.sync_all_warp]) 37 | def test_device_sync_all(nvshmem_init_fini, func): 38 | print(f"Testing {func.__name__}") 39 | 40 | nblocks = 1 41 | nthreads = 1 42 | 43 | dev = Device() 44 | dev.sync() 45 | 46 | print(f"From PE {nvshmem.core.my_pe()}") 47 | 48 | @cuda.jit 49 | def test_sync_all(): 50 | func() 51 | 52 | nb_stream = cuda.stream() # WAR: Numba-CUDA takes numba stream object or int 53 | cu_stream_ref = Stream.from_handle(nb_stream.handle.value) 54 | 55 | test_sync_all[nblocks, nthreads, nb_stream]() 56 | 57 | cu_stream_ref.sync() 58 | dev.sync() 59 | print("Done testing sync_all") -------------------------------------------------------------------------------- /src/host/coll/fcollect/fcollect_on_stream.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #include // for cudaStream_t 8 | #include // for size_t 9 | #include "device_host/nvshmem_common.cuh" // for NVSHMEMI_REPT_FOR_STAN... 10 | #include "device_host/nvshmem_types.h" // for nvshmem_team_t 11 | #include "fcollect.h" // for nvshmemi_fcollect_on_s... 12 | #include "host/nvshmemx_coll_api.h" // for nvshmemx_char_fcollect... 13 | #include "internal/host/nvshmem_internal.h" // for NVSHMEMI_CHECK_INIT_ST... 14 | #include "internal/host/nvshmem_nvtx.hpp" // for nvtx_cond_range, NVTX_... 15 | #include "internal/host/util.h" // for NVSHMEM_API_NOT_SUPPOR... 16 | 17 | #define DEFN_NVSHMEMX_TYPENAME_FCOLLECT_ON_STREAM(TYPENAME, TYPE) \ 18 | int nvshmemx_##TYPENAME##_fcollect_on_stream( \ 19 | nvshmem_team_t team, TYPE *dest, const TYPE *source, size_t nelems, cudaStream_t stream) { \ 20 | NVTX_FUNC_RANGE_IN_GROUP(COLL); \ 21 | NVSHMEMI_CHECK_INIT_STATUS(); \ 22 | NVSHMEM_API_NOT_SUPPORTED_WITH_LIMITED_MPG_RUNS(); \ 23 | return nvshmemi_fcollect_on_stream(team, dest, source, nelems, stream); \ 24 | } 25 | 26 | NVSHMEMI_REPT_FOR_STANDARD_RMA_TYPES(DEFN_NVSHMEMX_TYPENAME_FCOLLECT_ON_STREAM) 27 | 28 | int nvshmemx_fcollectmem_on_stream(nvshmem_team_t team, void *dest, const void *source, 29 | size_t nelems, cudaStream_t stream) { 30 | return nvshmemx_char_fcollect_on_stream(team, (char *)dest, (const char *)source, nelems, 31 | stream); 32 | } 33 | -------------------------------------------------------------------------------- /nvshmem4py/nvshmem/bindings/device/numba/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
#
# See License.txt for license information

from cuda.pathfinder import find_nvidia_header_directory

from numba import cuda

import os
import warnings

from nvshmem.core.nvshmem_types import NvshmemWarning

if os.path.exists(os.path.join(os.path.dirname(__file__), "_numbast.py")):
    from ._numbast import *

    INCLUDE_PATH = find_nvidia_header_directory("nvshmem")
    # Bug fix: validate the path *before* listing it.  The original called
    # os.listdir(INCLUDE_PATH) first, which raises TypeError (when the lookup
    # returned None) or FileNotFoundError instead of the intended RuntimeError
    # when the nvshmem headers are missing.
    if INCLUDE_PATH is None or not os.path.exists(INCLUDE_PATH):
        raise RuntimeError(f"NVSHMEM headers not found at {INCLUDE_PATH}. Please confirm that nvshmem is installed correctly.")

    if "nvshmem.h" not in os.listdir(INCLUDE_PATH):
        raise RuntimeError("nvshmem.h not found, package may not be properly installed")

    CCCL_INCLUDE_PATH = find_nvidia_header_directory("cccl")

    if CCCL_INCLUDE_PATH is None or not os.path.exists(CCCL_INCLUDE_PATH):
        raise RuntimeError(f"CCCL headers not found at {CCCL_INCLUDE_PATH}. Please confirm that cccl is installed correctly.")

    # Path to this folder to look for entry point file
    this_folder = os.path.dirname(os.path.abspath(__file__))
    if not os.path.exists(os.path.join(this_folder, "entry_point.h")):
        raise RuntimeError("entry_point.h not found, package may not be properly installed")

    cuda.config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = ":".join([INCLUDE_PATH, CCCL_INCLUDE_PATH, this_folder])

else:
    warnings.warn("Numba device bindings are not enabled", NvshmemWarning)
    _numbast = None
@pytest.mark.mpi
@pytest.mark.parametrize(
    "func",
    [
        nvshmem.core.device.numba.barrier_all,
        nvshmem.core.device.numba.barrier_all_block,
        nvshmem.core.device.numba.barrier_all_warp,
    ],
)
def test_device_barrier_all(nvshmem_init_fini, func):
    """Launch a one-thread kernel that calls a device-side barrier_all variant."""
    print(f"Testing {func.__name__}")

    grid_dim = 1
    block_dim = 1

    device = Device()
    device.sync()

    print(f"From PE {nvshmem.core.my_pe()}")

    @cuda.jit
    def _kernel():
        func()

    numba_stream = cuda.stream()  # WAR: Numba-CUDA takes numba stream object or int
    core_stream = Stream.from_handle(numba_stream.handle.value)

    _kernel[grid_dim, block_dim, numba_stream]()

    core_stream.sync()
    device.sync()
    print("Done testing barrier_all")
# See License.txt for license information

"""
This file shows a minimal example of using NVSHMEM4Py to run a collective operation on CuPy arrays
"""

import cupy
import nvshmem.core
from cuda.core.experimental import Device, system
from mpi4py import MPI  # fix: MPI was used below but never imported
from numba import cuda

@cuda.jit
def simple_shift(arr, dst_pe):
    # Store the caller's value into element 0 of the (possibly remote) array.
    arr[0] = dst_pe

# Initialize NVSHMEM Using an MPI communicator
local_rank_per_node = MPI.COMM_WORLD.Get_rank() % system.num_devices
dev = Device(local_rank_per_node)
dev.set_current()
stream = dev.create_stream()
nvshmem.core.init(device=dev, mpi_comm=MPI.COMM_WORLD, initializer_method="mpi")

# Helper function to return a CuPy ArrayView backed by NVSHMEM symmetric memory
size = 1
array = nvshmem.core.array((size,), dtype="int32")

my_pe = nvshmem.core.my_pe()
# A unidirectional ring - always get the neighbor to the right
dst_pe = (my_pe + 1) % nvshmem.core.n_pes()

# This function returns an Array which can be directly load/store'd to over NVLink
# The dst_PE must be in the same NVL domain as the PE calling this function, otherwise it will raise an Exception
# fix: pass the symmetric array allocated above (was an undefined name `b`)
dev_dst = nvshmem.core.get_peer_array(array, dst_pe)


block = 1
grid = (size + block - 1) // block
# fix: Numba launch config is [griddim, blockdim, stream, sharedmem], and the
# kernel must write into the peer's view so each PE receives its neighbor's PE
simple_shift[grid, block, 0, 0](dev_dst, my_pe)
nvshmem.core.barrier(nvshmem.core.Teams.TEAM_NODE, stream)
# This should print the neighbor's PE
print(f"From PE {my_pe}, array contains {array}")

# fix: free the array that was actually allocated (arr_src/arr_dst were undefined)
nvshmem.core.free_array(array)
nvshmem.core.finalize()
Under the terms of Contract 5 | * DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government 6 | * retains certain rights in this software. 7 | * 8 | * Copyright (c) 2017 Intel Corporation. All rights reserved. 9 | * This software is available to you under the BSD license. 10 | * 11 | * Portions of this file are derived from Sandia OpenSHMEM. 12 | * 13 | * See License.txt for license information 14 | ****/ 15 | 16 | /* NVSHMEMI_ENV_DEF( name, kind, default, category, short description ) 17 | * 18 | * Kinds: long, size, bool, string 19 | * Categories: NVSHMEMI_ENV_CAT_OPENSHMEM, NVSHMEMI_ENV_CAT_OTHER, 20 | * NVSHMEMI_ENV_CAT_COLLECTIVES, NVSHMEMI_ENV_CAT_TRANSPORT, 21 | * NVSHMEMI_ENV_CAT_HIDDEN 22 | */ 23 | 24 | #ifndef NVSHMEM_ENV_DEFS_INTERNAL 25 | #include "bootstrap_host_transport/env_defs_internal.h" // IWYU pragma: keep 26 | #endif 27 | 28 | #ifdef NVSHMEMI_ENV_DEF 29 | 30 | NVSHMEMI_ENV_DEF(DEBUG, string, "", NVSHMEMI_ENV_CAT_OPENSHMEM, 31 | "Set to enable debugging messages.\n" 32 | "Optional values: VERSION, WARN, INFO, ABORT, TRACE") 33 | 34 | /** Bootstrap **/ 35 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SOCK_IFNAME, string, "", NVSHMEMI_ENV_CAT_BOOTSTRAP, 36 | "Name of the UID bootstrap socket interface name") 37 | 38 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SOCK_FAMILY, string, "AF_INET", NVSHMEMI_ENV_CAT_BOOTSTRAP, 39 | "Name of the UID bootstrap socket family name") 40 | 41 | NVSHMEMI_ENV_DEF(BOOTSTRAP_UID_SESSION_ID, string, "", NVSHMEMI_ENV_CAT_BOOTSTRAP, 42 | "Name of the UID bootstrap session identifier") 43 | 44 | /** Debugging **/ 45 | NVSHMEMI_ENV_DEF(DEBUG_SUBSYS, string, "", NVSHMEMI_ENV_CAT_HIDDEN, 46 | "Comma separated list of debugging message sources. 
Prefix with '^' to exclude.\n" 47 | "Values: INIT, COLL, P2P, PROXY, TRANSPORT, MEM, BOOTSTRAP, TOPO, UTIL, ALL") 48 | #endif 49 | -------------------------------------------------------------------------------- /src/include/internal/host/cuda_interface_sync.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * See License.txt for license information 5 | */ 6 | 7 | #ifndef _CUDA_INTERFACE_SYNC_H_ 8 | #define _CUDA_INTERFACE_SYNC_H_ 9 | #include "device_host/nvshmem_common.cuh" 10 | 11 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL(type, TYPE) \ 12 | void call_nvshmemi_##type##_wait_until_on_stream_kernel(volatile TYPE *ivar, int cmp, \ 13 | TYPE cmp_value, cudaStream_t cstream); 14 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL) 15 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ON_STREAM_KERNEL 16 | 17 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL(type, TYPE) \ 18 | void call_nvshmemi_##type##_wait_until_all_on_stream_kernel( \ 19 | volatile TYPE *ivars, size_t nelems, const int *status, int cmp, TYPE cmp_value, \ 20 | cudaStream_t cstream); 21 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL) 22 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_ON_STREAM_KERNEL 23 | 24 | #define DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL(type, TYPE) \ 25 | void call_nvshmemi_##type##_wait_until_all_vector_on_stream_kernel( \ 26 | volatile TYPE *ivars, size_t nelems, const int *status, int cmp, TYPE *cmp_value, \ 27 | cudaStream_t cstream); 28 | NVSHMEMI_REPT_FOR_WAIT_TYPES(DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL) 29 | #undef DECL_CALL_NVSHMEMI_TYPENAME_WAIT_UNTIL_ALL_VECTOR_ON_STREAM_KERNEL 30 | 31 | void call_nvshmemi_signal_wait_until_on_stream_kernel(volatile uint64_t *sig_addr, int 
def calculate_modules(module):
    """Return the three Cython Extension objects derived from a dotted module name.

    For ``a.b.mod`` this produces extensions for ``a.b.mod``, ``a.b.cymod``
    and ``a.b._internal.mod``, each pointing at the matching ``.pyx`` source.
    """
    parts = module.split(".")

    def _make_ext(name_parts):
        # Dotted module name plus its on-disk .pyx source path.
        pyx_path = os.path.join(*name_parts[:-1], f"{name_parts[-1]}.pyx")
        return Extension(
            ".".join(name_parts),
            sources=[pyx_path],
            language="c++",
        )

    lowpp_parts = list(parts)
    cy_parts = parts[:-1] + [f"cy{parts[-1]}"]
    inter_parts = parts[:-1] + ["_internal", parts[-1]]

    return _make_ext(lowpp_parts), _make_ext(cy_parts), _make_ext(inter_parts)
def get_local_rank_per_node():
    """Return this process's rank among the MPI ranks running on the same node."""
    world = MPI.COMM_WORLD
    world_rank = world.Get_rank()
    world_size = world.Get_size()

    # Group the ranks that share this node to derive a node-local rank.
    node_comm = world.Split_type(MPI.COMM_TYPE_SHARED)

    node_rank = node_comm.Get_rank()
    node_size = node_comm.Get_size()
    print(f"Local rank {node_rank} global rank {world_rank} and node size {node_size} of global size {world_size} ranks")
    return node_rank
def mpi_init():
    """Initialize NVSHMEM via the MPI bootstrap and return the selected Device."""
    device = Device(get_local_rank_per_node())
    device.set_current()
    nvshmem.core.init(
        device=device,
        uid=None,
        rank=None,
        nranks=None,
        mpi_comm=MPI.COMM_WORLD,
        initializer_method="mpi",
    )
    print(f"MPI initialized on device {device.device_id}")
    return device
/* Byte-granularity alltoall on a stream: forwards to the char-typed variant,
 * treating dest/source as arrays of nelems bytes per peer.  Returns the
 * status of the underlying typed call. */
int nvshmemx_alltoallmem_on_stream(nvshmem_team_t team, void *dest, const void *source,
                                   size_t nelems, cudaStream_t stream) {
    return nvshmemx_char_alltoall_on_stream(team, (char *)dest, (const char *)source, nelems,
                                            stream);
}
/* Byte-granularity broadcast on a stream: forwards to the char-typed variant,
 * treating dest/source as arrays of nelems bytes rooted at PE_root.  Returns
 * the status of the underlying typed call. */
int nvshmemx_broadcastmem_on_stream(nvshmem_team_t team, void *dest, const void *source,
                                    size_t nelems, int PE_root, cudaStream_t stream) {
    return nvshmemx_char_broadcast_on_stream(team, (char *)dest, (const char *)source, nelems,
                                             PE_root, stream);
}
/* Operation codes carried on the transport path.  The *_QP variants are the
 * same operations offset by NVSHMEMI_OP_QP_OP_OFFSET (e.g. PUT_QP == PUT + 100). */
typedef enum {
    NVSHMEMI_OP_PUT = 1,
    NVSHMEMI_OP_P = 2,
    NVSHMEMI_OP_PUT_SIGNAL = 3,
    NVSHMEMI_OP_GET = 4,
    NVSHMEMI_OP_G = 5,
    NVSHMEMI_OP_FENCE = 6,
    NVSHMEMI_OP_AMO = 7,
    NVSHMEMI_OP_QUIET = 8,
    NVSHMEMI_OP_QP_OP_OFFSET = 100,
    NVSHMEMI_OP_PUT_QP = 101,
    NVSHMEMI_OP_P_QP = 102,
    NVSHMEMI_OP_PUT_SIGNAL_QP = 103,
    NVSHMEMI_OP_GET_QP = 104,
    NVSHMEMI_OP_G_QP = 105,
    NVSHMEMI_OP_FENCE_QP = 106,
    NVSHMEMI_OP_AMO_QP = 107,
    NVSHMEMI_OP_QUIET_QP = 108,
    /* Sentinel pins the enum's underlying type to at least int width. */
    NVSHMEMI_OP_SENTINEL = INT_MAX,
} nvshmemi_op_t;

/* Public signal operations; values 9/10 are shared with nvshmemi_amo_t below. */
typedef enum { NVSHMEM_SIGNAL_SET = 9, NVSHMEM_SIGNAL_ADD = 10 } nvshmemx_signal_op_t;

/* Atomic memory operation codes.  Values below NVSHMEMI_AMO_END_OF_NONFETCH
 * are non-fetching; the remainder return the previous value. */
typedef enum {
    NVSHMEMI_AMO_ACK = 1,
    NVSHMEMI_AMO_INC = 2,
    NVSHMEMI_AMO_SET = 3,
    NVSHMEMI_AMO_ADD = 4,
    NVSHMEMI_AMO_AND = 5,
    NVSHMEMI_AMO_OR = 6,
    NVSHMEMI_AMO_XOR = 7,
    NVSHMEMI_AMO_SIGNAL = 8,
    NVSHMEMI_AMO_SIGNAL_SET = NVSHMEM_SIGNAL_SET,  // Note - NVSHMEM_SIGNAL_SET == 9
    NVSHMEMI_AMO_SIGNAL_ADD = NVSHMEM_SIGNAL_ADD,  // Note - NVSHMEM_SIGNAL_ADD == 10
    NVSHMEMI_AMO_END_OF_NONFETCH = 11,             // end of nonfetch atomics
    NVSHMEMI_AMO_FETCH = 12,
    NVSHMEMI_AMO_FETCH_INC = 13,
    NVSHMEMI_AMO_FETCH_ADD = 14,
    NVSHMEMI_AMO_FETCH_AND = 15,
    NVSHMEMI_AMO_FETCH_OR = 16,
    NVSHMEMI_AMO_FETCH_XOR = 17,
    NVSHMEMI_AMO_SWAP = 18,
    NVSHMEMI_AMO_COMPARE_SWAP = 19,
    /* Sentinel pins the enum's underlying type to at least int width. */
    NVSHMEMI_AMO_OP_SENTINEL = INT_MAX,
} nvshmemi_amo_t;

/* Data/flag pair used for fetch ("g") completions; both fields are written
 * by another agent, hence volatile. */
typedef struct {
    volatile uint64_t data;
    volatile uint64_t flag;
} g_elem_t;
/perftest/host/coll/sync_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | 15 | int main(int argc, char *argv[]) { 16 | int status = 0; 17 | int mype; 18 | size_t size = 1; 19 | struct timeval t_start, t_stop; 20 | float ms = 0; 21 | double latency_value; 22 | cudaEvent_t start_event, stop_event; 23 | cudaStream_t stream; 24 | 25 | read_args(argc, argv); 26 | 27 | init_wrapper(&argc, &argv); 28 | 29 | mype = nvshmem_my_pe(); 30 | #ifdef _NVSHMEM_DEBUG 31 | int npes = nvshmem_n_pes(); 32 | #endif 33 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 34 | CUDA_CHECK(cudaEventCreate(&start_event)); 35 | CUDA_CHECK(cudaEventCreate(&stop_event)); 36 | 37 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! 
\n", mype, npes); 38 | 39 | for (size_t iter = 0; iter < iters + warmup_iters; iter++) { 40 | if (iter == warmup_iters) CUDA_CHECK(cudaEventRecord(start_event, stream)); 41 | 42 | nvshmemx_team_sync_on_stream(NVSHMEM_TEAM_WORLD, stream); 43 | } 44 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 45 | CUDA_CHECK(cudaStreamSynchronize(stream)); 46 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 47 | 48 | if (!mype) { 49 | latency_value = (ms / iters) * 1000; 50 | print_table_basic("sync_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 51 | &latency_value, 1); 52 | } 53 | 54 | nvshmem_barrier_all(); 55 | 56 | CUDA_CHECK(cudaStreamDestroy(stream)); 57 | CUDA_CHECK(cudaEventDestroy(start_event)); 58 | CUDA_CHECK(cudaEventDestroy(stop_event)); 59 | 60 | finalize_wrapper(); 61 | 62 | return status; 63 | } 64 | -------------------------------------------------------------------------------- /perftest/host/coll/sync_all_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 
9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | int coll_max_iters = MAX_ITERS; 15 | 16 | int main(int c, char *v[]) { 17 | int status = 0; 18 | int mype; 19 | size_t size = 1; 20 | double latency_value; 21 | int iters = MAX_ITERS; 22 | int skip = MAX_SKIP; 23 | struct timeval t_start, t_stop; 24 | float ms = 0; 25 | cudaEvent_t start_event, stop_event; 26 | cudaStream_t stream; 27 | 28 | init_wrapper(&c, &v); 29 | 30 | mype = nvshmem_my_pe(); 31 | #ifdef _NVSHMEM_DEBUG 32 | int npes = nvshmem_n_pes(); 33 | #endif 34 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 35 | CUDA_CHECK(cudaEventCreate(&start_event)); 36 | CUDA_CHECK(cudaEventCreate(&stop_event)); 37 | 38 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! \n", mype, npes); 39 | 40 | for (iters = 0; iters < coll_max_iters + skip; iters++) { 41 | if (iters == skip) CUDA_CHECK(cudaEventRecord(start_event, stream)); 42 | 43 | nvshmemx_sync_all_on_stream(stream); 44 | } 45 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 46 | CUDA_CHECK(cudaStreamSynchronize(stream)); 47 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 48 | if (!mype) { 49 | latency_value = (ms / coll_max_iters) * 1000; 50 | print_table_basic("sync_all_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 51 | &latency_value, 1); 52 | } 53 | 54 | nvshmem_barrier_all(); 55 | 56 | CUDA_CHECK(cudaStreamDestroy(stream)); 57 | CUDA_CHECK(cudaEventDestroy(start_event)); 58 | CUDA_CHECK(cudaEventDestroy(stop_event)); 59 | 60 | finalize_wrapper(); 61 | 62 | return status; 63 | } 64 | -------------------------------------------------------------------------------- /test/device/query/hello-team.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 
#define N 1

/* Device-side error counter, copied back to the host after the kernel runs. */
__device__ int errors_d;

/* Each thread prints its world/node team identity and verifies that team
 * queries on NVSHMEM_TEAM_INVALID return -1, bumping errors_d otherwise. */
__global__ void hello_world(void) {
    int val;

    printf("Device - world PE %d of %d, node PE %d of %d\n", nvshmem_team_my_pe(NVSHMEM_TEAM_WORLD),
           nvshmem_team_n_pes(NVSHMEM_TEAM_WORLD), nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE),
           nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE));

    val = nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: device nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_d;
    }

    val = nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: device nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_d;
    }
}

/* Runs the same NVSHMEM_TEAM_INVALID checks on the host, launches the device
 * kernel, and returns the combined host + device error count as exit status. */
int main(int argc, char **argv) {
    int errors_h = 0;
    int val = 0;
    init_wrapper(&argc, &argv);

    nvshmem_barrier_all(); /* Ensure NVSHMEM device init has completed */
    cudaMemcpyToSymbol(errors_d, &val, sizeof(int), 0, cudaMemcpyHostToDevice);

    printf("Host - world PE %d of %d, node PE %d of %d\n", nvshmem_team_my_pe(NVSHMEM_TEAM_WORLD),
           nvshmem_team_n_pes(NVSHMEM_TEAM_WORLD), nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE),
           nvshmem_team_n_pes(NVSHMEMX_TEAM_NODE));

    val = nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: host nvshmem_team_my_pe(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_h;
    }

    val = nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID);

    if (val != -1) {
        printf("Error: host nvshmem_team_n_pes(NVSHMEM_TEAM_INVALID) = %d\n", val);
        ++errors_h;
    }

    hello_world<<<1, N>>>();

    /* cudaMemcpyFromSymbol on the default stream also serializes behind the
     * kernel launch above, so errors_d is final when read here. */
    cudaMemcpyFromSymbol(&val, errors_d, sizeof(int), 0, cudaMemcpyDeviceToHost);
    finalize_wrapper();
    return errors_h + val;
}
-------------------------------------------------------------------------------- /test/host/interop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(simplelib1 SHARED simplelib1.cu) 2 | add_library(simplelib2 SHARED simplelib2.cu) 3 | add_executable(app app.cu) 4 | add_executable(app_multi_init app_multi_init.cu) 5 | 6 | set_target_properties(simplelib1 simplelib2 PROPERTIES POSITION_INDEPENDENT_CODE ON) 7 | 8 | set_target_properties(app simplelib1 simplelib2 PROPERTIES INSTALL_RPATH "$ORIGIN/../../../../lib" BUILD_WITH_INSTALL_RPATH TRUE) 9 | target_link_options(simplelib1 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/simplelib1.sym") 10 | target_link_options(simplelib2 PRIVATE "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/simplelib2.sym") 11 | 12 | # Tile-granular APIs need C++ 17 13 | set(TEST_CXX_STANDARD 17) 14 | set(TEST_CXX_ARG c++17) 15 | set_target_properties(simplelib1 PROPERTIES 16 | CXX_STANDARD "${TEST_CXX_STANDARD}" 17 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 18 | ) 19 | set_target_properties(simplelib2 PROPERTIES 20 | CXX_STANDARD "${TEST_CXX_STANDARD}" 21 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 22 | ) 23 | set_target_properties(app PROPERTIES 24 | CXX_STANDARD "${TEST_CXX_STANDARD}" 25 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 26 | ) 27 | set_target_properties(app_multi_init PROPERTIES 28 | CXX_STANDARD "${TEST_CXX_STANDARD}" 29 | CUDA_STANDARD "${TEST_CXX_STANDARD}" 30 | ) 31 | 32 | target_include_directories(simplelib1 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 33 | target_include_directories(simplelib2 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 34 | target_include_directories(app PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 35 | target_include_directories(app_multi_init PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 36 | 37 | target_link_libraries(simplelib1 PRIVATE nvshmem_test_helper) 38 | target_link_libraries(simplelib2 PRIVATE nvshmem_test_helper) 39 | target_link_libraries(app PRIVATE simplelib1 
/* One "<prefix>[:<port>]" entry from a user-supplied interface list. */
struct netIf {
    char prefix[64];
    int port;
};

/* Parse a comma-separated list of "<prefix>[:<port>]" items (e.g.
 * "eth0:1,ib") into ifList, up to maxList entries.  Entries without an
 * explicit port get port == -1 (wildcard).  Returns the number of entries
 * parsed; a NULL string yields 0.
 *
 * fix: characters are now dropped once a prefix fills its 64-byte buffer —
 * previously a long prefix overflowed ifList[ifNum].prefix. */
static inline int parseStringList(const char* string, struct netIf* ifList, int maxList) {
    if (!string) return 0;

    const char* ptr = string;

    int ifNum = 0;
    int ifC = 0;
    char c;
    do {
        c = *ptr;
        if (c == ':') {
            /* End of a prefix with an explicit port. */
            if (ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port = atoi(ptr + 1);
                ifNum++;
                ifC = 0;
            }
            /* Skip the port digits up to the next separator. */
            while (c != ',' && c != '\0') c = *(++ptr);
        } else if (c == ',' || c == '\0') {
            /* End of a prefix without a port: -1 means "any port". */
            if (ifC > 0) {
                ifList[ifNum].prefix[ifC] = '\0';
                ifList[ifNum].port = -1;
                ifNum++;
                ifC = 0;
            }
        } else if (ifC < (int)sizeof(ifList[ifNum].prefix) - 1) {
            /* Accumulate prefix characters, leaving room for the terminator;
             * overly long prefixes are silently truncated. */
            ifList[ifNum].prefix[ifC] = c;
            ifC++;
        }
        ptr++;
    } while (ifNum < maxList && c);
    return ifNum;
}
/* Compare an interface name against a reference prefix.
 * In exact mode the comparison also covers the terminating '\0', so the
 * two strings must be equal in full; otherwise only the reference has to
 * be a prefix of the name. */
static bool matchIf(const char* string, const char* ref, bool matchExact) {
  size_t cmpLen;
  if (matchExact) {
    cmpLen = strlen(string) + 1; /* include '\0' to force full equality */
  } else {
    cmpLen = strlen(ref);
  }
  return (strncmp(string, ref, cmpLen) == 0);
}

/* Two ports match when they are equal, or when either side is the
 * wildcard value -1. */
static bool matchPort(const int port1, const int port2) {
  const bool wildcard = (port1 == -1) || (port2 == -1);
  return wildcard || (port1 == port2);
}
lsize); 61 | if (!mype) DEBUG_PRINT("ptr: %p size: %zuB; ", (void *)buffer[i], lsize); 62 | } 63 | if (!mype) DEBUG_PRINT("\n \n"); 64 | 65 | if (!mype) DEBUG_PRINT("[%d of %d] freeing all buffers: ", r, repeat); 66 | 67 | for (int i = 0; i < iter; i++) { 68 | if (!mype) DEBUG_PRINT("ptr: %p; ", (void *)buffer[i]); 69 | nvshmem_free(buffer[i]); 70 | } 71 | if (!mype) DEBUG_PRINT("\n \n"); 72 | 73 | if (!mype) DEBUG_PRINT("[iter %d of %d] end of iter \n \n", r, repeat); 74 | } 75 | 76 | free(buffer); 77 | 78 | finalize_wrapper(); 79 | 80 | out: 81 | return status; 82 | } 83 | -------------------------------------------------------------------------------- /perftest/host/coll/barrier_on_stream.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2018-2025, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * NVIDIA CORPORATION and its licensors retain all intellectual property 5 | * and proprietary rights in and to this software, related documentation 6 | * and any modifications thereto. Any use, reproduction, disclosure or 7 | * distribution of this software and related documentation without an express 8 | * license agreement from NVIDIA CORPORATION is strictly prohibited. 9 | * 10 | * See License.txt for license information 11 | */ 12 | 13 | #include "coll_test.h" 14 | 15 | int main(int argc, char *argv[]) { 16 | int status = 0; 17 | int mype; 18 | size_t size = 1; 19 | 20 | read_args(argc, argv); 21 | float ms; 22 | double latency_value; 23 | cudaStream_t stream; 24 | cudaEvent_t start_event, stop_event; 25 | 26 | init_wrapper(&argc, &argv); 27 | 28 | mype = nvshmem_my_pe(); 29 | #ifdef _NVSHMEM_DEBUG 30 | int npes = nvshmem_n_pes(); 31 | #endif 32 | CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); 33 | CUDA_CHECK(cudaEventCreate(&start_event)); 34 | CUDA_CHECK(cudaEventCreate(&stop_event)); 35 | 36 | DEBUG_PRINT("SHMEM: [%d of %d] hello shmem world! 
\n", mype, npes); 37 | 38 | for (size_t iter = 0; iter < warmup_iters; iter++) { 39 | nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, stream); 40 | } 41 | CUDA_CHECK(cudaStreamSynchronize(stream)); 42 | nvshmem_barrier_all(); 43 | 44 | CUDA_CHECK(cudaEventRecord(start_event, stream)); 45 | for (size_t iter = 0; iter < iters; iter++) { 46 | nvshmemx_barrier_on_stream(NVSHMEM_TEAM_WORLD, stream); 47 | } 48 | CUDA_CHECK(cudaEventRecord(stop_event, stream)); 49 | CUDA_CHECK(cudaStreamSynchronize(stream)); 50 | CUDA_CHECK(cudaEventElapsedTime(&ms, start_event, stop_event)); 51 | 52 | if (!mype) { 53 | latency_value = (ms / iters) * 1000; 54 | print_table_basic("barrier_on_stream", "None", "size (Bytes)", "latency", "us", '-', &size, 55 | &latency_value, 1); 56 | } 57 | 58 | nvshmem_barrier_all(); 59 | 60 | CUDA_CHECK(cudaStreamDestroy(stream)); 61 | CUDA_CHECK(cudaEventDestroy(start_event)); 62 | CUDA_CHECK(cudaEventDestroy(stop_event)); 63 | 64 | finalize_wrapper(); 65 | 66 | return status; 67 | } 68 | -------------------------------------------------------------------------------- /src/modules/transport/common/mlx5_prm.h: -------------------------------------------------------------------------------- 1 | /* SPDX-License-Identifier: BSD-3-Clause 2 | * Copyright 2016 6WIND S.A. 
 * Copyright 2016 Mellanox Technologies, Ltd
 */

#ifndef RTE_PMD_MLX5_PRM_H_
#define RTE_PMD_MLX5_PRM_H_

#include <stdint.h>

/* mlx5 "ifc" layout convention: every field below is declared as an array
 * of u8 whose LENGTH is the field width in BITS, so a struct spells out
 * the exact bit layout of a firmware command or context object. Do not
 * reorder or resize fields. */
#define u8 uint8_t

/* Device pages are 4KB (1 << 12). */
#define MLX5_ADAPTER_PAGE_SHIFT 12

/* Encodings for the cqe_sz (CQE size) field of the CQ context. */
enum {
    MLX5_CQE_SIZE_64B = 0x0,
    MLX5_CQE_SIZE_128B = 0x1,
};

/* Completion Queue context bit layout, embedded in the CREATE_CQ input.
 * "reserved_at_<hex>" fields pad from the bit offset named in hex (e.g.
 * reserved_at_40 begins at bit 0x40). Field semantics follow the device
 * PRM; only widths/offsets are expressed here. */
struct mlx5_ifc_cqc_bits {
    u8 status[0x4];
    u8 as_notify[0x1];
    u8 initiator_src_dct[0x1];
    u8 dbr_umem_valid[0x1];
    u8 reserved_at_7[0x1];
    u8 cqe_sz[0x3]; /* one of MLX5_CQE_SIZE_* above */
    u8 cc[0x1];
    u8 reserved_at_c[0x1];
    u8 scqe_break_moderation_en[0x1];
    u8 oi[0x1];
    u8 cq_period_mode[0x2];
    u8 cqe_comp_en[0x1];
    u8 mini_cqe_res_format[0x2];
    u8 st[0x4];
    u8 reserved_at_18[0x1];
    u8 cqe_comp_layout[0x7];
    u8 dbr_umem_id[0x20];
    u8 reserved_at_40[0x14];
    u8 page_offset[0x6];
    u8 reserved_at_5a[0x2];
    u8 mini_cqe_res_format_ext[0x2];
    u8 cq_timestamp_format[0x2];
    u8 reserved_at_60[0x3];
    u8 log_cq_size[0x5];
    u8 uar_page[0x18];
    u8 reserved_at_80[0x4];
    u8 cq_period[0xc];
    u8 cq_max_count[0x10];
    u8 reserved_at_a0[0x18];
    u8 c_eqn[0x8];
    u8 reserved_at_c0[0x3];
    u8 log_page_size[0x5];
    u8 reserved_at_c8[0x18];
    u8 reserved_at_e0[0x20];
    u8 reserved_at_100[0x8];
    u8 last_notified_index[0x18];
    u8 reserved_at_120[0x8];
    u8 last_solicit_index[0x18];
    u8 reserved_at_140[0x8];
    u8 consumer_counter[0x18];
    u8 reserved_at_160[0x8];
    u8 producer_counter[0x18];
    u8 local_partition_id[0xc];
    u8 process_id[0x14];
    u8 reserved_at_1A0[0x20];
    u8 dbr_addr[0x40];
};

/* Input layout of the CREATE_CQ command: a common command header
 * (opcode/uid/op_mod), the CQ context above, umem parameters, and a
 * trailing flexible array 'pas' -- presumably the page address entries of
 * the CQ buffer (TODO(review): confirm against the PRM). */
struct mlx5_ifc_create_cq_in_bits {
    u8 opcode[0x10];
    u8 uid[0x10];
    u8 reserved_at_20[0x10];
    u8 op_mod[0x10];
    u8 reserved_at_40[0x40];
    struct mlx5_ifc_cqc_bits cq_context;
    u8 cq_umem_offset[0x40];
    u8 cq_umem_id[0x20];
    u8 cq_umem_valid[0x1];
    u8 reserved_at_2e1[0x1f];
    u8 reserved_at_300[0x580];
    u8 pas[];
};

#endif /* RTE_PMD_MLX5_PRM_H_ */