├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bcl ├── backends │ ├── experimental │ │ └── nvshmem │ │ │ ├── allocator.hpp │ │ │ ├── atomics.hpp │ │ │ ├── backend.hpp │ │ │ ├── comm.hpp │ │ │ ├── gpu_side_allocator.hpp │ │ │ ├── malloc.hpp │ │ │ └── ptr.hpp │ ├── gasnet-ex │ │ ├── atomics.hpp │ │ ├── backend.hpp │ │ ├── comm.hpp │ │ ├── detail │ │ │ └── gasnet_ad.hpp │ │ ├── ops.hpp │ │ └── request.hpp │ ├── mpi │ │ ├── alloc.hpp │ │ ├── async_allocator.hpp │ │ ├── atomics.hpp │ │ ├── backend.hpp │ │ ├── comm.hpp │ │ ├── mpi_types.hpp │ │ ├── ops.hpp │ │ ├── request.hpp │ │ └── team_conv.hpp │ ├── shmem │ │ ├── alloc.hpp │ │ ├── atomics.hpp │ │ ├── backend.hpp │ │ ├── comm.hpp │ │ ├── detail │ │ │ └── compare_and_swap_impl.hpp │ │ ├── ops.hpp │ │ └── request.hpp │ └── upcxx │ │ ├── backend.hpp │ │ └── comm.hpp ├── bcl.hpp ├── containers │ ├── Array.hpp │ ├── BloomFilter.hpp │ ├── CachedCopy.hpp │ ├── CircularQueue.hpp │ ├── Container.hpp │ ├── DArray.hpp │ ├── DMatrix.hpp │ ├── FastQueue.hpp │ ├── HashMap.hpp │ ├── HashMap │ │ ├── HashMapAL.hpp │ │ ├── HashMapEntry.hpp │ │ ├── HashMapFuture.hpp │ │ └── HashMapIterators.hpp │ ├── HashMapBuffer.hpp │ ├── ManyToManyDistributor.hpp │ ├── SPMatrix.hpp │ ├── algorithms │ │ ├── ca_gemm.hpp │ │ ├── cblas_wrapper.hpp │ │ ├── experimental_gemm.hpp │ │ ├── gemm.hpp │ │ └── spgemm.hpp │ ├── detail │ │ ├── Blocking.hpp │ │ ├── index.hpp │ │ └── mkl │ │ │ ├── mkl_error_handle.hpp │ │ │ ├── mkl_sparse_wrapper.hpp │ │ │ └── spmatrix.hpp │ ├── experimental │ │ ├── ChecksumQueue.hpp │ │ ├── ListQueue.hpp │ │ ├── SafeChecksumQueue.hpp │ │ ├── SlabQueue.hpp │ │ ├── cuda │ │ │ ├── CudaMatrix.hpp │ │ │ ├── CudaSPMatrix.hpp │ │ │ ├── DArray.hpp │ │ │ ├── DHashMap.cuh │ │ │ ├── DHashMap.hpp │ │ │ ├── DuplQueue.hpp │ │ │ ├── FastQueue.hpp │ │ │ ├── algorithms │ │ │ │ ├── algorithm.hpp │ │ │ │ ├── spgemm.hpp │ │ │ │ └── spmm.hpp │ │ │ ├── launch_kernel.cuh │ │ │ ├── sequential │ │ │ │ ├── CudaCSRMatrix.hpp │ │ │ │ ├── 
cusparse_error.cuh │ │ │ │ ├── cusparse_util.cuh │ │ │ │ └── device_vector.cuh │ │ │ └── util │ │ │ │ ├── comm.cuh │ │ │ │ ├── cuda_future.hpp │ │ │ │ ├── error.cuh │ │ │ │ ├── hash.cuh │ │ │ │ └── matrix_indexing.hpp │ │ ├── rpc.hpp │ │ └── rpc_checksum.hpp │ └── sequential │ │ ├── CSRMatrix.hpp │ │ ├── CSRMatrixMemoryMapped.hpp │ │ ├── SimpleHash.hpp │ │ ├── SparseAccumulator.hpp │ │ ├── SparseHashAccumulator.hpp │ │ ├── matrix_io.hpp │ │ └── vector.hpp └── core │ ├── GlobalPtr.hpp │ ├── GlobalRef.hpp │ ├── alloc.hpp │ ├── comm.hpp │ ├── detail │ ├── detail.hpp │ ├── hash_functions.hpp │ └── optional.hpp │ ├── except.hpp │ ├── future.hpp │ ├── malloc.hpp │ ├── teams.hpp │ ├── util.hpp │ └── util │ └── Backoff.hpp ├── cmake ├── FindGASNET_EX.cmake ├── FindSHMEM.cmake ├── FindUPCXX.cmake └── bclConfig.cmake ├── docs ├── Doxyfile.in ├── Makefile ├── architecture │ └── architecture.txt ├── backends │ └── backend.txt ├── conf.py ├── containers │ └── CircularQueue.txt ├── core │ ├── atomics.txt │ └── comm.txt ├── index.rst └── requirements.txt ├── examples ├── CMakeLists.txt ├── benchmarks │ └── mer-bench │ │ ├── Makefile │ │ ├── all-to-all-async.cpp │ │ ├── all-to-all.cpp │ │ ├── atomics-latency.cpp │ │ ├── experimental │ │ ├── Makefile │ │ ├── put_bench.cpp │ │ ├── put_harness.cpp │ │ ├── put_upcxx.cpp │ │ └── queue_impls.hpp │ │ └── irregular-lookup.cpp ├── experimental │ ├── Makefile │ ├── array │ │ ├── Makefile │ │ ├── dense_matmul.cpp │ │ ├── sp_sp_matrix.cpp │ │ ├── sp_sp_matrix_timing.cpp │ │ ├── workstealing_spmm.cpp │ │ └── workstealing_subk_spmm.cpp │ ├── genome-assembly │ │ └── contig-gen │ │ │ ├── Makefile │ │ │ ├── cg_267.cpp │ │ │ ├── hash_funcs.h │ │ │ ├── hash_funcs.hpp │ │ │ ├── kmer_t.hpp │ │ │ ├── packing.hpp │ │ │ └── read_kmers.hpp │ ├── nvshmem │ │ ├── Makefile │ │ ├── bcl_memcpy.cu │ │ ├── darray_test.cu │ │ ├── device_vector_test.cu │ │ ├── hashtable │ │ │ ├── Makefile │ │ │ ├── hashtable.cu │ │ │ └── hashtable2.cu │ │ ├── matrix │ │ │ ├── 
Makefile │ │ │ ├── a_owns_test.cu │ │ │ ├── a_owns_ws.cu │ │ │ ├── analyze_cf.cu │ │ │ ├── analyze_matrix.cpp │ │ │ ├── b_owns_test.cu │ │ │ ├── cusp_util.hpp │ │ │ ├── gespmm_util.hpp │ │ │ ├── grb_util.hpp │ │ │ ├── matrix_test.cu │ │ │ ├── nsparse_util.hpp │ │ │ ├── sparse_test.cu │ │ │ ├── sparse_test_cusp.cu │ │ │ ├── spgemm_aowns.cu │ │ │ ├── spgemm_aowns_ws.cu │ │ │ ├── spgemm_mpi.cu │ │ │ ├── spmm_mpi.cu │ │ │ ├── spmm_test.cu │ │ │ ├── spmm_test_gspmm.cu │ │ │ └── test_gspmm.cu │ │ ├── merbench │ │ │ ├── Makefile │ │ │ ├── irregular-lookup-kernel-block.cu │ │ │ ├── irregular-lookup-kernel-warp.cu │ │ │ ├── irregular-lookup-kernel.cu │ │ │ └── irregular-lookup.cu │ │ └── queue │ │ │ ├── Makefile │ │ │ ├── duplqueue.cu │ │ │ ├── duplqueue_multiqueue.cu │ │ │ ├── duplqueue_pushpop.cu │ │ │ ├── duplqueue_warp.cu │ │ │ ├── fastqueue-warp.cu │ │ │ └── fastqueue.cu │ └── test_rpc.cpp ├── fastqueue │ ├── Makefile │ ├── bucket_count.hpp │ ├── isx_benchmark.cpp │ ├── single_insert.cpp │ └── vector_insert.cpp ├── hashmap │ ├── CMakeLists.txt │ ├── Makefile │ ├── buffered_inserts.cpp │ └── insert_find.cpp ├── matrix │ ├── Makefile │ ├── gen_mat.cpp │ ├── generate_spmat.hpp │ ├── matrix_basic.cpp │ ├── matrix_basic_sparse.cpp │ ├── matrix_getrow.cpp │ ├── matrix_getrow_async.cpp │ ├── matrix_getrow_futar.cpp │ ├── matrix_row.cpp │ ├── rw_gemm.cpp │ └── spmm.cpp ├── ranges │ ├── Makefile │ └── remote_span.cpp ├── rpc │ ├── Makefile │ ├── buffered_rpc_checksum.cpp │ └── buffered_rpc_circular.cpp └── simple │ ├── CMakeLists.txt │ ├── Makefile │ ├── global_ptr.cpp │ └── hello_world.cpp └── tests ├── atomics ├── Makefile ├── compare_and_swap.cpp └── fetch_and_add.cpp ├── comm ├── Makefile ├── arget01.cpp ├── arput01.cpp ├── eventually_visible01.cpp ├── eventually_visible02.cpp ├── eventually_visible03.cpp └── rput01.cpp ├── containers ├── ChecksumQueue │ ├── ChecksumQueue01.cpp │ ├── ChecksumQueue02.cpp │ ├── ChecksumQueue03.cpp │ ├── ChecksumQueue04.cpp │ ├── Makefile │ 
└── SafeChecksumQueue01.cpp ├── CircularQueue │ ├── CircularQueue01.cpp │ ├── CircularQueue02.cpp │ ├── CircularQueue03.cpp │ ├── CircularQueue04.cpp │ ├── Makefile │ ├── run_many_times.sh │ └── simplify_mpi_issue.cpp ├── HashMap │ ├── Makefile │ ├── find_atomic.cpp │ └── insert_find.cpp └── experimental │ └── rpc │ ├── Makefile │ ├── rpc1.cpp │ └── rpc_checksum1.cpp └── run_tests.sh /.travis.yml: -------------------------------------------------------------------------------- 1 | install: 2 | 3 | language: c++ 4 | 5 | services: 6 | - docker 7 | 8 | compiler: 9 | - gcc 10 | 11 | matrix: 12 | include: 13 | - os: linux 14 | addons: 15 | apt: 16 | sources: 17 | - ubuntu-toolchain-r-test 18 | packages: 19 | - g++-8 20 | - libopenmpi-dev 21 | - openmpi-bin 22 | - wget 23 | 24 | before_install: 25 | - docker pull benbrock/bcl:debug 26 | - eval "${MATRIX_EVAL}" 27 | 28 | script: 29 | - docker run --name container --shm-size=2048M --user ubuntu -d benbrock/bcl:mpich-debug tail -f /dev/null # keep container alive 30 | - docker cp `pwd` container:/home/ubuntu/src # copy new changes 31 | - docker exec --user root container bash -c "chown -R ubuntu:ubuntu /home/ubuntu/src" 32 | - docker exec --user ubuntu container bash -c "cd ./src/bcl/tests/ && ./run_tests.sh mpi shmem gasnet_ex" # run the test script 33 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | cmake_minimum_required(VERSION 3.10) 6 | 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | project(bcl LANGUAGES CXX) 12 | 13 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) 14 | 15 | enable_testing() 16 | 17 | set(bcl_DIR ${CMAKE_CURRENT_LIST_DIR}/cmake) 18 | find_package(bcl REQUIRED) 19 | 20 | 
add_subdirectory(examples) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Benjamin Brock 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /bcl/backends/experimental/nvshmem/atomics.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | namespace cuda { 9 | 10 | /* 11 | TODO: Change to new atomics API 12 | __device__ int compare_and_swap(BCL::cuda::ptr ptr, int old_value, int new_value) { 13 | return nvshmem_int_cswap(ptr.rptr(), old_value, new_value, ptr.rank_); 14 | } 15 | 16 | __device__ int fetch_and_add(BCL::cuda::ptr ptr, int value) { 17 | return nvshmem_int_fadd(ptr.rptr(), value, ptr.rank_); 18 | } 19 | */ 20 | 21 | } // end cuda 22 | } // end BCL 23 | -------------------------------------------------------------------------------- /bcl/backends/experimental/nvshmem/gpu_side_allocator.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace emalloc { 10 | 11 | char* segment_ptr; 12 | size_t segment_size; 13 | 14 | __device__ char* d_segment_ptr; 15 | __device__ size_t d_segment_size; 16 | __device__ unsigned long long* d_heap_ptr; 17 | 18 | __global__ void set_device_vars(char* segptr, size_t size, unsigned long long* hptr) { 19 | d_segment_ptr = segptr; 20 | d_segment_size = size; 21 | d_heap_ptr = hptr; 22 | *d_heap_ptr = 0; 23 | } 24 | 25 | void init_emalloc(void* ptr, size_t n_bytes) { 26 | segment_ptr = (char *) ptr; 27 | segment_size = n_bytes; 28 | 29 | unsigned long long* hptr; 30 | cudaMalloc((void **) &hptr, sizeof(unsigned long long)); 31 | 32 | set_device_vars<<<1, 1>>>(segment_ptr, segment_size, hptr); 33 | cudaDeviceSynchronize(); 34 | } 35 | 36 | __device__ void* emalloc(size_t size) { 37 | unsigned long long value = 
atomicAdd(d_heap_ptr, (unsigned long long) size); 38 | if (value + size <= d_segment_size) { 39 | return d_segment_ptr + value; 40 | } else { 41 | return nullptr; 42 | } 43 | } 44 | 45 | // Do nothing... 46 | __device__ void efree(void* ptr) { 47 | } 48 | 49 | } // end emalloc 50 | namespace BCL { 51 | 52 | namespace cuda { 53 | 54 | BCL::cuda::ptr gpu_side_segment; 55 | 56 | void init_gpu_side_allocator(size_t size) { 57 | gpu_side_segment = BCL::cuda::alloc(size); 58 | emalloc::init_emalloc(gpu_side_segment.local(), size); 59 | } 60 | 61 | void finalize_gpu_side_allocator() { 62 | BCL::cuda::dealloc(gpu_side_segment); 63 | } 64 | 65 | } // end cuda 66 | 67 | } // end BCL 68 | -------------------------------------------------------------------------------- /bcl/backends/gasnet-ex/atomics.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | template 10 | T compare_and_swap(BCL::GlobalPtr ptr, T old_val, T new_val) { 11 | static_assert(std::is_same::value); 12 | void* dst_ptr = gasnet_resolve_address(ptr); 13 | int32_t rv; 14 | gex_Event_t event = gex_AD_OpNB_I32(ad_i32, &rv, ptr.rank, dst_ptr, 15 | GEX_OP_FCAS, old_val, new_val, 0); 16 | 17 | gex_Event_Wait(event); 18 | return rv; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /bcl/backends/gasnet-ex/backend.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include "detail/gasnet_ad.hpp" 9 | #include "comm.hpp" 10 | 11 | namespace BCL { 12 | 13 | extern uint64_t shared_segment_size; 14 | extern void* smem_base_ptr; 15 | 16 | gex_Client_t client; 17 | gex_EP_t ep; 18 | gex_TM_t tm; 19 | const 
char* clientName = "BCL"; 20 | 21 | gasnet_seginfo_t* gasnet_seginfo; 22 | 23 | extern inline void init_malloc(); 24 | 25 | uint64_t my_rank; 26 | uint64_t my_nprocs; 27 | 28 | bool bcl_finalized; 29 | 30 | namespace backend { 31 | 32 | inline uint64_t rank() { 33 | return BCL::my_rank; 34 | } 35 | 36 | inline uint64_t nprocs() { 37 | return BCL::my_nprocs; 38 | } 39 | 40 | } // end backend 41 | 42 | inline void barrier() { 43 | gex_Event_t event = gex_Coll_BarrierNB(tm, 0); 44 | gex_Event_Wait(event); 45 | } 46 | 47 | inline void flush() {} 48 | 49 | template 50 | inline void* gasnet_resolve_address(const GlobalPtr ptr) { 51 | return reinterpret_cast(BCL::gasnet_seginfo[ptr.rank].addr) + ptr.ptr; 52 | } 53 | 54 | inline void init(uint64_t shared_segment_size = 256, bool thread_safe = false) { 55 | BCL::shared_segment_size = 1024*1024*shared_segment_size; 56 | 57 | gex_Client_Init(&client, &ep, &tm, clientName, NULL, NULL, 0); 58 | 59 | if (thread_safe) { 60 | #ifndef GASNET_PAR 61 | throw BCL::error("Need to use a par build of GASNet-EX"); 62 | #endif 63 | } 64 | 65 | gex_Segment_t segment; 66 | gex_Segment_Attach(&segment, tm, BCL::shared_segment_size); 67 | 68 | smem_base_ptr = gex_Segment_QueryAddr(segment); 69 | 70 | if (smem_base_ptr == NULL) { 71 | throw std::runtime_error("BCL: Could not allocate shared memory segment."); 72 | } 73 | 74 | my_rank = gex_System_QueryJobRank(); 75 | my_nprocs = gex_System_QueryJobSize(); 76 | 77 | gasnet_seginfo = (gasnet_seginfo_t*) malloc(sizeof(gasnet_seginfo_t) * nprocs()); 78 | gasnet_getSegmentInfo(gasnet_seginfo, BCL::nprocs()); 79 | 80 | init_malloc(); 81 | init_atomics(); 82 | 83 | bcl_finalized = false; 84 | 85 | BCL::barrier(); 86 | } 87 | 88 | inline void finalize() { 89 | BCL::barrier(); 90 | finalize_atomics(); 91 | free(gasnet_seginfo); 92 | bcl_finalized = true; 93 | } 94 | 95 | } // end BCL 96 | -------------------------------------------------------------------------------- 
/bcl/backends/gasnet-ex/detail/gasnet_ad.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | // NOTE: The GASNet AD Op wrapping code in this file is 10 | // directly borrowed from UPC++ `atomic.hpp`. 11 | 12 | namespace BCL { 13 | 14 | extern gex_TM_t tm; 15 | gex_AD_t ad_i32; 16 | gex_AD_t ad_f32; 17 | 18 | template 19 | gex_AD_t& get_gex_ad(); 20 | template <> 21 | gex_AD_t& get_gex_ad() { return ad_i32; } 22 | template <> 23 | gex_AD_t& get_gex_ad() { return ad_f32; } 24 | 25 | template 26 | constexpr gex_DT_t get_gex_dt(); 27 | template<> 28 | constexpr gex_DT_t get_gex_dt() { return GEX_DT_I32; } 29 | template<> 30 | constexpr gex_DT_t get_gex_dt() { return GEX_DT_U32; } 31 | template<> 32 | constexpr gex_DT_t get_gex_dt() { return GEX_DT_I64; } 33 | template<> 34 | constexpr gex_DT_t get_gex_dt() { return GEX_DT_U64; } 35 | template<> 36 | constexpr gex_DT_t get_gex_dt() { return GEX_DT_FLT; } 37 | 38 | template 39 | gex_Event_t shim_gex_AD_OpNB( 40 | gex_AD_t ad, T *p, size_t rank, void *addr, 41 | int op, T val1, T val2, int flags 42 | ); 43 | 44 | template<> 45 | gex_Event_t shim_gex_AD_OpNB( 46 | gex_AD_t ad, int32_t *p, size_t rank, void *addr, 47 | int op, int32_t val1, int32_t val2, int flags 48 | ) { 49 | return gex_AD_OpNB_I32(ad, p, rank, addr, op, val1, val2, flags); 50 | } 51 | 52 | template<> 53 | gex_Event_t shim_gex_AD_OpNB( 54 | gex_AD_t ad, uint32_t *p, size_t rank, void *addr, 55 | int op, uint32_t val1, uint32_t val2, int flags 56 | ) { 57 | return gex_AD_OpNB_U32(ad, p, rank, addr, op, val1, val2, flags); 58 | } 59 | 60 | template<> 61 | gex_Event_t shim_gex_AD_OpNB( 62 | gex_AD_t ad, int64_t *p, size_t rank, void *addr, 63 | int op, int64_t val1, int64_t val2, int flags 64 | ) { 65 | return gex_AD_OpNB_I64(ad, p, rank, addr, op, val1, val2, flags); 66 | } 67 
| 68 | template<> 69 | gex_Event_t shim_gex_AD_OpNB( 70 | gex_AD_t ad, uint64_t *p, size_t rank, void *addr, 71 | int op, uint64_t val1, uint64_t val2, int flags 72 | ) { 73 | return gex_AD_OpNB_U64(ad, p, rank, addr, op, val1, val2, flags); 74 | } 75 | 76 | template<> 77 | gex_Event_t shim_gex_AD_OpNB( 78 | gex_AD_t ad, float *p, size_t rank, void *addr, 79 | int op, float val1, float val2, int flags 80 | ) { 81 | return gex_AD_OpNB_FLT(ad, p, rank, addr, op, val1, val2, flags); 82 | } 83 | 84 | void init_atomics() { 85 | gex_OP_t ops = GEX_OP_FADD | GEX_OP_FCAS | GEX_OP_GET | GEX_OP_FXOR | GEX_OP_FOR | GEX_OP_FAND; 86 | gex_Flags_t flags = 0; 87 | gex_AD_Create(&ad_i32, tm, get_gex_dt(), ops, flags); 88 | gex_AD_Create(&ad_f32, tm, get_gex_dt(), GEX_OP_FADD, flags); 89 | } 90 | 91 | void finalize_atomics() { 92 | gex_AD_Destroy(ad_i32); 93 | gex_AD_Destroy(ad_f32); 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /bcl/backends/gasnet-ex/request.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include "backend.hpp" 12 | 13 | namespace BCL { 14 | 15 | class request { 16 | gex_Event_t request_ = GEX_EVENT_INVALID; 17 | public: 18 | request() = default; 19 | request(const request&) = default; 20 | 21 | request(const gex_Event_t& request) : request_(request) {} 22 | 23 | void wait() { 24 | if (request_ != GEX_EVENT_INVALID) { 25 | gex_Event_Wait(request_); 26 | request_ = GEX_EVENT_INVALID; 27 | } 28 | } 29 | 30 | bool check() { 31 | if (request_ == GEX_EVENT_INVALID) { 32 | return true; 33 | } else { 34 | int success = !gex_Event_Test(request_); 35 | 36 | if (success) { 37 | request_ = GEX_EVENT_INVALID; 38 | } else { 39 | gasnet_AMPoll(); 40 | 41 | success = !gex_Event_Test(request_); 42 | 43 | if 
(success) { 44 | request_ = GEX_EVENT_INVALID; 45 | } 46 | } 47 | 48 | return success; 49 | } 50 | } 51 | }; 52 | 53 | } 54 | 55 | #include 56 | -------------------------------------------------------------------------------- /bcl/backends/mpi/alloc.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include "backend.hpp" 10 | 11 | namespace BCL { 12 | } 13 | -------------------------------------------------------------------------------- /bcl/backends/mpi/async_allocator.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | template 12 | class mpi_allocator { 13 | public: 14 | using value_type = T; 15 | using size_type = std::size_t; 16 | using pointer = T*; 17 | using const_pointer = const T*; 18 | using reference = T&; 19 | using const_reference = const T&; 20 | 21 | mpi_allocator() = default; 22 | mpi_allocator(const mpi_allocator&) = default; 23 | 24 | pointer allocate(size_type n) { 25 | T* ptr; 26 | MPI_Alloc_mem(n*sizeof(value_type), MPI_INFO_NULL, &ptr); 27 | return ptr; 28 | } 29 | 30 | void deallocate(pointer ptr, size_type n = 0) { 31 | MPI_Free_mem(ptr); 32 | } 33 | 34 | template 35 | void construct(pointer ptr, Args&&... 
args) { 36 | new(ptr) T(std::forward(args)...); 37 | } 38 | 39 | void destroy(pointer ptr) { 40 | ptr->~T(); 41 | } 42 | 43 | bool operator==(const mpi_allocator&) const { 44 | return true; 45 | } 46 | 47 | bool operator!=(const mpi_allocator& other) const { 48 | return !operator==(other); 49 | } 50 | }; 51 | 52 | template 53 | using async_allocator = mpi_allocator; 54 | 55 | /* 56 | template 57 | using async_allocator = std::allocator; 58 | */ 59 | 60 | } 61 | -------------------------------------------------------------------------------- /bcl/backends/mpi/atomics.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include "mpi_types.hpp" 9 | 10 | namespace BCL { 11 | 12 | extern MPI_Win win; 13 | 14 | template 15 | inline T compare_and_swap(BCL::GlobalPtr ptr, T old_val, T new_val) { 16 | static_assert(std::is_integral::value, "BCL::compare_and_swap(): only integral types are supported"); 17 | T result; 18 | MPI_Datatype type = get_mpi_type(); 19 | int error_code = MPI_Compare_and_swap(&new_val, &old_val, &result, 20 | type, 21 | ptr.rank, ptr.ptr, 22 | BCL::win); 23 | 24 | BCL_DEBUG( 25 | if (error_code != MPI_SUCCESS) { 26 | throw debug_error("BCL compare_and_swap(): MPI_Compare_and_swap return error code " + std::to_string(error_code)); 27 | } 28 | ) 29 | 30 | error_code = MPI_Win_flush_local(ptr.rank, BCL::win); 31 | 32 | BCL_DEBUG( 33 | if (error_code != MPI_SUCCESS) { 34 | throw debug_error("BCL compare_and_swap(): MPI_Win_flush_local return error code " + std::to_string(error_code)); 35 | } 36 | ) 37 | 38 | return result; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /bcl/backends/mpi/mpi_types.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 
Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | // Helpers to get MPI datatype corresponding to a C++ type. 12 | // Defined in accordance with MPI 3.1 report, page 26. 13 | 14 | // XXX: these *could* all be static constexpr members, if 15 | // not for a bug in Spectrum MPI. 16 | 17 | template 18 | struct get_mpi_type_impl_; 19 | 20 | template <> 21 | struct get_mpi_type_impl_ { 22 | // static constexpr MPI_Datatype mpi_type = MPI_FLOAT; 23 | static MPI_Datatype mpi_type() { return MPI_FLOAT; } 24 | }; 25 | 26 | template <> 27 | struct get_mpi_type_impl_ { 28 | // static constexpr MPI_Datatype mpi_type = MPI_DOUBLE; 29 | static MPI_Datatype mpi_type() { return MPI_DOUBLE; } 30 | }; 31 | 32 | template <> 33 | struct get_mpi_type_impl_ { 34 | // static constexpr MPI_Datatype mpi_type = MPI_LONG_DOUBLE; 35 | static MPI_Datatype mpi_type() { return MPI_LONG_DOUBLE; } 36 | }; 37 | 38 | /* 39 | template <> 40 | struct get_mpi_type_impl_<_Bool> { 41 | static constexpr MPI_Datatype mpi_type = MPI_C_BOOL; 42 | }; 43 | */ 44 | 45 | template <> 46 | struct get_mpi_type_impl_ { 47 | // static constexpr MPI_Datatype mpi_type = MPI_INT8_T; 48 | static MPI_Datatype mpi_type() { return MPI_INT8_T; } 49 | }; 50 | 51 | template <> 52 | struct get_mpi_type_impl_ { 53 | // static constexpr MPI_Datatype mpi_type = MPI_INT16_T; 54 | static MPI_Datatype mpi_type() { return MPI_INT16_T; } 55 | }; 56 | 57 | template <> 58 | struct get_mpi_type_impl_ { 59 | // static constexpr MPI_Datatype mpi_type = MPI_INT32_T; 60 | static MPI_Datatype mpi_type() { return MPI_INT32_T; } 61 | }; 62 | 63 | template <> 64 | struct get_mpi_type_impl_ { 65 | // static constexpr MPI_Datatype mpi_type = MPI_INT64_T; 66 | static MPI_Datatype mpi_type() { return MPI_INT64_T; } 67 | }; 68 | 69 | template <> 70 | struct get_mpi_type_impl_ { 71 | // static constexpr MPI_Datatype mpi_type = MPI_UINT8_T; 72 | static MPI_Datatype 
mpi_type() { return MPI_UINT8_T; } 73 | }; 74 | 75 | template <> 76 | struct get_mpi_type_impl_ { 77 | // static constexpr MPI_Datatype mpi_type = MPI_UINT16_T; 78 | static MPI_Datatype mpi_type() { return MPI_UINT16_T; } 79 | }; 80 | 81 | template <> 82 | struct get_mpi_type_impl_ { 83 | // static constexpr MPI_Datatype mpi_type = MPI_UINT32_T; 84 | static MPI_Datatype mpi_type() { return MPI_UINT32_T; } 85 | }; 86 | 87 | template <> 88 | struct get_mpi_type_impl_ { 89 | // static constexpr MPI_Datatype mpi_type = MPI_UINT64_T; 90 | static MPI_Datatype mpi_type() { return MPI_UINT64_T; } 91 | }; 92 | 93 | template 94 | MPI_Datatype get_mpi_type() { 95 | // return get_mpi_type_impl_::mpi_type; 96 | return get_mpi_type_impl_::mpi_type(); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /bcl/backends/mpi/request.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include "backend.hpp" 9 | 10 | namespace BCL { 11 | 12 | class request { 13 | MPI_Request request_ = MPI_REQUEST_NULL; 14 | public: 15 | request() = default; 16 | 17 | request(const request&) = default; 18 | request& operator=(const request&) = default; 19 | 20 | request(request&&) = default; 21 | request& operator=(request&&) = default; 22 | 23 | request(const MPI_Request& request) : request_(request) {} 24 | 25 | void wait() { 26 | if (request_ == MPI_REQUEST_NULL) { 27 | return; 28 | // throw std::runtime_error("request: waiting on an expired request"); 29 | } 30 | MPI_Wait(&request_, MPI_STATUS_IGNORE); 31 | request_ = MPI_REQUEST_NULL; 32 | } 33 | 34 | bool check() { 35 | if (request_ == MPI_REQUEST_NULL) { 36 | return true; 37 | } 38 | int status; 39 | MPI_Test(const_cast(&request_), &status, MPI_STATUS_IGNORE); 40 | if (status) { 41 | request_ = MPI_REQUEST_NULL; 42 | 
} 43 | return status; 44 | } 45 | }; 46 | 47 | } 48 | 49 | #include 50 | -------------------------------------------------------------------------------- /bcl/backends/mpi/team_conv.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | namespace BCL { 13 | 14 | namespace backend { 15 | 16 | struct MPICommWrapper { 17 | MPI_Comm comm_ = MPI_COMM_NULL; 18 | 19 | MPICommWrapper(const MPICommWrapper&) = delete; 20 | MPICommWrapper() = default; 21 | 22 | MPICommWrapper(MPICommWrapper&& other) { 23 | comm_ = other.comm_; 24 | other.comm_ = MPI_COMM_NULL; 25 | } 26 | 27 | MPICommWrapper& operator=(MPICommWrapper&& other) { 28 | comm_ = other.comm_; 29 | other.comm_ = MPI_COMM_NULL; 30 | return *this; 31 | } 32 | 33 | MPICommWrapper(const BCL::Team& team) { 34 | std::vector ranks; 35 | ranks.reserve(team.nprocs()); 36 | 37 | for (size_t i = 0; i < team.nprocs(); i++) { 38 | ranks.push_back(team.to_world(i)); 39 | } 40 | 41 | MPI_Group world_group, group; 42 | int rv = MPI_Comm_group(MPI_COMM_WORLD, &world_group); 43 | assert(rv == MPI_SUCCESS); 44 | 45 | rv = MPI_Group_incl(world_group, ranks.size(), ranks.data(), &group); 46 | assert(rv == MPI_SUCCESS); 47 | 48 | rv = MPI_Comm_create(MPI_COMM_WORLD, group, &comm_); 49 | assert(rv == MPI_SUCCESS); 50 | 51 | MPI_Group_free(&world_group); 52 | MPI_Group_free(&group); 53 | } 54 | 55 | ~MPICommWrapper() { 56 | if (comm_ != MPI_COMM_NULL) { 57 | int finalized; 58 | MPI_Finalized(&finalized); 59 | if (!finalized) { 60 | MPI_Comm_free(&comm_); 61 | } 62 | } 63 | } 64 | 65 | MPI_Comm comm() const { 66 | return comm_; 67 | } 68 | }; 69 | 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /bcl/backends/shmem/alloc.hpp: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include "backend.hpp" 9 | 10 | namespace BCL { 11 | } 12 | -------------------------------------------------------------------------------- /bcl/backends/shmem/atomics.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | #include "detail/compare_and_swap_impl.hpp" 11 | 12 | namespace BCL { 13 | 14 | template 15 | T compare_and_swap(GlobalPtr ptr, T old_val, T new_val) { 16 | static_assert(std::is_integral::value, "BCL::compare_and_swap(): only integral types are supported"); 17 | return compare_and_swap_impl_::op(ptr, old_val, new_val); 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /bcl/backends/shmem/backend.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include 10 | 11 | #include "alloc.hpp" 12 | #include "comm.hpp" 13 | #include "ops.hpp" 14 | 15 | namespace BCL { 16 | 17 | extern uint64_t shared_segment_size; 18 | extern void *smem_base_ptr; 19 | 20 | extern inline void init_malloc(); 21 | 22 | bool we_initialized; 23 | bool bcl_finalized; 24 | 25 | uint64_t my_rank; 26 | uint64_t my_nprocs; 27 | 28 | namespace backend { 29 | 30 | inline uint64_t rank() { 31 | return BCL::my_rank; 32 | } 33 | 34 | inline uint64_t nprocs() { 35 | return BCL::my_nprocs; 36 | } 37 | 38 | } // end backend 39 | 40 | inline void barrier() { 41 | shmem_barrier_all(); 42 | } 43 | 44 | inline void flush() { 45 | shmem_quiet(); 
46 | } 47 | 48 | // Initializes the SHMEM backend: shared_segment_size is in MB, 49 | // and thread_safe requests SHMEM_THREAD_MULTIPLE support. 50 | inline void init(uint64_t shared_segment_size = 256, bool thread_safe = false) { 51 | BCL::shared_segment_size = 1024*1024*shared_segment_size; 52 | 53 | if (!thread_safe) { 54 | shmem_init(); 55 | } else { 56 | int provided; 57 | shmem_init_thread(SHMEM_THREAD_MULTIPLE, &provided); 58 | 59 | if (provided < SHMEM_THREAD_MULTIPLE) { 60 | throw BCL::error("BCL requested SHMEM_THREAD_MULTIPLE, but was deniced." 61 | "You need a thread-safe SHMEM implementation."); 62 | } 63 | } 64 | 65 | BCL::my_rank = shmem_my_pe(); 66 | BCL::my_nprocs = shmem_n_pes(); 67 | 68 | BCL::smem_base_ptr = shmem_malloc(BCL::shared_segment_size); 69 | 70 | if (BCL::smem_base_ptr == NULL) { 71 | throw std::runtime_error("BCL: Could not allocate shared memory segment."); 72 | } 73 | 74 | bcl_finalized = false; 75 | 76 | init_malloc(); 77 | 78 | BCL::barrier(); 79 | } 80 | 81 | inline void finalize() { 82 | BCL::barrier(); 83 | shmem_free(smem_base_ptr); 84 | shmem_finalize(); 85 | 86 | bcl_finalized = true; 87 | } 88 | 89 | } // end BCL 90 | -------------------------------------------------------------------------------- /bcl/backends/shmem/detail/compare_and_swap_impl.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | namespace BCL { 11 | 12 | template 13 | struct compare_and_swap_impl_; 14 | 15 | template <> 16 | struct compare_and_swap_impl_ { 17 | static int op(BCL::GlobalPtr ptr, int old_val, int new_val) { 18 | return shmem_int_atomic_compare_swap(ptr.rptr(), old_val, new_val, ptr.rank); 19 | } 20 | }; 21 | 22 | template <> 23 | struct compare_and_swap_impl_ { 24 | static long long op(BCL::GlobalPtr ptr, long long old_val, long long new_val) { 25 | return 
shmem_longlong_atomic_compare_swap(ptr.rptr(), old_val, new_val, ptr.rank); 26 | } 27 | }; 28 | 29 | template <> 30 | struct compare_and_swap_impl_ { 31 | static unsigned int op(BCL::GlobalPtr ptr, unsigned int old_val, unsigned int new_val) { 32 | return shmem_uint_atomic_compare_swap(ptr.rptr(), old_val, new_val, ptr.rank); 33 | } 34 | }; 35 | 36 | template <> 37 | struct compare_and_swap_impl_ { 38 | static unsigned long long op(BCL::GlobalPtr ptr, unsigned long long old_val, unsigned long long new_val) { 39 | return shmem_ulonglong_atomic_compare_swap(ptr.rptr(), old_val, new_val, ptr.rank); 40 | } 41 | }; 42 | 43 | } 44 | -------------------------------------------------------------------------------- /bcl/backends/shmem/request.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | namespace BCL { 11 | 12 | class request { 13 | public: 14 | request() = default; 15 | request(const request&) = default; 16 | 17 | void wait() { 18 | shmem_quiet(); 19 | } 20 | 21 | bool check() const { 22 | return true; 23 | } 24 | }; 25 | 26 | } 27 | 28 | #include 29 | -------------------------------------------------------------------------------- /bcl/backends/upcxx/backend.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include "comm.hpp" 9 | 10 | namespace BCL { 11 | extern uint64_t shared_segment_size; 12 | extern void* smem_base_ptr; 13 | 14 | extern void init_malloc(); 15 | 16 | uint64_t my_rank; 17 | uint64_t my_nprocs; 18 | 19 | bool bcl_finalized; 20 | 21 | uint64_t rank() { 22 | return BCL::my_rank; 23 | } 24 | 25 | uint64_t nprocs() { 26 | return BCL::my_nprocs; 27 | } 28 | 29 | void barrier() { 
30 | upcxx::barrier(); 31 | } 32 | 33 | template 34 | upcxx::global_ptr upcxx_resolve_address(GlobalPtr ptr) { 35 | return upcxx::global_ptr(upcxx::detail::internal_only(), 36 | ptr.rank, ptr.rptr()); 37 | } 38 | 39 | void init(uint64_t shared_segment_size = 256) { 40 | BCL::shared_segment_size = 1024*1024*shared_segment_size; 41 | 42 | // TODO: check if UPC++ is already initialized? 43 | upcxx::init(); 44 | 45 | upcxx::global_ptr ptr = upcxx::allocate(BCL::shared_segment_size); 46 | 47 | if (ptr == NULL) { 48 | throw std::runtime_error("BCL: Could not allocate shared memory segment."); 49 | } 50 | 51 | smem_base_ptr = ptr.local(); 52 | 53 | my_rank = upcxx::rank_me(); 54 | my_nprocs = upcxx::rank_n(); 55 | 56 | 57 | init_malloc(); 58 | 59 | bcl_finalized = false; 60 | 61 | BCL::barrier(); 62 | } 63 | 64 | void finalize() { 65 | BCL::barrier(); 66 | upcxx::deallocate(BCL::smem_base_ptr); 67 | upcxx::finalize(); 68 | bcl_finalized = true; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /bcl/backends/upcxx/comm.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include "backend.hpp" 8 | 9 | namespace BCL { 10 | 11 | extern void barrier(); 12 | extern uint64_t rank(); 13 | extern uint64_t nprocs(); 14 | 15 | template 16 | void read(const GlobalPtr& src, T* dst, size_t size) { 17 | upcxx::rget(upcxx_resolve_address(src), dst, size).wait(); 18 | } 19 | 20 | template 21 | void write(const T* src, const GlobalPtr& dst, size_t size) { 22 | upcxx::rput(src, upcxx_resolve_address(dst), size).wait(); 23 | } 24 | 25 | template 26 | T broadcast(T& val, uint64_t root) { 27 | return upcxx::broadcast(val, root).wait(); 28 | } 29 | 30 | template 31 | T allreduce(const T& val, Op fn) { 32 | return upcxx::allreduce(val, fn).wait(); 33 | } 34 | 35 | } 36 | 
-------------------------------------------------------------------------------- /bcl/bcl.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #ifdef BCL_BACKEND_SHMEM 14 | #include 15 | #elif BCL_BACKEND_GASNET_EX 16 | #include 17 | #elif BCL_BACKEND_UPCXX 18 | #include 19 | #else 20 | #ifndef BCL_BACKEND_MPI 21 | #define BCL_BACKEND_MPI 22 | #endif 23 | #include 24 | #endif 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | namespace BCL { 31 | // TODO: put these in a compilation unit. 32 | uint64_t shared_segment_size; 33 | void *smem_base_ptr; 34 | } 35 | -------------------------------------------------------------------------------- /bcl/containers/CachedCopy.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | template 12 | struct CachedCopy { 13 | 14 | CachedCopy() = default; 15 | CachedCopy(const CachedCopy&) = default; 16 | CachedCopy(CachedCopy&&) = default; 17 | 18 | CachedCopy& operator=(const CachedCopy& other) = default; 19 | 20 | CachedCopy(BCL::GlobalPtr ptr) : ptr_(ptr) { 21 | refresh(); 22 | } 23 | 24 | void refresh() { 25 | BCL::rget(ptr_, reinterpret_cast(buf_), 1); 26 | } 27 | 28 | T& operator*() { 29 | return *reinterpret_cast(buf_); 30 | } 31 | 32 | T* operator->() { 33 | return reinterpret_cast(buf_); 34 | } 35 | 36 | BCL::GlobalPtr ptr_ = nullptr; 37 | char buf_[sizeof(T)]; 38 | }; 39 | 40 | } 41 | -------------------------------------------------------------------------------- /bcl/containers/DArray.hpp: -------------------------------------------------------------------------------- 1 | // 
SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace BCL { 12 | 13 | namespace distribution { 14 | struct blocked { 15 | }; 16 | } 17 | 18 | template > 20 | class DArray { 21 | public: 22 | size_t my_size = 0; 23 | size_t local_size = 0; 24 | 25 | std::vector > data; 26 | 27 | size_t size() const noexcept { 28 | return my_size; 29 | } 30 | 31 | DArray(size_t size) : my_size(size) { 32 | local_size = (size + BCL::nprocs() - 1) / BCL::nprocs(); 33 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 34 | data.push_back(BCL::Array (rank, local_size)); 35 | } 36 | } 37 | 38 | ArrayRef operator[](size_t idx) { 39 | uint64_t node = idx / local_size; 40 | uint64_t node_slot = idx - node*local_size; 41 | return data[node][node_slot]; 42 | } 43 | }; 44 | } 45 | -------------------------------------------------------------------------------- /bcl/containers/HashMap/HashMapAL.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | struct HashMapAL { 10 | constexpr static int none = 0x0; 11 | constexpr static int find = (0x1 << 0); 12 | constexpr static int insert = (0x1 << 1); 13 | constexpr static int insert_find = insert | find; 14 | constexpr static int modify = insert_find; 15 | 16 | int val; 17 | 18 | HashMapAL(int val) : val(val) {} 19 | HashMapAL& operator=(const HashMapAL&) = default; 20 | 21 | operator int() const { 22 | return val; 23 | } 24 | 25 | HashMapAL& operator=(int val) { 26 | this->val = val; 27 | return *this; 28 | } 29 | 30 | bool operator==(int val) const { 31 | return this->val == val; 32 | } 33 | }; 34 | 35 | } 36 | -------------------------------------------------------------------------------- /bcl/containers/HashMap/HashMapEntry.hpp: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | namespace BCL { 11 | 12 | template < 13 | typename K, 14 | typename V, 15 | typename KeySerialize = BCL::serialize , 16 | typename ValSerialize = BCL::serialize 17 | > 18 | class HashMapEntry { 19 | public: 20 | BCL::Container key; 21 | BCL::Container val; 22 | int used = 0; 23 | 24 | HashMapEntry(const K &key, const V &val) { 25 | insert(key, val); 26 | } 27 | 28 | HashMapEntry() = default; 29 | HashMapEntry(const HashMapEntry& entry) = default; 30 | HashMapEntry(HashMapEntry&& entry) = default; 31 | HashMapEntry& operator=(const HashMapEntry&) = default; 32 | HashMapEntry& operator=(HashMapEntry&&) = default; 33 | 34 | void insert(const K &key, const V &val) { 35 | this->key.set(key); 36 | this->val.set(val); 37 | } 38 | 39 | K get_key() const { 40 | return key.get(); 41 | } 42 | 43 | V get_val() const { 44 | return val.get(); 45 | } 46 | 47 | void set_key(const K &key) { 48 | this->key.set(key); 49 | } 50 | 51 | void set_val(const V &val) { 52 | this->val.set(val); 53 | } 54 | }; 55 | 56 | } 57 | -------------------------------------------------------------------------------- /bcl/containers/HashMap/HashMapFuture.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | // TODO: eliminate success_ by making rv_ std::optional. 
11 | 12 | namespace BCL { 13 | 14 | template 15 | class HashMapFuture { 16 | public: 17 | using hash_type = std::remove_cv_t>; 18 | using key_type = typename hash_type::key_type; 19 | using mapped_type = typename hash_type::mapped_type; 20 | using HME = typename hash_type::HME; 21 | 22 | HashMapFuture(HashMapFuture&&) = default; 23 | HashMapFuture& operator=(HashMapFuture&&) = default; 24 | HashMapFuture(const HashMapFuture&) = delete; 25 | HashMapFuture& operator=(const HashMapFuture&) = delete; 26 | 27 | HashMapFuture(const key_type& key, H& hash_map) : key_(key), hash_map_(hash_map) { 28 | hash_ = hash_map_.hash_fn(key); 29 | 30 | uint64_t slot = (hash_ + hash_map_.get_probe(probe_++)) % hash_map_.size; 31 | entry_ = std::move(hash_map_.arget_entry(slot)); 32 | } 33 | 34 | template 35 | std::future_status wait_for(const std::chrono::duration& timeout_duration) { 36 | if (success_) { 37 | return std::future_status::ready; 38 | } 39 | if (entry_.wait_for(std::chrono::seconds(0)) != std::future_status::ready) { 40 | return std::future_status::timeout; 41 | } else { 42 | HME entry = entry_.get(); 43 | int status = entry.used; 44 | if (status == hash_map_.free_flag) { 45 | success_ = true; 46 | value_ = entry; 47 | return std::future_status::ready; 48 | } else { 49 | if (entry.get_key() == key_) { 50 | success_ = true; 51 | value_ = entry; 52 | return std::future_status::ready; 53 | } else { 54 | uint64_t slot = (hash_ + hash_map_.get_probe(probe_++)) % hash_map_.size; 55 | entry_ = std::move(hash_map_.arget_entry(slot)); 56 | return std::future_status::timeout; 57 | } 58 | } 59 | } 60 | } 61 | 62 | std::optional get() { 63 | // Poll the probe sequence; block on the in-flight entry fetch between polls. 64 | while (!success_ && wait_for(std::chrono::seconds(0)) != std::future_status::ready) { 65 | entry_.wait(); 66 | } 67 | 68 | if (value_.used == hash_map_.ready_flag && value_.get_key() == key_) { 69 | return value_.get_val(); 70 | } else { 71 | return {}; 72 | } 73 | } 74 | 75 | private: 76 | uint64_t hash_; 77 | uint64_t 
probe_ = 0; 78 | bool success_ = false; 79 | BCL::future entry_; 80 | HME value_; 81 | H& hash_map_; 82 | key_type key_; 83 | }; 84 | 85 | template 86 | using HMF = HashMapFuture; 87 | 88 | } // end BCL 89 | -------------------------------------------------------------------------------- /bcl/containers/algorithms/cblas_wrapper.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #ifdef USE_MKL 8 | #include 9 | #else 10 | #include 11 | #endif 12 | 13 | namespace BCL { 14 | namespace experimental { 15 | 16 | void cblas_gemm_wrapper_(const CBLAS_ORDER layout, 17 | const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb, 18 | const int m, const int n, const int k, 19 | const float alpha, const float* a, const int lda, 20 | const float* b, const int ldb, const float beta, 21 | float* c, const int ldc) { 22 | cblas_sgemm(layout, transa, transb, 23 | m, n, k, 24 | alpha, a, lda, 25 | b, ldb, beta, 26 | c, ldc); 27 | } 28 | 29 | void cblas_gemm_wrapper_(const CBLAS_ORDER layout, 30 | const CBLAS_TRANSPOSE transa, const CBLAS_TRANSPOSE transb, 31 | const int m, const int n, const int k, 32 | const double alpha, const double* a, const int lda, 33 | const double* b, const int ldb, const double beta, 34 | double* c, const int ldc) { 35 | cblas_dgemm(layout, transa, transb, 36 | m, n, k, 37 | alpha, a, lda, 38 | b, ldb, beta, 39 | c, ldc); 40 | } 41 | 42 | template 43 | std::vector cblas_test(const std::vector& a, const std::vector& b, 44 | size_t M, size_t N, size_t K) { 45 | std::vector c(M*N); 46 | cblas_gemm_wrapper_(CblasRowMajor, CblasNoTrans, CblasNoTrans, 47 | M, N, K, 48 | 1.0, a.data(), K, 49 | b.data(), N, 1.0, 50 | c.data(), N); 51 | return c; 52 | } 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /bcl/containers/detail/index.hpp: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL 8 | { 9 | 10 | template && !std::is_reference_v)> 12 | class index { 13 | public: 14 | using index_type = T; 15 | 16 | using first_type = T; 17 | using second_type = T; 18 | 19 | index_type operator[](index_type dim) const noexcept { 20 | if (dim == 0) { 21 | return first; 22 | } else { 23 | return second; 24 | } 25 | } 26 | 27 | template && 29 | std::numeric_limits::max() >= std::numeric_limits::max()) 30 | > 31 | operator index() const noexcept { 32 | return index(first, second); 33 | } 34 | 35 | index(index_type first, index_type second) : first(first), second(second) {} 36 | 37 | bool operator==(index other) const noexcept { 38 | return first == other.first && second == other.second; 39 | } 40 | 41 | index() = default; 42 | ~index() = default; 43 | index(const index&) = default; 44 | index& operator=(const index&) = default; 45 | index(index&&) = default; 46 | index& operator=(index&&) = default; 47 | 48 | index_type first; 49 | index_type second; 50 | }; 51 | 52 | } // end BCL 53 | 54 | namespace std { 55 | 56 | template 58 | size_t get(BCL::index idx) 59 | { 60 | return idx[I]; 61 | } 62 | 63 | } // end std 64 | -------------------------------------------------------------------------------- /bcl/containers/detail/mkl/mkl_error_handle.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | namespace BCL { 11 | 12 | void mkl_error_handle(sparse_status_t status, std::string lbl = "") { 13 | std::string prefix = "BCL: Internal MKL error: "; 14 | if (status == SPARSE_STATUS_SUCCESS) { 15 | } else if (status == SPARSE_STATUS_NOT_INITIALIZED) { 16 | 
throw std::runtime_error(prefix + "SPARSE_STATUS_NOT_INITIALIZED: [" + lbl + "]"); 17 | } else if (status == SPARSE_STATUS_ALLOC_FAILED) { 18 | throw std::runtime_error(prefix + "SPARSE_STATUS_ALLOC_FAILED: [" + lbl + "]"); 19 | } else if (status == SPARSE_STATUS_INVALID_VALUE) { 20 | throw std::runtime_error(prefix + "SPARSE_STATUS_INVALID_VALUE: [" + lbl + "]"); 21 | } else if (status == SPARSE_STATUS_EXECUTION_FAILED) { 22 | throw std::runtime_error(prefix + "SPARSE_STATUS_EXECUTION_FAILED: [" + lbl + "]"); 23 | } else if (status == SPARSE_STATUS_INTERNAL_ERROR) { 24 | throw std::runtime_error(prefix + "SPARSE_STATUS_INTERNAL_ERROR: [" + lbl + "]"); 25 | } else if (status == SPARSE_STATUS_NOT_SUPPORTED) { 26 | throw std::runtime_error(prefix + "SPARSE_STATUS_NOT_SUPPORTED: [" + lbl + "]"); 27 | } else { 28 | throw std::runtime_error(prefix + "Unrecognized MKL sparse_status_t"); 29 | } 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /bcl/containers/detail/mkl/mkl_sparse_wrapper.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | namespace mkl { 11 | 12 | sparse_status_t 13 | mkl_sparse_create_csr_wrapper_(sparse_matrix_t* A, 14 | sparse_index_base_t index, 15 | MKL_INT rows, MKL_INT cols, 16 | const MKL_INT* rows_start, const MKL_INT* rows_end, 17 | const MKL_INT* col_indx, const float* values) 18 | { 19 | MKL_INT* rows_start_ = const_cast(rows_start); 20 | MKL_INT* rows_end_ = const_cast(rows_end); 21 | MKL_INT* col_indx_ = const_cast(col_indx); 22 | float* values_ = const_cast(values); 23 | return mkl_sparse_s_create_csr(A, index, rows, cols, rows_start_, rows_end_, col_indx_, values_); 24 | } 25 | 26 | sparse_status_t 27 | mkl_sparse_add_wrapper_(sparse_operation_t operation, const sparse_matrix_t A, 28 
| float alpha, const sparse_matrix_t B, sparse_matrix_t* C) { 29 | return mkl_sparse_s_add(operation, A, alpha, B, C); 30 | } 31 | 32 | template 33 | struct mkl_sparse_set_value_wrapper_; 34 | 35 | template <> 36 | struct mkl_sparse_set_value_wrapper_ { 37 | sparse_status_t operator()(sparse_matrix_t A, 38 | MKL_INT row, MKL_INT col, 39 | const float& value) noexcept { 40 | return mkl_sparse_s_set_value(A, row, col, value); 41 | } 42 | }; 43 | 44 | template <> 45 | struct mkl_sparse_set_value_wrapper_ { 46 | sparse_status_t operator()(sparse_matrix_t A, 47 | MKL_INT row, MKL_INT col, 48 | const double& value) noexcept { 49 | return mkl_sparse_d_set_value(A, row, col, value); 50 | } 51 | }; 52 | 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/algorithms/algorithm.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "spgemm.hpp" 4 | #include "spmm.hpp" 5 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/launch_kernel.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | 10 | // This is just a convenience function, *launch*, to launch a kernel. 11 | 12 | namespace BCL { 13 | 14 | namespace cuda { 15 | 16 | struct LaunchInfo { 17 | size_t tid = 0; 18 | size_t ltid = 0; 19 | size_t gtid = 0; 20 | 21 | size_t extent = 0; 22 | size_t local_extent = 0; 23 | size_t global_extent = 0; 24 | 25 | __host__ __device__ operator size_t() const noexcept { 26 | return tid; 27 | } 28 | }; 29 | 30 | template 31 | __global__ void bcl_cuda_kernel_launch_impl_(size_t extent, Fn fn, Args... 
args) { 32 | size_t tid = threadIdx.x + blockIdx.x *blockDim.x; 33 | LaunchInfo info; 34 | info.tid = tid; 35 | info.ltid = tid; 36 | info.extent = extent; 37 | info.local_extent = extent; 38 | if (tid < extent) { 39 | fn(info, args...); 40 | } 41 | } 42 | 43 | template 44 | __global__ void bcl_cuda_global_kernel_launch_impl_(size_t host, size_t local_extent, size_t global_extent, Fn fn, Args... args) { 45 | size_t tid = threadIdx.x + blockIdx.x * blockDim.x; 46 | size_t gtid = host * local_extent + tid; 47 | 48 | LaunchInfo info; 49 | info.tid = gtid; 50 | info.ltid = tid; 51 | info.gtid = gtid; 52 | info.extent = global_extent; 53 | info.local_extent = local_extent; 54 | info.global_extent = global_extent; 55 | 56 | if (gtid < global_extent && tid < local_extent) { 57 | fn(info, args...); 58 | } 59 | } 60 | 61 | template 62 | void launch(size_t extent, Fn fn, Args&& ... args) { 63 | size_t block_size = std::min(std::size_t(1024), extent); 64 | size_t num_blocks = (extent + block_size - 1) / block_size; 65 | bcl_cuda_kernel_launch_impl_<<>>(extent, fn, args...); 66 | } 67 | 68 | template 69 | void global_launch(size_t extent, Fn fn, Args&& ... 
args) { 70 | size_t local_extent = (extent + BCL::nprocs() - 1) / BCL::nprocs(); 71 | 72 | size_t block_size = std::min(std::size_t(1024), local_extent); 73 | size_t num_blocks = (local_extent + block_size - 1) / block_size; 74 | 75 | bcl_cuda_global_kernel_launch_impl_<<>>(BCL::rank(), local_extent, extent, fn, args...); 76 | } 77 | 78 | } // end cuda 79 | } // end BCL 80 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/sequential/cusparse_error.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | namespace cuda { 12 | // Throws std::runtime_error naming the cuSPARSE status; does nothing on CUSPARSE_STATUS_SUCCESS. 13 | inline void throw_cusparse(cusparseStatus_t status) { 14 | if (status != CUSPARSE_STATUS_SUCCESS) { 15 | if (status == CUSPARSE_STATUS_INTERNAL_ERROR) { 16 | throw std::runtime_error("CUSPARSE_STATUS_INTERNAL_ERROR"); 17 | } else if (status == CUSPARSE_STATUS_INVALID_VALUE) { 18 | throw std::runtime_error("CUSPARSE_STATUS_INVALID_VALUE"); 19 | } else if (status == CUSPARSE_STATUS_ALLOC_FAILED) { 20 | throw std::runtime_error("CUSPARSE_STATUS_ALLOC_FAILED"); 21 | } else if (status == CUSPARSE_STATUS_NOT_INITIALIZED) { 22 | throw std::runtime_error("CUSPARSE_STATUS_NOT_INITIALIZED"); 23 | } else if (status == CUSPARSE_STATUS_ARCH_MISMATCH) { 24 | throw std::runtime_error("CUSPARSE_STATUS_ARCH_MISMATCH"); 25 | } else if (status == CUSPARSE_STATUS_EXECUTION_FAILED) { 26 | throw std::runtime_error("CUSPARSE_STATUS_EXECUTION_FAILED"); 27 | } else if (status == CUSPARSE_STATUS_MAPPING_ERROR) { 28 | throw std::runtime_error("CUSPARSE_STATUS_MAPPING_ERROR"); 29 | } else if (status == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED) { 30 | throw std::runtime_error("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"); 31 | } else if (status == CUSPARSE_STATUS_NOT_SUPPORTED) { 32 | throw 
std::runtime_error("CUSPARSE_STATUS_NOT_SUPPORTED"); 33 | } else if (status == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { 34 | throw std::runtime_error("CUSPARSE_STATUS_INSUFFICIENT_RESOURCES"); 35 | } 36 | throw std::runtime_error("Unknown CUSPARSE error"); 37 | } 38 | } 39 | 40 | } // end cuda 41 | 42 | } // end BCL -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/util/comm.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #ifndef COMMON 6 | #define COMMON 7 | 8 | #define TID (threadIdx.x+blockIdx.x*blockDim.x) 9 | #define LANE (threadIdx.x&31) 10 | #define WARPID ((threadIdx.x+blockIdx.x*blockDim.x)>>5) 11 | 12 | __device__ uint32_t flagMask0 = 3; 13 | __device__ uint32_t flagMask1 = 768; 14 | __device__ uint32_t flagMask2 = 196608; 15 | __device__ uint32_t flagMask3 = 50331648; 16 | __device__ uint32_t flagMask4 = 1; 17 | __device__ uint32_t flagMask5 = 65536; 18 | 19 | #define align_up(num, align) \ 20 | (((num) + ((align) - 1)) & ~((align) - 1)) 21 | 22 | #define align_down(num, align) \ 23 | ((num) & ~((align) - 1)) 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/util/cuda_future.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | namespace cuda { 10 | 11 | class cuda_request { 12 | public: 13 | cuda_request() = default; 14 | cuda_request(const cuda_request&) = default; 15 | 16 | void wait() { 17 | BCL::cuda::flush(); 18 | } 19 | 20 | bool check() const { 21 | return true; 22 | } 23 | }; 24 | 25 | template 26 | class cuda_thread_request { 27 | public: 28 | using thread_type = 
Thread; 29 | 30 | cuda_thread_request(thread_type&& thread) : thread_(std::move(thread)) {} 31 | 32 | void wait() { 33 | thread_.join(); 34 | BCL::cuda::flush(); 35 | } 36 | 37 | bool check() const { 38 | return true; 39 | } 40 | 41 | thread_type thread_; 42 | }; 43 | 44 | template 45 | class cuda_future { 46 | using request_type = Request; 47 | std::vector requests_; 48 | 49 | public: 50 | std::unique_ptr value_; 51 | 52 | cuda_future() : value_(new T()) {} 53 | 54 | /* 55 | cuda_future(T&& value, const request_type& request) 56 | : value_(new T(std::move(value))), requests_({request}) {} 57 | 58 | cuda_future(T&& value, const std::vector& requests) 59 | : value_(new T(std::move(value))), requests_(requests) {} 60 | 61 | cuda_future(T&& value, std::vector&& requests) 62 | : value_(new T(std::move(value))), requests_(std::move(requests)) {} 63 | 64 | cuda_future(const T& value, const request_type& request) 65 | : value_(new T(value)), requests_({request}) {} 66 | */ 67 | 68 | cuda_future(T&& value, request_type&& request) 69 | : value_(new T(std::move(value))) { 70 | requests_.push_back(std::move(request)); 71 | } 72 | 73 | /* 74 | void update(const request_type& request) { 75 | requests_.push_back(request); 76 | } 77 | */ 78 | 79 | void update(request_type&& request) { 80 | requests_.push_back(std::move(request)); 81 | } 82 | 83 | cuda_future(cuda_future&&) = default; 84 | cuda_future& operator=(cuda_future&&) = default; 85 | cuda_future(const cuda_future&) = delete; 86 | 87 | T get() { 88 | for (auto& request : requests_) { 89 | request.wait(); 90 | } 91 | return std::move(*value_); 92 | } 93 | 94 | void wait() { 95 | for (auto& request : requests_) { 96 | request.wait(); 97 | } 98 | } 99 | 100 | template 101 | std::future_status wait_for(const std::chrono::duration& timeout_duration) { 102 | for (auto& request : requests_) { 103 | if (!request.check()) { 104 | return std::future_status::timeout; 105 | } 106 | } 107 | return std::future_status::ready; 108 | } 109 
| }; 110 | 111 | } // end cuda 112 | 113 | } // end BCL 114 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/util/error.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #ifndef ERROR_CHECK 6 | #define ERROR_CHECK 7 | 8 | #define CUDA_CHECK(call) { \ 9 | cudaError_t err = call; \ 10 | if( cudaSuccess != err) { \ 11 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 12 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 13 | exit(EXIT_FAILURE); \ 14 | } \ 15 | } 16 | 17 | 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/util/hash.cuh: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | template 6 | struct MyHash{ 7 | __device__ __host__ uint32_t operator()(T key, uint32_t seed) 8 | { 9 | return key; 10 | } 11 | }; 12 | -------------------------------------------------------------------------------- /bcl/containers/experimental/cuda/util/matrix_indexing.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace BCL { 4 | 5 | namespace cuda { 6 | 7 | struct RowMajorIndexing { 8 | __host__ __device__ 9 | size_t index(size_t i, size_t j, size_t ld) { 10 | return i*ld + j; 11 | } 12 | 13 | __host__ __device__ 14 | size_t index(size_t i, size_t j, size_t m, size_t n) { 15 | return i*default_ld(m, n) + j; 16 | } 17 | 18 | __host__ __device__ 19 | size_t default_ld(size_t m, size_t n) { 20 | return n; 21 | } 22 | 23 | __host__ __device__ 24 | size_t size(size_t m, size_t n, size_t ld) { 25 | if (ld == 0) { 26 | ld = default_ld(m, n); 27 | } 28 | return m*ld; 29 | 
} 30 | }; 31 | 32 | struct ColumnMajorIndexing { 33 | __host__ __device__ 34 | size_t index(size_t i, size_t j, size_t ld) { 35 | return i + j*ld; 36 | } 37 | 38 | __host__ __device__ 39 | size_t index(size_t i, size_t j, size_t m, size_t n) { 40 | return i + j*default_ld(m, n); 41 | } 42 | 43 | __host__ __device__ 44 | size_t default_ld(size_t m, size_t n) { 45 | return m; 46 | } 47 | 48 | __host__ __device__ 49 | size_t size(size_t m, size_t n, size_t ld) { 50 | if (ld == 0) { 51 | ld = default_ld(m, n); 52 | } 53 | return ld*n; 54 | } 55 | }; 56 | 57 | } // end cuda 58 | 59 | } // end BCL -------------------------------------------------------------------------------- /bcl/containers/sequential/vector.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | // TODO: complete this 10 | 11 | template < 12 | typename T, 13 | typename Allocator = std::allocator 14 | > 15 | class vector { 16 | public: 17 | using allocator_type = Allocator; 18 | using size_type = typename Allocator::size_type; 19 | using pointer = typename Allocator::pointer; 20 | using const_pointer = typename Allocator::const_pointer; 21 | using reference = typename Allocator::reference; 22 | using const_reference = typename Allocator::const_reference; 23 | using difference_type = typename Allocator::difference_type; 24 | 25 | vector() = default; 26 | 27 | // TODO: implement 28 | vector(const vector& other) = delete; 29 | 30 | vector(vector&& other) : 31 | allocator_(std::move(other.allocator_)), 32 | ptr_(std::move(other.ptr_)), 33 | size_(std::move(other.size_)), 34 | capacity_(std::move(other.capacity_)) {} 35 | 36 | vector(size_type count) { 37 | ptr_ = allocator_.allocate(count); 38 | capacity_ = count; 39 | size_ = count; 40 | 41 | for (size_t i = 0; i < size(); i++) { 42 | allocator_.construct(ptr_ + i); 43 | } 
44 | } 45 | 46 | vector& operator=(vector&& other) { 47 | allocator_ = std::move(other.allocator_); 48 | ptr_ = std::move(other.ptr_); 49 | size_ = std::move(other.size_); 50 | capacity_ = std::move(other.capacity_); 51 | return *this; 52 | } 53 | 54 | void reserve(size_t new_cap) { 55 | if (new_cap > capacity()) { 56 | pointer new_ptr = allocator_.allocate(new_cap); 57 | capacity_ = new_cap; 58 | 59 | if (size() > 0) { 60 | for (size_t i = 0; i < size(); i++) { 61 | new_ptr[i] = ptr_[i]; 62 | } 63 | } 64 | std::swap(new_ptr, ptr_); 65 | if (new_ptr != nullptr) { 66 | allocator_.deallocate(new_ptr); 67 | } 68 | } 69 | } 70 | 71 | const_reference operator[](size_type idx) const { 72 | return ptr_[idx]; 73 | } 74 | 75 | reference operator[](size_type idx) { 76 | return ptr_[idx]; 77 | } 78 | 79 | pointer data() { 80 | return ptr_; 81 | } 82 | 83 | const_pointer data() const { 84 | return ptr_; 85 | } 86 | 87 | size_type size() const { 88 | return size_; 89 | } 90 | 91 | bool empty() const { 92 | return size() == 0; 93 | } 94 | 95 | size_type capacity() const { 96 | return capacity_; 97 | } 98 | 99 | private: 100 | allocator_type allocator_; 101 | pointer ptr_ = nullptr; 102 | size_type size_ = 0; 103 | size_type capacity_ = 0; 104 | }; 105 | 106 | } 107 | -------------------------------------------------------------------------------- /bcl/core/GlobalRef.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | template 12 | struct GlobalPtr; 13 | 14 | template 15 | extern inline std::remove_const_t rget(GlobalPtr src); 16 | 17 | template 18 | extern inline void rput(const T &src, GlobalPtr dst); 19 | 20 | template 21 | class GlobalRef { 22 | public: 23 | 24 | GlobalRef() = delete; 25 | ~GlobalRef() = default; 26 | GlobalRef(const GlobalRef&) = default; 27 | 
GlobalRef& operator=(const GlobalRef&) = default; 28 | GlobalRef(GlobalRef&&) = default; 29 | GlobalRef& operator=(GlobalRef&&) = default; 30 | 31 | using value_type = T; 32 | using pointer = GlobalPtr; 33 | using reference = GlobalRef; 34 | using const_reference = GlobalRef>; 35 | 36 | GlobalRef(BCL::GlobalPtr ptr) : ptr_(ptr) { 37 | BCL_DEBUG( 38 | if (ptr_ == nullptr) { 39 | throw debug_error("GlobalRef() constructor created a null reference."); 40 | } 41 | ) 42 | } 43 | 44 | template <__BCL_REQUIRES(!std::is_const_v)> 45 | operator const_reference() const { 46 | return const_reference(ptr_); 47 | } 48 | 49 | operator T() const { 50 | return BCL::rget(ptr_); 51 | } 52 | 53 | reference operator=(const T& value) const { 54 | // TODO: replace static_assert with requires() for C++20 55 | static_assert(!std::is_const_v); 56 | BCL::rput(value, ptr_); 57 | return *this; 58 | } 59 | 60 | pointer operator&() const noexcept { 61 | return ptr_; 62 | } 63 | 64 | private: 65 | BCL::GlobalPtr ptr_ = nullptr; 66 | }; 67 | 68 | template 69 | void swap(BCL::GlobalRef a, BCL::GlobalRef b) { 70 | T first = a; 71 | a = (T) b; 72 | b = first; 73 | } 74 | 75 | } // end BCL 76 | -------------------------------------------------------------------------------- /bcl/core/comm.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | template 10 | inline void rput(const T &src, GlobalPtr dst) { 11 | BCL::write(&src, dst, 1); 12 | } 13 | 14 | template 15 | inline void rput(const T *src, GlobalPtr dst, std::size_t size) { 16 | BCL::write(src, dst, size); 17 | } 18 | 19 | template 20 | inline void rget(GlobalPtr> src, T *dst, std::size_t size) { 21 | BCL::read(src, dst, size); 22 | } 23 | 24 | template 25 | inline void rget_atomic(GlobalPtr> src, T *dst, std::size_t size) { 26 | BCL::atomic_read(src, dst, 
size); 27 | } 28 | 29 | template 30 | inline std::remove_const_t rget_atomic(GlobalPtr src) { 31 | std::remove_const_t rv; 32 | BCL::atomic_read(src, &rv, 1); 33 | return rv; 34 | } 35 | 36 | template 37 | inline std::remove_const_t rget(GlobalPtr src) { 38 | std::remove_const_t rv; 39 | BCL::read(src, &rv, 1); 40 | return rv; 41 | } 42 | 43 | template > 44 | inline future> arget(GlobalPtr> src, size_t size) { 45 | std::vector dst(size); 46 | BCL::request request = async_read(src, dst.data(), size); 47 | return BCL::future>(std::move(dst), std::move(request)); 48 | } 49 | 50 | // TODO: should this also accept an allocator? 51 | template 52 | inline BCL::future arget(GlobalPtr src) { 53 | future fut; 54 | BCL::request request = async_read(src, fut.value_.get(), 1); 55 | fut.update(request); 56 | return std::move(fut); 57 | } 58 | 59 | template 60 | inline BCL::request arget(GlobalPtr> src, T* dst, size_t size) { 61 | return async_read(src, dst, size); 62 | } 63 | 64 | template 65 | inline future> arput(GlobalPtr dst, 66 | std::vector&& src) { 67 | BCL::request request = async_write(src.data(), dst, src.size()); 68 | return BCL::future>(std::move(src), std::move(request)); 69 | } 70 | 71 | template 72 | inline BCL::request arput(GlobalPtr dst, 73 | const T* src, size_t n_elem) { 74 | return async_write(src, dst, n_elem); 75 | } 76 | 77 | inline void memcpy(GlobalPtr dst, const void* src, std::size_t n) { 78 | BCL::write(static_cast(src), 79 | BCL::reinterpret_pointer_cast(dst), 80 | n); 81 | } 82 | 83 | inline void memcpy(void* dst, GlobalPtr src, std::size_t n) { 84 | BCL::read(BCL::reinterpret_pointer_cast(src), 85 | static_cast(dst), 86 | n); 87 | } 88 | 89 | } // end BCL 90 | -------------------------------------------------------------------------------- /bcl/core/detail/detail.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: 
BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | // This REQUIRES macro design is borrowed from Jared Hoberock's Agency. 8 | 9 | #define ___BCL_CONCATENATE_IMPL(x, y) x##y 10 | #define __BCL_CONCATENATE(x, y) ___BCL_CONCATENATE_IMPL(x, y) 11 | #define ___BCL_MAKE_UNIQUE(x) __BCL_CONCATENATE(x, __COUNTER__) 12 | #define __BCL_REQUIRES_IMPL(unique_name, ...) bool unique_name = true, typename std::enable_if<(unique_name and __VA_ARGS__)>::type* = nullptr 13 | #define __BCL_REQUIRES(...) __BCL_REQUIRES_IMPL(___BCL_MAKE_UNIQUE(__deduced_true), __VA_ARGS__) 14 | -------------------------------------------------------------------------------- /bcl/core/detail/hash_functions.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | template 10 | struct djb2_hash { 11 | unsigned long djb2(const unsigned char *str, std::size_t len) const noexcept { 12 | unsigned long hash = 5381; 13 | 14 | for (size_t i = 0; i < len; i++) { 15 | int c = str[i]; 16 | hash = ((hash << 5) + hash) + c; /* hash * 33 + c */ 17 | } 18 | return hash; 19 | } 20 | 21 | std::size_t operator()(const T& k) const noexcept { 22 | return djb2(reinterpret_cast(&k), sizeof(T)); 23 | } 24 | }; 25 | 26 | template 27 | struct nagasaka_hash { 28 | std::size_t operator()(const T& k) const noexcept { 29 | return k*107; 30 | } 31 | }; 32 | 33 | } 34 | -------------------------------------------------------------------------------- /bcl/core/except.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | namespace BCL { 10 | 11 | #ifdef BCL_NODEBUG 12 | #define BCL_DEBUG(stmt) 13 | #else 14 | #define BCL_DEBUG(stmt) stmt 15 | #endif 16 | 17 | class error 18 | { 19 | 
public: 20 | error(const std::string& what_arg) : what_arg_(what_arg) {} 21 | error() {} 22 | 23 | const char* what() const throw() { 24 | return what_arg_.c_str(); 25 | } 26 | 27 | private: 28 | std::string what_arg_; 29 | }; 30 | 31 | /* 32 | XXX: debug_error exceptions are expensive to check 33 | for at runtime, even if they don't occur. They 34 | will only be thrown if the user compiles with 35 | the `DEBUG` flag---thus the expensive checks 36 | will normally be compiled away. 37 | */ 38 | class debug_error final : public error 39 | { 40 | public: 41 | debug_error(const std::string& what_arg) : what_arg_(what_arg) {} 42 | 43 | const char* what() const throw() { 44 | return what_arg_.c_str(); 45 | } 46 | 47 | private: 48 | std::string what_arg_; 49 | }; 50 | 51 | } // end BCL 52 | -------------------------------------------------------------------------------- /bcl/core/future.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace BCL { 12 | 13 | template 14 | class future { 15 | std::vector requests_; 16 | 17 | public: 18 | std::unique_ptr value_; 19 | 20 | future() : value_(new T()) {} 21 | 22 | future(T&& value, const BCL::request& request) 23 | : value_(new T(std::move(value))), requests_({request}) {} 24 | 25 | future(T&& value, const std::vector& requests) 26 | : value_(new T(std::move(value))), requests_(requests) {} 27 | 28 | future(T&& value, std::vector&& requests) 29 | : value_(new T(std::move(value))), requests_(std::move(requests)) {} 30 | 31 | future(const T& value, const BCL::request& request) 32 | : value_(new T(value)), requests_({request}) {} 33 | 34 | void update(const BCL::request& request) { 35 | requests_.push_back(request); 36 | } 37 | 38 | void update(BCL::request&& request) { 39 | requests_.push_back(std::move(request)); 40 
| } 41 | 42 | future(future&&) = default; 43 | future& operator=(future&&) = default; 44 | future(const future&) = delete; 45 | 46 | T get() { 47 | for (auto& request : requests_) { 48 | request.wait(); 49 | } 50 | return std::move(*value_); 51 | } 52 | 53 | void wait() { 54 | for (auto& request : requests_) { 55 | request.wait(); 56 | } 57 | } 58 | 59 | bool check() { 60 | return wait_for(std::chrono::seconds(0)) == std::future_status::ready; 61 | } 62 | 63 | template 64 | std::future_status wait_for(const std::chrono::duration& timeout_duration) { 65 | for (auto& request : requests_) { 66 | if (!request.check()) { 67 | return std::future_status::timeout; 68 | } 69 | } 70 | return std::future_status::ready; 71 | } 72 | }; 73 | 74 | } 75 | -------------------------------------------------------------------------------- /bcl/core/util.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace BCL { 12 | 13 | template 14 | void print(const char* format, Args... 
args) { 15 | fflush(stdout); 16 | BCL::barrier(); 17 | if (BCL::rank() == 0) { 18 | printf(format, args...); 19 | } 20 | fflush(stdout); 21 | BCL::barrier(); 22 | } 23 | 24 | inline std::string hostname() { // inline: defined in a header, so it must not create one external definition per translation unit 25 | constexpr size_t MH = 2048; 26 | char buf[MH+1] = {}; // zero-filled: gethostname() need not NUL-terminate a truncated name, and buf stays empty if the call fails 27 | gethostname(buf, MH); 28 | return std::string(buf); 29 | } 30 | 31 | } // end BCL 32 | -------------------------------------------------------------------------------- /bcl/core/util/Backoff.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | namespace BCL { 8 | 9 | inline size_t double_backoff(size_t sleep) { // inline for the same header-only reason; returns twice the given sleep time 10 | return sleep *= 2; 11 | } 12 | 13 | template 14 | class Backoff { 15 | public: 16 | Backoff(size_t init_sleep = 1, size_t max_sleep = 1, 17 | BackoffFn&& backoff_fn = double_backoff) 18 | : sleep_time_(init_sleep), max_sleep_(max_sleep), 19 | init_sleep_(init_sleep), backoff_fn_(backoff_fn) {} 20 | 21 | void backoff() { 22 | usleep(sleep_time_); 23 | increase_backoff_impl_(); 24 | } 25 | 26 | void increase_backoff_impl_() { 27 | sleep_time_ = backoff_fn_(sleep_time_); 28 | sleep_time_ = std::min(sleep_time_, max_sleep_); 29 | } 30 | 31 | void reset() { 32 | sleep_time_ = init_sleep_; 33 | } 34 | 35 | private: 36 | size_t sleep_time_; 37 | size_t max_sleep_; 38 | size_t init_sleep_; 39 | BackoffFn&& backoff_fn_; 40 | }; 41 | 42 | template 43 | Backoff(size_t init_sleep = 1, size_t max_sleep = 100, BackoffFn&& backoff_fn = double_backoff) -> Backoff; 44 | 45 | } // end BCL 46 | -------------------------------------------------------------------------------- /cmake/FindGASNET_EX.cmake: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | if (TARGET GASNET_EX::GASNET_EX OR GASNET_EX_FOUND) 6 | return() 7 |
endif() 8 | 9 | find_path(GASNET_EX_INCLUDE_DIRS NAMES gasnetex.h) 10 | 11 | #find_library(GASNET_EX_LIBRARIES NAMES gasnetex) 12 | 13 | include(FindPackageHandleStandardArgs) 14 | find_package_handle_standard_args(GASNET_EX 15 | DEFAULT_MSG 16 | GASNET_EX_INCLUDE_DIRS 17 | #GASNET_EX_LIBRARIES 18 | ) 19 | 20 | mark_as_advanced(GASNET_EX_INCLUDE_DIRS GASNET_EX_LIBRARIES) 21 | 22 | add_library(GASNET_EX::GASNET_EX UNKNOWN IMPORTED) 23 | set_target_properties(GASNET_EX::GASNET_EX PROPERTIES 24 | #IMPORTED_LOCATION "${GASNET_EX_LIBRARIES}" 25 | INTERFACE_INCLUDE_DIRECTORIES "${GASNET_EX_INCLUDE_DIRS}" 26 | ) 27 | -------------------------------------------------------------------------------- /cmake/FindSHMEM.cmake: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | if (TARGET SHMEM::SHMEM OR SHMEM_FOUND) 6 | return() 7 | endif() 8 | 9 | find_path(SHMEM_INCLUDE_DIRS NAMES shmem.h PATHS mpp) 10 | 11 | #find_library(SHMEM_LIBRARIES NAMES gasnetex) 12 | 13 | include(FindPackageHandleStandardArgs) 14 | find_package_handle_standard_args(SHMEM 15 | DEFAULT_MSG 16 | SHMEM_INCLUDE_DIRS 17 | #SHMEM_LIBRARIES 18 | ) 19 | 20 | mark_as_advanced(SHMEM_INCLUDE_DIRS SHMEM_LIBRARIES) 21 | 22 | add_library(SHMEM::SHMEM UNKNOWN IMPORTED) 23 | set_target_properties(SHMEM::SHMEM PROPERTIES 24 | #IMPORTED_LOCATION "${SHMEM_LIBRARIES}" 25 | INTERFACE_INCLUDE_DIRECTORIES "${SHMEM_INCLUDE_DIRS}/.."
26 | ) 27 | 28 | -------------------------------------------------------------------------------- /cmake/FindUPCXX.cmake: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | if (TARGET UPCXX::UPCXX OR UPCXX_FOUND) 6 | return() 7 | endif() 8 | 9 | find_path(UPCXX_INCLUDE_DIRS NAMES upcxx.h) 10 | 11 | #find_library(UPCXX_LIBRARIES NAMES gasnetex) 12 | 13 | include(FindPackageHandleStandardArgs) 14 | find_package_handle_standard_args(UPCXX 15 | DEFAULT_MSG 16 | UPCXX_INCLUDE_DIRS 17 | #UPCXX_LIBRARIES 18 | ) 19 | 20 | mark_as_advanced(UPCXX_INCLUDE_DIRS UPCXX_LIBRARIES) 21 | 22 | add_library(UPCXX::UPCXX UNKNOWN IMPORTED) 23 | set_target_properties(UPCXX::UPCXX PROPERTIES 24 | #IMPORTED_LOCATION "${UPCXX_LIBRARIES}" 25 | INTERFACE_INCLUDE_DIRECTORIES "${UPCXX_INCLUDE_DIRS}" 26 | ) 27 | -------------------------------------------------------------------------------- /cmake/bclConfig.cmake: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | cmake_minimum_required(VERSION 3.10) 6 | 7 | # Guard against multiple 'find_package(bcl)' calls 8 | if (TARGET bcl OR bcl_FOUND) 9 | return() 10 | endif() 11 | 12 | set(bcl_LOC ${CMAKE_CURRENT_LIST_DIR}/..) 
13 | list(APPEND CMAKE_MODULE_PATH ${bcl_LOC}/cmake) 14 | 15 | ## Base target ## 16 | 17 | add_library(bcl::core INTERFACE IMPORTED) 18 | target_include_directories(bcl::core INTERFACE ${bcl_LOC}) 19 | 20 | ## MPI ## 21 | 22 | find_package(MPI QUIET) 23 | if (TARGET MPI::MPI_CXX) 24 | add_library(bcl::mpi INTERFACE IMPORTED) 25 | target_link_libraries(bcl::mpi INTERFACE bcl::core MPI::MPI_CXX) 26 | target_compile_definitions(bcl::mpi INTERFACE BCL_MPI) 27 | endif() 28 | 29 | ## SHMEM ## 30 | 31 | find_package(SHMEM QUIET MODULE) 32 | if (TARGET SHMEM::SHMEM) 33 | add_library(bcl::shmem INTERFACE IMPORTED) 34 | target_link_libraries(bcl::shmem INTERFACE bcl::core SHMEM::SHMEM) 35 | target_compile_definitions(bcl::shmem INTERFACE SHMEM) 36 | endif() 37 | 38 | ## GASNET_EX ## 39 | 40 | find_package(GASNET_EX QUIET MODULE) 41 | if (TARGET GASNET_EX::GASNET_EX) 42 | add_library(bcl::gasnet_ex INTERFACE IMPORTED) 43 | target_link_libraries(bcl::gasnet_ex INTERFACE bcl::core GASNET_EX::GASNET_EX) 44 | target_compile_definitions(bcl::gasnet_ex INTERFACE GASNET_EX) 45 | endif() 46 | 47 | set(bcl_FOUND ON) 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 3 | python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html 4 | -------------------------------------------------------------------------------- /docs/architecture/architecture.txt: -------------------------------------------------------------------------------- 1 | # BCL Architecture Design Document 2 | 3 | ## "Minimal Compilation Unit" Design 4 | 5 | BCL is designed to maximize the use of inlining. One way it tries to accomplish 6 | this is by minimizing the number of compilation units. 
All BCL functions are 7 | declared `inline` (1) so that the compiler might more readily choose to inline 8 | them, since it isn't forced to create external linkage, and (2) to avoid 9 | creating external linkage. (2) will allow each compilation unit to define its 10 | own inlined functions. 11 | 12 | BCL is currently implemented only in header files (we will soon move variables 13 | to a single compilation unit). 14 | 15 | ## Header Include Order 16 | 17 | 1) GlobalPtr.hpp, GlobalRef.hpp 18 | * Defines pointers, references. 19 | * References forward declare `rget` and `rput`. 20 | 2) {backend}/backend.hpp 21 | * Uses global pointers, defines `read`, `write`, etc. 22 | See docs/backends/backend.txt for details. 23 | 3) comm.hpp 24 | * Defines `rget`, `rput`, etc. Uses backend functions. 25 | 4) teams.hpp 26 | * If it has not already been included. 27 | * Does not include anything else; forward declares backend `rank`, `nprocs` 28 | 5) util.hpp 29 | * Utilities, may use anything in core, etc. 30 | 31 | ## Global Variables 32 | 33 | Global variables include: 34 | 35 | uint64_t shared_segment_size; 36 | void *smem_base_ptr; 37 | 38 | (owned by BCL Core) 39 | 40 | bool bcl_finalized; 41 | 42 | uint64_t my_rank; 43 | uint64_t my_nprocs; 44 | 45 | (owned by backend -- backends may also allocate other global variables) 46 | -------------------------------------------------------------------------------- /docs/backends/backend.txt: -------------------------------------------------------------------------------- 1 | # BCL Backends 2 | 3 | BCL backends need to implement a set number of communication primitives. These 4 | functions should all be included with the `backend.hpp` header file included by 5 | `bcl.hpp`. For reference, we list the header files that define these functions 6 | in the standard BCL backends.
7 | 8 | ## Communication - backend/comm.hpp 9 | 10 | template 11 | inline void read(const GlobalPtr &src, T *dst, const size_t size); 12 | 13 | template 14 | inline void write(const T *src, const GlobalPtr &dst, const size_t size); 15 | 16 | template 17 | inline BCL::request async_read(const GlobalPtr &src, T *dst, const size_t size); 18 | 19 | template 20 | inline BCL::request async_write(const T *src, const GlobalPtr &dst, const size_t size); 21 | 22 | ## Atomics - backend/atomics.hpp 23 | 24 | template 25 | inline T compare_and_swap(BCL::GlobalPtr ptr, T old_val, T new_val); 26 | 27 | ## Backend initialization - backend/backend.hpp 28 | 29 | inline void init(uint64_t shared_segment_size = 256); 30 | 31 | inline void finalize(); 32 | 33 | inline void barrier(); 34 | 35 | inline void flush(); 36 | 37 | inline bool finalized(); 38 | 39 | 40 | inline size_t rank(); 41 | 42 | inline size_t nprocs(); 43 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | import subprocess, os 18 | 19 | def configureDoxyfile(input_dir, output_dir): 20 | 21 | with open('Doxyfile.in', 'r') as file : 22 | filedata = file.read() 23 | 24 | filedata = filedata.replace('@DOXYGEN_INPUT_DIR@', input_dir) 25 | filedata = filedata.replace('@DOXYGEN_OUTPUT_DIR@', output_dir) 26 | 27 | with open('Doxyfile', 'w') as file: 28 | file.write(filedata) 29 | 30 | breathe_projects = {} 31 | input_dir = '../bcl' 32 | output_dir = 'build' 33 | configureDoxyfile(input_dir, output_dir) 34 | subprocess.call('doxygen', shell=True) 35 | breathe_projects['BCL'] = output_dir + '/xml' 36 | 37 | 38 | # -- Project information ----------------------------------------------------- 39 | 40 | project = 'Berkeley Container Library' 41 | copyright = '2020, Benjamin Brock' 42 | author = 'Benjamin Brock' 43 | 44 | 45 | # -- General configuration --------------------------------------------------- 46 | 47 | # Add any Sphinx extension module names here, as strings. They can be 48 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 49 | # ones. 50 | #... 51 | 52 | extensions = [ "breathe" ] 53 | 54 | #... 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ['_templates'] 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 
69 | # 70 | html_theme = 'sphinx_rtd_theme' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 75 | html_static_path = ['_static'] 76 | 77 | # Breathe Configuration 78 | breathe_default_project = "BCL" 79 | 80 | master_doc = 'index' 81 | 82 | cpp_id_attributes = ["__global__", "__device__", "__host__"] 83 | -------------------------------------------------------------------------------- /docs/containers/CircularQueue.txt: -------------------------------------------------------------------------------- 1 | 2 | Note that FastQueue is a similar data structure that offers less atomicity in 3 | return for faster performance. If you do not require concurrent pushes and 4 | pops, FastQueue is more appropriate. 5 | 6 | The CircularQueue is a globally visible queue data structure. To create a 7 | CircularQueue, one declares one by calling the constructor collectively, 8 | for example: 9 | 10 | size_t rank = 0; 11 | size_t capacity = 1000; 12 | BCL::CircularQueue queue(rank, capacity); 13 | 14 | All nodes are able to push and pop to the queue. 15 | 16 | To push: 17 | 18 | int val = 12; 19 | bool success = queue.push(val); 20 | 21 | std::vector vals = ...; 22 | bool success = queue.push(vals); 23 | 24 | To pop: 25 | 26 | int val; 27 | bool success = queue.pop(val); 28 | 29 | if (success) { 30 | std::cout << "Popped the value " << val << std::endl; 31 | } 32 | 33 | std::vector vals; 34 | bool success = queue.pop(vals); 35 | 36 | if (success) { 37 | std::cout << "Popped a bunch of values." << std::endl; 38 | print_vec(vals); 39 | } 40 | 41 | CircularQueue supports multiple levels of atomicity for both pushes and pops. 42 | By default, push() and pop() will use the highest levels of atomicity. 43 | Oftentimes, applications may require a lower level of atomicity. 
In these 44 | cases, you may prompt the push() or pop() function to select a faster implementation 45 | which will still be correct. 46 | 47 | You do this using an atomicity level object that can be optionally passed to the 48 | push() and pop() functions. 49 | 50 | The available options are: 51 | 52 | // Other ranks may perform pushes concurrent with this operation. 53 | BCL::CircularQueueAL::push 54 | // Other ranks may perform pops concurrent with this operation. 55 | BCL::CircularQueueAL::pop 56 | // No other ranks will perform queue operations concurrent with this operation. 57 | BCL::CircularQueueAL::none 58 | 59 | Here are some use cases. 60 | 61 | // An insert phase 62 | 63 | BCL::barrier(); 64 | for (auto& val : vals) { 65 | // The programmer guarantees that only pushes 66 | // may happen concurrently. 67 | queue.push(val, BCL::CircularQueueAL::push); 68 | } 69 | BCL::barrier(); 70 | 71 | // A pop phase 72 | 73 | while (!queue.empty()) { 74 | bool success; 75 | int val; 76 | // The programmer guarantees that only pops 77 | // may happen concurrently. 78 | success = queue.pop(val, BCL::CircularQueueAL::pop); 79 | if (success) { 80 | .. 81 | } 82 | } 83 | 84 | BCL::barrier(); 85 | -------------------------------------------------------------------------------- /docs/core/atomics.txt: -------------------------------------------------------------------------------- 1 | 2 | template <typename T> 3 | T compare_and_swap(BCL::GlobalPtr<T> ptr, T old_val, T new_val); 4 | 5 | Atomic compare-and-swap operation on the value stored at ptr. 6 | 7 | The user issues a "guess" (`old_val`) at the current value stored in ptr. If 8 | the guess is correct (*ptr == old_val), then the value will be atomically 9 | swapped with new_val. 10 | 11 | T must be a signed or unsigned 32 or 64-bit integer. 12 | 13 | template <typename T> 14 | T fetch_and_op(BCL::GlobalPtr<T> ptr, T val, BCL::atomic_op<T> op); 15 | 16 | Atomic fetch-and-op operation on the value stored at ptr, using operand `val` 17 | and operation `op`.
18 | 19 | This function performs the operation 20 | 21 | *ptr = op(*ptr, val); 22 | 23 | and returns the old value stored at ptr. op must be a BCL::atomic_op. 24 | -------------------------------------------------------------------------------- /docs/core/comm.txt: -------------------------------------------------------------------------------- 1 | # Communication 2 | 3 | This file covers communication operations in BCL, as defined in core/comm.hpp. 4 | 5 | template 6 | inline T rget(const GlobalPtr &src); 7 | 8 | template 9 | inline void rput(const T &src, const GlobalPtr &dst); -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. CatCutifier documentation master file, created by 2 | sphinx-quickstart on Wed Apr 24 15:19:01 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to BCL's documentation! 7 | ======================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | :ref:`genindex` 14 | 15 | Communication 16 | ------------- 17 | 18 | .. doxygenstruct:: BCL::GlobalPtr 19 | :members: 20 | 21 | .. doxygenfunction:: BCL::alloc 22 | 23 | .. doxygenfunction:: BCL::dealloc 24 | 25 | .. doxygenfunction:: BCL::reinterpret_pointer_cast(GlobalPtr) 26 | 27 | Example 28 | ~~~~~~~ 29 | 30 | .. code-block:: C++ 31 | 32 | #include 33 | int main(int argc, char** argv) { 34 | BCL::init(); 35 | BCL::GlobalPtr cptr = BCL::alloc(100); 36 | 37 | BCL::GlobalPtr iptr = BCL::reinterpret_pointer_cast(cptr); 38 | 39 | BCL::finalize(); 40 | return 0; 41 | } 42 | 43 | 44 | Data Structures 45 | --------------- 46 | 47 | .. doxygenclass:: BCL::DMatrix 48 | :members: 49 | 50 | .. 
doxygenclass:: BCL::SPMatrix 51 | :members: 52 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | breathe 2 | sphinx == 4.0.3 3 | sphinx_rtd_theme>=0.5.2 4 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | find_package(MPI REQUIRED) 6 | 7 | function(add_bcl_test TEST_NAME) 8 | add_executable(${TEST_NAME} ${ARGN}) 9 | target_link_libraries(${TEST_NAME} PRIVATE bcl::mpi) 10 | add_test( 11 | NAME 12 | ${TEST_NAME} 13 | COMMAND 14 | ${MPIEXEC_EXECUTABLE} ${CMAKE_BINARY_DIR}/${TEST_NAME} -n 4 15 | ) 16 | endfunction() 17 | 18 | add_subdirectory(hashmap) 19 | add_subdirectory(simple) 20 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
20 | include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak 21 | 22 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DBCL_BACKEND_GASNET_EX -I$(BCLROOT) 23 | CXX = mpic++ 24 | 25 | BCL_RUN=mpirun -n 4 26 | else 27 | BACKEND=MPI 28 | BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI 29 | CXX=mpic++ 30 | 31 | BCL_RUN=mpirun -n 4 32 | endif 33 | 34 | CXXFLAGS = -std=gnu++17 $(BCLFLAGS) 35 | 36 | SOURCES += $(wildcard *.cpp) 37 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 38 | 39 | all: $(TARGETS) 40 | 41 | %: %.cpp 42 | @echo "C $@ $(BACKEND)" 43 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 44 | 45 | test: all 46 | @for target in $(TARGETS) ; do \ 47 | echo "R $$target $(BACKEND)" ;\ 48 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 49 | done 50 | 51 | clean: 52 | @rm -f $(TARGETS) 53 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/all-to-all-async.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | using T = int; 16 | 17 | std::vector> ptrs(BCL::nprocs(), nullptr); 18 | std::vector> counters(BCL::nprocs(), nullptr); 19 | 20 | // Global data size, in bytes. 21 | size_t global_data_size = 256*1024*size_t(1024); 22 | // Number of "inserts" to perform, per processor 23 | size_t num_inserts = 1000; 24 | // Transfer Size `S` of each "insert", in bytes. 
25 | size_t transfer_data_size = 1000; 26 | 27 | size_t global_size = global_data_size / sizeof(T); 28 | size_t local_size = (global_size + BCL::nprocs() - 1) / BCL::nprocs(); 29 | size_t transfer_size = transfer_data_size / sizeof(T); 30 | 31 | assert(transfer_size > 0); 32 | 33 | for (size_t i = 0; i < BCL::nprocs(); i++) { 34 | if (BCL::rank() == i) { 35 | ptrs[i] = BCL::alloc(local_size); 36 | counters[i] = BCL::alloc(1); 37 | } 38 | ptrs[i] = BCL::broadcast(ptrs[i], i); 39 | counters[i] = BCL::broadcast(counters[i], i); 40 | 41 | if (ptrs[i] == nullptr || counters[i] == nullptr) { 42 | throw std::runtime_error("Ran out of memory."); 43 | } 44 | } 45 | 46 | *counters[BCL::rank()].local() = 0; 47 | 48 | std::vector src(transfer_size); 49 | for (auto& val : src) { 50 | val = lrand48(); 51 | } 52 | 53 | srand48(BCL::rank()); 54 | 55 | std::vector requests; 56 | 57 | BCL::barrier(); 58 | auto begin = std::chrono::high_resolution_clock::now(); 59 | 60 | for (size_t i = 0; i < num_inserts; i++) { 61 | // Pick a random processor p' 62 | size_t dest_rank = lrand48() % BCL::nprocs(); 63 | 64 | // Perform a remote atomic `fetch_and_add()` on p's integer variable. 
65 | BCL::fetch_and_op(counters[dest_rank], 1, BCL::plus{}); 66 | 67 | size_t rand_loc = lrand48() % (local_size - transfer_size); 68 | 69 | auto request = BCL::arput(ptrs[dest_rank] + rand_loc, src.data(), transfer_size); 70 | requests.emplace_back(std::move(request)); 71 | } 72 | 73 | for (auto& request : requests) { 74 | request.wait(); 75 | } 76 | 77 | BCL::barrier(); 78 | auto end = std::chrono::high_resolution_clock::now(); 79 | 80 | double duration = std::chrono::duration(end - begin).count(); 81 | 82 | size_t data_transferred = transfer_size*num_inserts*BCL::nprocs(); 83 | double bw = data_transferred / duration; 84 | double bw_gb = bw*1e-9; 85 | 86 | BCL::print("All-to-all benchmark completed in %lfs.\n", duration); 87 | BCL::print("Total bandwidth %lf GB/s\n", bw_gb); 88 | BCL::print("Bandwidth/process %lf GB/s\n", bw_gb/BCL::nprocs()); 89 | 90 | BCL::finalize(); 91 | return 0; 92 | } 93 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/all-to-all.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | using T = int; 16 | 17 | std::vector> ptrs(BCL::nprocs(), nullptr); 18 | std::vector> counters(BCL::nprocs(), nullptr); 19 | 20 | // Global data size, in bytes. 21 | size_t global_data_size = 256*1024*size_t(1024); 22 | // Number of "inserts" to perform, per processor 23 | size_t num_inserts = 1000; 24 | // Transfer Size `S` of each "insert", in bytes. 
25 | size_t transfer_data_size = 1000; 26 | 27 | size_t global_size = global_data_size / sizeof(T); 28 | size_t local_size = (global_size + BCL::nprocs() - 1) / BCL::nprocs(); 29 | size_t transfer_size = transfer_data_size / sizeof(T); 30 | 31 | assert(transfer_size > 0); 32 | 33 | for (size_t i = 0; i < BCL::nprocs(); i++) { 34 | if (BCL::rank() == i) { 35 | ptrs[i] = BCL::alloc(local_size); 36 | counters[i] = BCL::alloc(1); 37 | } 38 | ptrs[i] = BCL::broadcast(ptrs[i], i); 39 | counters[i] = BCL::broadcast(counters[i], i); 40 | 41 | if (ptrs[i] == nullptr || counters[i] == nullptr) { 42 | throw std::runtime_error("Ran out of memory."); 43 | } 44 | } 45 | 46 | *counters[BCL::rank()].local() = 0; 47 | 48 | std::vector src(transfer_size); 49 | for (auto& val : src) { 50 | val = lrand48(); 51 | } 52 | 53 | srand48(BCL::rank()); 54 | 55 | BCL::barrier(); 56 | auto begin = std::chrono::high_resolution_clock::now(); 57 | 58 | for (size_t i = 0; i < num_inserts; i++) { 59 | // Pick a random processor p' 60 | size_t dest_rank = lrand48() % BCL::nprocs(); 61 | 62 | // Perform a remote atomic `fetch_and_add()` on p's integer variable. 
63 | BCL::fetch_and_op(counters[dest_rank], 1, BCL::plus{}); 64 | 65 | size_t rand_loc = lrand48() % (local_size - transfer_size); 66 | 67 | auto request = BCL::arput(ptrs[dest_rank] + rand_loc, src.data(), transfer_size); 68 | request.wait(); 69 | } 70 | 71 | BCL::barrier(); 72 | auto end = std::chrono::high_resolution_clock::now(); 73 | 74 | double duration = std::chrono::duration(end - begin).count(); 75 | 76 | size_t data_transferred = transfer_size*num_inserts*BCL::nprocs(); 77 | double bw = data_transferred / duration; 78 | double bw_gb = bw*1e-9; 79 | 80 | BCL::print("All-to-all benchmark completed in %lfs.\n", duration); 81 | BCL::print("Total bandwidth %lf GB/s\n", bw_gb); 82 | BCL::print("Bandwidth/process %lf GB/s\n", bw_gb/BCL::nprocs()); 83 | 84 | BCL::finalize(); 85 | return 0; 86 | } 87 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/atomics-latency.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | using T = int; 16 | 17 | std::vector> ptrs(BCL::nprocs(), nullptr); 18 | 19 | // Global data size, in bytes. 
20 | size_t global_data_size = 256*1024*size_t(1024); 21 | // Number of ops ("hash table accesses") to perform, per processor 22 | size_t num_ops = 1000; 23 | 24 | size_t global_size = global_data_size / sizeof(T); 25 | size_t local_size = (global_size + BCL::nprocs() - 1) / BCL::nprocs(); 26 | 27 | for (size_t i = 0; i < BCL::nprocs(); i++) { 28 | if (BCL::rank() == i) { 29 | ptrs[i] = BCL::alloc(local_size); 30 | } 31 | ptrs[i] = BCL::broadcast(ptrs[i], i); 32 | 33 | if (ptrs[i] == nullptr) { 34 | throw std::runtime_error("Ran out of memory."); 35 | } 36 | } 37 | 38 | srand48(BCL::rank()); 39 | 40 | BCL::barrier(); 41 | auto begin = std::chrono::high_resolution_clock::now(); 42 | 43 | for (size_t i = 0; i < num_ops; i++) { 44 | // Pick a random processor p' 45 | size_t dest_rank = lrand48() % BCL::nprocs(); 46 | size_t rand_loc = lrand48() % local_size; 47 | 48 | // Perform a global atomic `compare_and_swap()` on an integer variable 49 | BCL::compare_and_swap(ptrs[dest_rank] + rand_loc, 0, 1); 50 | } 51 | 52 | BCL::barrier(); 53 | auto end = std::chrono::high_resolution_clock::now(); 54 | 55 | double duration = std::chrono::duration(end - begin).count(); 56 | 57 | double latency = duration / num_ops; 58 | double latency_us = latency*1e6; 59 | 60 | BCL::print("Atomics latency benchmark completed in %lfs.\n", duration); 61 | BCL::print("Measured latency %lf us\n", latency_us); 62 | 63 | BCL::finalize(); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/experimental/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 
5 | BCLROOT=$(PWD)/../../../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT) 14 | CXX=CC 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 20 | include $(gasnet_prefix)/include/aries-conduit/aries-par.mak 21 | 22 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DBCL_BACKEND_GASNET_EX -I$(BCLROOT) 23 | CXX = CC 24 | 25 | BCL_RUN=mpirun -n 4 26 | else 27 | BACKEND=MPI 28 | BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI 29 | CXX=CC 30 | 31 | BCL_RUN=mpirun -n 4 32 | endif 33 | 34 | FUTAR_FLAGS = -I/global/u1/b/brock/src/research/bcl/examples/benchmarks/mer-bench/experimental/futar 35 | 36 | CXXFLAGS = -O3 -std=gnu++17 $(BCLFLAGS) $(FUTAR_FLAGS) 37 | 38 | SOURCES += $(wildcard *.cpp) 39 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 40 | 41 | all: $(TARGETS) 42 | 43 | %: %.cpp 44 | @echo "C $@ $(BACKEND)" 45 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 46 | 47 | test: all 48 | @for target in $(TARGETS) ; do \ 49 | echo "R $$target $(BACKEND)" ;\ 50 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 51 | done 52 | 53 | clean: 54 | @rm -f $(TARGETS) 55 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/experimental/put_harness.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include "queue_impls.hpp" 8 | 9 | int main(int argc, char** argv) { 10 | BCL::init(1024); 11 | 12 | size_t write_size_bytes; 13 | if (argc >= 2) { 14 | write_size_bytes = std::atoll(argv[1]); 15 | } else { 16 | BCL::print("usage: ./put_harness 
[write size in bytes]\n"); 17 | BCL::finalize(); 18 | return 0; 19 | } 20 | 21 | using T = int; 22 | 23 | std::vector> ptrs(BCL::nprocs()); 24 | std::vector> counters(BCL::nprocs()); 25 | 26 | size_t local_segment_size_bytes = 512*1024*1024; 27 | 28 | size_t local_segment_size = (sizeof(T) + local_segment_size_bytes - 1) / sizeof(T); 29 | size_t inserts_per_proc = 256*1024; 30 | size_t write_size = (sizeof(T) + write_size_bytes - 1) / sizeof(T); 31 | 32 | BCL::print("Creating local segment size of %lu, write size %lu (%lu bytes), number of writes %lu\n", 33 | local_segment_size, write_size, write_size*sizeof(T), inserts_per_proc); 34 | 35 | for (size_t i = 0; i < BCL::nprocs(); i++) { 36 | if (BCL::rank() == i) { 37 | ptrs[i] = BCL::alloc(local_segment_size); 38 | counters[i] = BCL::alloc(1); 39 | *counters[i].local() = 0; 40 | 41 | if (ptrs[i] == nullptr || counters[i] == nullptr) { 42 | throw std::runtime_error("Ran out of memory!"); 43 | } 44 | } 45 | ptrs[i] = BCL::broadcast(ptrs[i], i); 46 | counters[i] = BCL::broadcast(counters[i], i); 47 | } 48 | 49 | std::vector buffer(write_size, BCL::rank()); 50 | 51 | srand48(BCL::rank()); 52 | 53 | BCL::barrier(); 54 | auto begin = std::chrono::high_resolution_clock::now(); 55 | 56 | relaxed_send(ptrs, counters, buffer, 57 | local_segment_size, inserts_per_proc, write_size); 58 | 59 | BCL::barrier(); 60 | auto end = std::chrono::high_resolution_clock::now(); 61 | double duration = std::chrono::duration(end - begin).count(); 62 | 63 | double data_sent = inserts_per_proc*sizeof(T)*write_size*BCL::nprocs(); 64 | double data_sent_gb = 1e-9*data_sent; 65 | 66 | double bw_gb = data_sent_gb / duration; 67 | 68 | BCL::print("%lf GB/s\n", bw_gb); 69 | BCL::print("Runtime %lf s\n", duration); 70 | 71 | BCL::finalize(); 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /examples/benchmarks/mer-bench/experimental/put_upcxx.cpp: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | std::vector queue_; 10 | 11 | int main(int argc, char** argv) { 12 | upcxx::init(); 13 | 14 | size_t write_size_bytes; 15 | if (argc >= 2) { 16 | write_size_bytes = std::atoll(argv[1]); 17 | } else { 18 | if (upcxx::rank_me() == 0) { 19 | printf("usage: ./put_harness [write size in bytes]\n"); 20 | } 21 | upcxx::finalize(); 22 | return 0; 23 | } 24 | 25 | using T = int; 26 | 27 | size_t inserts_per_proc = 256*1024; 28 | size_t write_size = (sizeof(T) + write_size_bytes - 1) / sizeof(T); 29 | 30 | std::vector buffer(write_size, upcxx::rank_me()); 31 | 32 | srand48(upcxx::rank_me()); 33 | 34 | upcxx::barrier(); 35 | auto begin = std::chrono::high_resolution_clock::now(); 36 | 37 | size_t concurrency_level = 100; 38 | std::vector> futures; 39 | 40 | for (size_t i = 0; i < inserts_per_proc; i++) { 41 | size_t rand_proc = lrand48() % upcxx::rank_n(); 42 | auto f = upcxx::rpc(rand_proc, [](std::vector buffer) -> int { 43 | queue_.insert(queue_.end(), 44 | buffer.begin(), 45 | buffer.end()); 46 | return 0; 47 | }, buffer); 48 | 49 | futures.push_back(std::move(f)); 50 | } 51 | 52 | for (auto& f : futures) { 53 | f.wait(); 54 | } 55 | 56 | upcxx::barrier(); 57 | auto end = std::chrono::high_resolution_clock::now(); 58 | double duration = std::chrono::duration(end - begin).count(); 59 | 60 | double data_sent = inserts_per_proc*sizeof(T)*write_size*upcxx::rank_n(); 61 | double data_sent_gb = 1e-9*data_sent; 62 | 63 | double bw_gb = data_sent_gb / duration; 64 | 65 | if (upcxx::rank_me() == 0) { 66 | printf("%lf GB/s\n", bw_gb); 67 | printf("Runtime %lf s\n", duration); 68 | } 69 | 70 | upcxx::finalize(); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- 
/examples/benchmarks/mer-bench/irregular-lookup.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | using T = int; 16 | 17 | std::vector> ptrs(BCL::nprocs(), nullptr); 18 | 19 | // Global data size, in bytes. 20 | size_t global_data_size = 256*1024*size_t(1024); 21 | // Number of "lookups" to perform, per processor 22 | size_t num_lookups = 1000; 23 | // Transfer Size `S` of each "insert", in bytes. 24 | size_t transfer_data_size = 1000; 25 | 26 | size_t global_size = global_data_size / sizeof(T); 27 | size_t local_size = (global_size + BCL::nprocs() - 1) / BCL::nprocs(); 28 | size_t transfer_size = transfer_data_size / sizeof(T); 29 | 30 | assert(transfer_size > 0); 31 | 32 | for (size_t i = 0; i < BCL::nprocs(); i++) { 33 | if (BCL::rank() == i) { 34 | ptrs[i] = BCL::alloc(local_size); 35 | } 36 | ptrs[i] = BCL::broadcast(ptrs[i], i); 37 | 38 | if (ptrs[i] == nullptr) { 39 | throw std::runtime_error("Ran out of memory."); 40 | } 41 | } 42 | 43 | srand48(BCL::rank()); 44 | 45 | BCL::barrier(); 46 | auto begin = std::chrono::high_resolution_clock::now(); 47 | 48 | for (size_t i = 0; i < num_lookups; i++) { 49 | // Pick a random processor p' 50 | size_t dest_rank = lrand48() % BCL::nprocs(); 51 | 52 | size_t rand_loc = lrand48() % (local_size - transfer_size); 53 | 54 | auto fut = BCL::arget(ptrs[dest_rank] + rand_loc, transfer_size); 55 | fut.get(); 56 | } 57 | 58 | BCL::barrier(); 59 | auto end = std::chrono::high_resolution_clock::now(); 60 | 61 | double duration = std::chrono::duration(end - begin).count(); 62 | 63 | size_t data_transferred = transfer_size*num_lookups*BCL::nprocs(); 64 | double bw = data_transferred / duration; 65 | double bw_gb = bw*1e-9; 66 | 67 | 
BCL::print("Irregular Lookup benchmark completed in %lfs.\n", duration); 68 | BCL::print("Total bandwidth %lf GB/s\n", bw_gb); 69 | BCL::print("Bandwidth/process %lf GB/s\n", bw_gb/BCL::nprocs()); 70 | 71 | BCL::finalize(); 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /examples/experimental/Makefile: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | SHELL='bash' 6 | 7 | # XXX: Modify BCLROOT if you move this Makefile 8 | # out of an examples/* directory. 9 | BCLROOT=$(PWD)/../../ 10 | 11 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 12 | 13 | TIMER_CMD=time 14 | 15 | ifeq ($(BACKEND),SHMEM) 16 | BACKEND=SHMEM 17 | BCLFLAGS = -lpthread -DSHMEM -I$(BCLROOT) 18 | CXX=oshc++ 19 | 20 | BCL_RUN=oshrun -n 4 21 | else ifeq ($(BACKEND),GASNET_EX) 22 | BACKEND=GASNET_EX 23 | # XXX: Allow selection of conduit. 
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = -lpthread $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
CXX = mpic++

BCL_RUN=mpirun -n 4
else
BACKEND=MPI
BCLFLAGS = -lpthread -I$(BCLROOT)
CXX=mpic++

BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

# None of these names a file; keep make from being confused by stray files.
.PHONY: all test clean

all: $(TARGETS)

# Best-effort build: a failed compile is reported but does not stop make.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# `{ ...; exit 1; }` runs in the loop's own shell, so the first failing target
# aborts the loop and fails the recipe.  A `( ... )` subshell would only exit
# the subshell and let the loop carry on.
test: all
	@for target in $(TARGETS) ; do \
		echo "R $$target $(BACKEND)" ;\
		time $(BCL_RUN) ./$$target || { echo "$$target $(BACKEND) FAIL $$?"; exit 1; } ;\
	done

clean:
	@rm -f $(TARGETS)
--------------------------------------------------------------------------------
/examples/experimental/array/Makefile:
--------------------------------------------------------------------------------
SHELL='bash'

# XXX: Modify BCLROOT if you move this Makefile
#      out of an examples/* directory.
BCLROOT=$(PWD)/../../../

# Backend is selected by the BCL_BACKEND environment variable (case-insensitive).
BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]')

TIMER_CMD=time

ifeq ($(BACKEND),SHMEM)
BACKEND=SHMEM
BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT)
CXX = CC
# CXX=oshc++

BCL_RUN=oshrun -n 4
else ifeq ($(BACKEND),GASNET_EX)
BACKEND=GASNET_EX
# XXX: Allow selection of conduit.
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DBCL_BACKEND_GASNET_EX -I$(BCLROOT)
CXX = $(GASNET_CXX)

BCL_RUN=mpirun -n 4
else
BACKEND=MPI
BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI
CXX=CC

BCL_RUN=mpirun -n 4
endif

# Pick the MKL threading layer and OpenMP flags from what the `CC` wrapper
# reports about itself: "icc" in the version banner selects the Intel set,
# anything else the GNU set.
COMPILER = $(shell CC --version)
IS_ICC = $(findstring icc, $(COMPILER))

ifneq (,$(IS_ICC))
MKLFLAGS = -qopenmp $(COMBBLAS_FLAGS) -I $(HOME)/src/pkg/mtspgemmlib -I $(HOME)/src/pkg/mtspgemmlib/mtspgemmlib/GTgraph/sprng2.0-lite/include -DMKL_ILP64 -I${MKLROOT}/include -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl -dynamic -ltbb -ltbbmalloc
else
MKLFLAGS = -fopenmp $(COMBBLAS_FLAGS) -I $(HOME)/src/pkg/mtspgemmlib -I $(HOME)/src/pkg/mtspgemmlib/mtspgemmlib/GTgraph/sprng2.0-lite/include -DMKL_ILP64 -m64 -I${MKLROOT}/include -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_gnu_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lgomp -lpthread -lm -ldl -dynamic -ltbb -ltbbmalloc
endif

CXXFLAGS = -O3 -std=gnu++17 $(BCLFLAGS) $(MKLFLAGS)

SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

# None of these names a file; keep make from being confused by stray files.
.PHONY: all test clean

all: $(TARGETS)

# Best-effort build: a failed compile is reported but does not stop make.
%: %.cpp
	echo "C $@ $(BACKEND)"
	time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# `{ ...; exit 1; }` runs in the loop's own shell, so the first failing target
# aborts the loop and fails the recipe.  A `( ... )` subshell would only exit
# the subshell and let the loop carry on.
test: all
	@for target in $(TARGETS) ; do \
		echo "R $$target $(BACKEND)" ;\
		time $(BCL_RUN) ./$$target || { echo "$$target $(BACKEND) FAIL $$?"; exit 1; } ;\
	done

clean:
	@rm -f $(TARGETS)

--------------------------------------------------------------------------------
/examples/experimental/genome-assembly/contig-gen/Makefile:
-------------------------------------------------------------------------------- 1 | # XXX: Modify BCLROOT if you move this Makefile 2 | # out of an examples/* directory. 3 | BCLROOT=$(PWD)/../../../../ 4 | 5 | # XXX: To compile with MPI 6 | #BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI 7 | #CXX=mpic++ 8 | 9 | # XXX: To compile with OpenSHMEM 10 | BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT) 11 | # CXX=oshc++ 12 | CXX=CC 13 | 14 | # XXX: Compiling with GASNet-EX varies 15 | # depending on conduit. See your 16 | # GASNet documentation. 17 | # BCLFLAGS = -DBCL_BACKEND_GASNET_EX -I$(BCLROOT) 18 | # CXX=$(GASNET_CXX) ... 19 | 20 | CXXFLAGS = -std=gnu++17 -O3 $(BCLFLAGS) 21 | 22 | SOURCES += $(wildcard *.cpp) 23 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 24 | 25 | all: $(TARGETS) 26 | 27 | %: %.cpp 28 | $(CXX) -o $@ $^ $(CXXFLAGS) 29 | 30 | clean: 31 | rm -fv $(TARGETS) 32 | -------------------------------------------------------------------------------- /examples/experimental/genome-assembly/contig-gen/hash_funcs.h: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #ifndef HASH_FUNCS_H 8 | #define HASH_FUNCS_H 9 | 10 | #include 11 | 12 | 13 | #ifdef __cplusplus 14 | extern "C" { 15 | #endif 16 | 17 | uint32_t rotl32(uint32_t x, int8_t r) 18 | { 19 | return (x << r) | (x >> (32 - r)); 20 | } 21 | 22 | uint64_t rotl64(uint64_t x, int8_t r) 23 | { 24 | return (x << r) | (x >> (64 - r)); 25 | } 26 | 27 | #define ROTL32(x,y) rotl32(x,y) 28 | #define ROTL64(x,y) rotl64(x,y) 29 | 30 | #define BIG_CONSTANT(x) (x##LLU) 31 | 32 | //----------------------------------------------------------------------------- 33 | // Finalization mix - force all bits of a hash block to avalanche 34 | 35 | uint64_t fmix64(uint64_t k) 36 | { 37 | k ^= k >> 33; 38 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 39 | k ^= k >> 33; 40 | k *= 
BIG_CONSTANT(0xc4ceb9fe1a85ec53); 41 | k ^= k >> 33; 42 | 43 | return k; 44 | } 45 | 46 | //----------------------------------------------------------------------------- 47 | // Block read - if your platform needs to do endian-swapping or can only 48 | // handle aligned reads, do the conversion here 49 | 50 | #define getblock(p, i) (p[i]) 51 | 52 | #ifdef __cplusplus 53 | } 54 | #endif 55 | 56 | 57 | 58 | #endif // HASH_FUNCS_H 59 | -------------------------------------------------------------------------------- /examples/experimental/genome-assembly/contig-gen/hash_funcs.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | extern "C" { 8 | 9 | #include "hash_funcs.h" 10 | 11 | void MurmurHash3_x64_128(const void * key, const uint32_t len, const uint32_t seed, void * out) 12 | { 13 | const uint8_t * data = (const uint8_t*)key; 14 | const uint32_t nblocks = len / 16; 15 | int32_t i; 16 | 17 | uint64_t h1 = seed; 18 | uint64_t h2 = seed; 19 | 20 | uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 21 | uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 22 | 23 | //---------- 24 | // body 25 | 26 | const uint64_t * blocks = (const uint64_t *)(data); 27 | 28 | for(i = 0; i < nblocks; i++) 29 | { 30 | uint64_t k1 = getblock(blocks,i*2+0); 31 | uint64_t k2 = getblock(blocks,i*2+1); 32 | 33 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 34 | 35 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 36 | 37 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 38 | 39 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 40 | } 41 | 42 | //---------- 43 | // tail 44 | 45 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 46 | 47 | uint64_t k1 = 0; 48 | uint64_t k2 = 0; 49 | 50 | switch(len & 15) 51 | { 52 | case 15: k2 ^= (uint64_t)(tail[14]) << 48; 53 | case 14: k2 ^= (uint64_t)(tail[13]) << 40; 54 | 
case 13: k2 ^= (uint64_t)(tail[12]) << 32; 55 | case 12: k2 ^= (uint64_t)(tail[11]) << 24; 56 | case 11: k2 ^= (uint64_t)(tail[10]) << 16; 57 | case 10: k2 ^= (uint64_t)(tail[ 9]) << 8; 58 | case 9: k2 ^= (uint64_t)(tail[ 8]) << 0; 59 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 60 | 61 | case 8: k1 ^= (uint64_t)(tail[ 7]) << 56; 62 | case 7: k1 ^= (uint64_t)(tail[ 6]) << 48; 63 | case 6: k1 ^= (uint64_t)(tail[ 5]) << 40; 64 | case 5: k1 ^= (uint64_t)(tail[ 4]) << 32; 65 | case 4: k1 ^= (uint64_t)(tail[ 3]) << 24; 66 | case 3: k1 ^= (uint64_t)(tail[ 2]) << 16; 67 | case 2: k1 ^= (uint64_t)(tail[ 1]) << 8; 68 | case 1: k1 ^= (uint64_t)(tail[ 0]) << 0; 69 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 70 | }; 71 | 72 | //---------- 73 | // finalization 74 | 75 | h1 ^= len; h2 ^= len; 76 | 77 | h1 += h2; 78 | h2 += h1; 79 | 80 | h1 = fmix64(h1); 81 | h2 = fmix64(h2); 82 | 83 | h1 += h2; 84 | h2 += h1; 85 | 86 | ((uint64_t*)out)[0] = h1; 87 | ((uint64_t*)out)[1] = h2; 88 | } 89 | 90 | uint64_t MurmurHash3_x64_64(const unsigned char* key, uint32_t len) 91 | { 92 | uint64_t temp[2]; 93 | MurmurHash3_x64_128(key, len, 313, temp); 94 | return temp[0]; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /examples/experimental/genome-assembly/contig-gen/packing.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #define KMER_LEN 51 10 | #define PACKED_KMER_LEN ((KMER_LEN+3)/4) 11 | 12 | bool packedCodeToFourMerCoded = false; 13 | unsigned int packedCodeToFourMer[256]; 14 | 15 | #define pow4(a) (1<<((a)<<1)) 16 | 17 | void init_LookupTable() 18 | { 19 | // Work with 4-mers for the moment to have small lookup tables 20 | int merLen = 4, i, slot, valInSlot; 21 | unsigned char mer[4]; 22 | 23 | for ( i = 0; i < 256; i++ ) { 24 | // convert 
a packedcode to a 4-mer 25 | int remainder = i; 26 | int pos = 0; 27 | for( slot = merLen-1; slot >= 0; slot-- ) { 28 | valInSlot = remainder / pow4(slot); 29 | char base; 30 | 31 | if (valInSlot == 0) { base = 'A'; } 32 | else if( valInSlot == 1 ) { base = 'C'; } 33 | else if( valInSlot == 2 ) { base = 'G'; } 34 | else if( valInSlot == 3 ) { base = 'T'; } 35 | else{ assert( 0 ); } 36 | 37 | mer[pos] = base; 38 | pos++; 39 | remainder -= valInSlot * pow4(slot); 40 | } 41 | unsigned int *merAsUInt = (unsigned int*) mer; 42 | packedCodeToFourMer[i] = (unsigned int) (*merAsUInt); 43 | } 44 | } 45 | 46 | unsigned char packFourMer(const char *fourMer) 47 | { 48 | int retval = 0; 49 | int code, i; 50 | int pow = 64; 51 | 52 | for ( i=0; i < 4; i++) { 53 | char base = fourMer[i]; 54 | switch ( base ) { 55 | case 'A': 56 | code = 0; 57 | break; 58 | case 'C': 59 | code = 1; 60 | break; 61 | case 'G': 62 | code = 2; 63 | break; 64 | case 'T': 65 | code = 3; 66 | break; 67 | } 68 | retval += code * pow; 69 | pow /= 4; 70 | } 71 | return ((unsigned char) retval); 72 | } 73 | 74 | void packKmer(const char *kmer, unsigned char *packed_kmer) { 75 | int ind, j = 0; 76 | int i = 0; 77 | 78 | for ( ; j <= KMER_LEN - 4; i++, j += 4) { 79 | packed_kmer[i] = packFourMer(kmer + j); 80 | } 81 | 82 | int remainder = KMER_LEN % 4; 83 | char blockSeq[5] = "AAAA"; 84 | for (ind = 0; ind < remainder; ind++) { 85 | blockSeq[ind] = kmer[j + ind]; 86 | } 87 | 88 | packed_kmer[i] = packFourMer(blockSeq); 89 | } 90 | 91 | 92 | void unpackKmer(const unsigned char packed_kmer[PACKED_KMER_LEN], 93 | char *kmer) { 94 | if (!packedCodeToFourMerCoded) { 95 | packedCodeToFourMerCoded = true; 96 | init_LookupTable(); 97 | } 98 | int i = 0, j = 0; 99 | for( ; i < PACKED_KMER_LEN; i++, j += 4 ) { 100 | unsigned char block[4]; 101 | *(unsigned int *) block = packedCodeToFourMer[packed_kmer[i]]; 102 | for (int i = 0; i < 4; i++) { 103 | (kmer + j)[i] = (char) block[i]; 104 | } 105 | } 106 | } 107 | 
-------------------------------------------------------------------------------- /examples/experimental/genome-assembly/contig-gen/read_kmers.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include "kmer_t.hpp" 16 | 17 | size_t line_count(const std::string &fname) { 18 | FILE *f = fopen(fname.c_str(), "r"); 19 | 20 | if (f == NULL) { 21 | throw std::runtime_error("line_count: could not open " + fname); 22 | } 23 | 24 | size_t n_lines = 0; 25 | size_t n_read; 26 | 27 | const size_t buf_size = 16384; 28 | char buf[buf_size]; 29 | 30 | do { 31 | n_read = fread(buf, sizeof(char), buf_size, f); 32 | for (size_t i = 0; i < n_read; i++) { 33 | if (buf[i] == '\n') { 34 | n_lines++; 35 | } 36 | } 37 | } while (n_read != 0); 38 | fclose(f); 39 | return n_lines; 40 | } 41 | 42 | std::vector read_kmers(const std::string &fname, uint64_t nprocs = 1, uint64_t rank = 0) { 43 | size_t num_lines = line_count(fname); 44 | size_t split = (num_lines + nprocs - 1) / nprocs; 45 | size_t start = split*rank; 46 | size_t size = std::min(split, num_lines - start); 47 | 48 | FILE *f = fopen(fname.c_str(), "r"); 49 | const size_t line_len = KMER_LEN + 4; 50 | fseek(f, line_len*start, SEEK_SET); 51 | 52 | std::shared_ptr buf(new char[line_len*size]); 53 | size_t n_read = fread(buf.get(), sizeof(char), line_len*size, f); 54 | 55 | std::vector kmers; 56 | 57 | for (size_t line_offset = 0; line_offset < line_len*size; line_offset += line_len) { 58 | char *kmer_buf = &buf.get()[line_offset]; 59 | char *fb_ext_buf = kmer_buf + KMER_LEN+1; 60 | kmers.push_back(kmer_pair(std::string(kmer_buf, KMER_LEN), std::string(fb_ext_buf, 2))); 61 | } 62 | fclose(f); 63 | return kmers; 64 | } 65 | 66 | // Extract contig for the full application 
version of contig generation. 67 | std::string extract_contig_fullapp(const std::list &contig) { 68 | std::string contig_buf = ""; 69 | 70 | if (contig.front().backwardExt() != 'X' && contig.front().backwardExt() != 'F') { 71 | contig_buf += contig.front().backwardExt(); 72 | } 73 | contig_buf += contig.front().kmer_str(); 74 | 75 | for (const auto &kmer : contig) { 76 | if (kmer.forwardExt() != 'F' && kmer.forwardExt() != 'X') { 77 | contig_buf += kmer.forwardExt(); 78 | } 79 | } 80 | return canonicalize(contig_buf); 81 | } 82 | 83 | // Extract contig for the CS 267 version of contig generation. 84 | std::string extract_contig_simple(const std::list &contig) { 85 | std::string contig_buf = ""; 86 | 87 | contig_buf += contig.front().kmer_str(); 88 | 89 | for (const auto &kmer : contig) { 90 | if (kmer.forwardExt() != 'F') { 91 | contig_buf += kmer.forwardExt(); 92 | } 93 | } 94 | return contig_buf; 95 | } 96 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/Makefile: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | # Makefile for some simple examples in this directory. 6 | # On bridges, you need to run `source prep.sh` to set 7 | # your environment before invoking this Makefile. 
8 | 9 | BCL_HOME=$(PWD)/../../../ 10 | 11 | SOURCES += $(wildcard *.cu) 12 | TARGETS := $(patsubst %.cu, %, $(SOURCES)) 13 | 14 | CXX=nvcc 15 | 16 | # NVSHMEM_FLAGS=-DNVSHMEM_TARGET -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 17 | NVSHMEM_FLAGS=-DNVSHMEM_TARGET -arch=sm_70 -ccbin g++ -I$(CUDA_HOME)/include -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 18 | 19 | CXXFLAGS = -std=c++14 -O3 -I$(BCL_HOME) --expt-extended-lambda $(NVSHMEM_FLAGS) 20 | 21 | all: $(TARGETS) 22 | 23 | %: %.cu 24 | $(CXX) -o $@ $^ $(CXXFLAGS) 25 | 26 | clean: 27 | rm -fv $(TARGETS) 28 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/bcl_memcpy.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | void print_values(std::vector>& ptrs) { 9 | if (BCL::rank() == 0) { 10 | printf("Process 0 sees:\n"); 11 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 12 | printf("%lu:", rank); 13 | for (size_t i = 0; i < BCL::nprocs(); i++) { 14 | int val; 15 | BCL::cuda::memcpy(&val, ptrs[rank] + i, sizeof(int)); 16 | printf(" %d", val); 17 | } 18 | printf("\n"); 19 | } 20 | } 21 | } 22 | 23 | int main(int argc, char** argv) { 24 | BCL::init(); 25 | 26 
| printf("Hello, world! I am rank %lu/%lu\n", 27 | BCL::rank(), BCL::nprocs()); 28 | 29 | BCL::cuda::init(); 30 | 31 | std::vector> ptrs(BCL::nprocs()); 32 | 33 | for (size_t i = 0; i < BCL::nprocs(); i++) { 34 | if (BCL::rank() == i) { 35 | ptrs[i] = BCL::cuda::alloc(BCL::nprocs()); 36 | } 37 | ptrs[i] = BCL::broadcast(ptrs[i], i); 38 | } 39 | 40 | BCL::cuda::barrier(); 41 | 42 | for (size_t i = 0; i < BCL::nprocs(); i++) { 43 | int val = BCL::rank(); 44 | BCL::cuda::memcpy(ptrs[BCL::rank()] + i, &val, sizeof(int)); 45 | } 46 | 47 | BCL::cuda::barrier(); 48 | 49 | print_values(ptrs); 50 | 51 | BCL::cuda::barrier(); 52 | 53 | for (size_t origin_rank = 0; origin_rank < BCL::nprocs(); origin_rank++) { 54 | BCL::print("Origin rank is %lu\n", origin_rank); 55 | if (BCL::rank() == origin_rank) { 56 | for (size_t i = 0; i < BCL::nprocs(); i++) { 57 | int val = BCL::rank(); 58 | BCL::cuda::memcpy(ptrs[BCL::rank()] + i, &val, sizeof(int)); 59 | } 60 | for (size_t dst_rank = 0; dst_rank < BCL::nprocs(); dst_rank++) { 61 | // XXX: does not currently work 62 | BCL::cuda::memcpy(ptrs[dst_rank], ptrs[origin_rank], sizeof(int)*BCL::nprocs()); 63 | } 64 | } 65 | BCL::cuda::barrier(); 66 | print_values(ptrs); 67 | BCL::cuda::barrier(); 68 | } 69 | 70 | BCL::finalize(); 71 | return 0; 72 | } 73 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/darray_test.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | void bcl_array_test() { 12 | size_t block_size = 4; 13 | BCL::cuda::DArray array(BCL::nprocs()*block_size); 14 | 15 | BCL::cuda::global_launch(array.size(), 16 | [] __device__ (size_t idx, BCL::cuda::DArray& array) { 17 | array[idx] = idx; 18 | }, array); 19 | 20 | BCL::cuda::barrier(); 21 | 22 | 
BCL::cuda::global_launch(array.size(), 23 | [] __device__ (size_t idx, BCL::cuda::DArray& array) { 24 | int result = array[idx]; 25 | printf("%lu: %d\n", idx, result); 26 | }, array); 27 | 28 | BCL::cuda::barrier(); 29 | } 30 | 31 | int main(int argc, char** argv) { 32 | BCL::init(64); 33 | 34 | BCL::cuda::init(); 35 | 36 | bcl_array_test(); 37 | 38 | BCL::cuda::finalize(); 39 | 40 | BCL::finalize(); 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/device_vector_test.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | __global__ void kernel(BCL::cuda::device_vector vec) { 11 | size_t tid = threadIdx.x + blockIdx.x * blockDim.x; 12 | 13 | int value = vec.data()[tid]; 14 | printf("%lu: %d\n", tid, value); 15 | } 16 | 17 | int main(int argc, char** argv) { 18 | cudaSetDevice(0); 19 | 20 | constexpr size_t n = 16; 21 | 22 | std::vector vec(n); 23 | for (size_t i = 0; i < vec.size(); i++) { 24 | vec[i] = i; 25 | } 26 | 27 | BCL::cuda::device_vector v(vec.begin(), vec.end()); 28 | 29 | printf("First Launch (should be in order 0 -> n)\n"); 30 | fflush(stdout); 31 | BCL::cuda::launch(v.size(), 32 | [] __device__ (size_t tid, BCL::cuda::device_vector v) { 33 | int val = v[tid]; 34 | printf("Element %lu == %d\n", tid, val); 35 | }, v); 36 | cudaDeviceSynchronize(); 37 | fflush(stdout); 38 | 39 | printf("Second Launch (modifying values)\n"); 40 | fflush(stdout); 41 | BCL::cuda::launch(v.size(), 42 | [] __device__ (size_t tid, BCL::cuda::device_vector v) { 43 | v[tid] = v.size() - tid; 44 | }, v); 45 | 46 | cudaDeviceSynchronize(); 47 | fflush(stdout); 48 | 49 | printf("Third Launch (should be in order n -> 1)\n"); 50 | fflush(stdout); 51 | 52 | BCL::cuda::launch(v.size(), 53 | [] __device__ 
(size_t tid, BCL::cuda::device_vector v) { 54 | int val = v[tid]; 55 | printf("Element %lu == %d\n", tid, val); 56 | }, v); 57 | 58 | cudaDeviceSynchronize(); 59 | fflush(stdout); 60 | 61 | v.destroy(); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/hashtable/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | 5 | # Makefile for some simple examples in this directory. 6 | # On bridges, you need to run `source prep.sh` to set 7 | # your environment before invoking this Makefile. 8 | 9 | BCL_HOME=$(PWD)/../../../../ 10 | 11 | SOURCES += $(wildcard *.cu) 12 | TARGETS := $(patsubst %.cu, %, $(SOURCES)) 13 | 14 | CXX=nvcc 15 | 16 | # NVSHMEM_FLAGS=-DNVSHMEM_TARGET -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 17 | NVSHMEM_FLAGS=-DNVSHMEM_TARGET -arch=sm_70 -ccbin g++ -I$(CUDA_HOME)/include -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 18 | 19 | CXXFLAGS = -std=c++14 -O3 -I$(BCL_HOME) --expt-extended-lambda $(NVSHMEM_FLAGS) 20 | 21 | all: $(TARGETS) 22 | 23 | %: %.cu 24 | $(CXX) -o $@ $^ $(CXXFLAGS) 25 | 26 | clean: 27 | rm -fv $(TARGETS) 28 |
-------------------------------------------------------------------------------- /examples/experimental/nvshmem/hashtable/hashtable.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | int main(int argc, char** argv) { 13 | BCL::init(64); 14 | BCL::cuda::init(1024); 15 | BCL::cuda::DHashMap map(100); 16 | 17 | if (BCL::rank() == 0) { 18 | BCL::cuda::launch(1, [] __device__ (size_t tid, BCL::cuda::DHashMap& map) { 19 | for (size_t i = 0; i < 10; i++) { 20 | map.insert(i, i); 21 | } 22 | }, map); 23 | cudaDeviceSynchronize(); 24 | } 25 | BCL::cuda::barrier(); 26 | 27 | if (BCL::rank() == 1) { 28 | BCL::cuda::launch(1, [] __device__ (size_t tid, BCL::cuda::DHashMap& map) { 29 | for (size_t i = 0; i < 10; i++) { 30 | int value = map.find(i); 31 | printf("{%lu, %d}\n", i, value); 32 | } 33 | }, map); 34 | cudaDeviceSynchronize(); 35 | } 36 | 37 | BCL::cuda::finalize(); 38 | BCL::finalize(); 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/matrix/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | 5 | # Makefile for some simple examples in this directory. 6 | # On bridges, you need to run `source prep.sh` to set 7 | # your environment before invoking this Makefile.
8 | 9 | BCL_HOME=$(PWD)/../../../../ 10 | 11 | SOURCES += $(wildcard *.cu) 12 | TARGETS := $(patsubst %.cu, %, $(SOURCES)) 13 | 14 | CXX=nvcc 15 | 16 | # NVSHMEM_FLAGS=-DNVSHMEM_TARGET -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 17 | GRAPHBLAST_DIR = /ccs/home/b2v/src/research/graphblast 18 | GRAPHBLAST_FLAGS = -I$(GRAPHBLAST_DIR) -I$(GRAPHBLAST_DIR)/ext/moderngpu/include/ -I$(GRAPHBLAST_DIR)/ext/cub/cub/ $(GRAPHBLAST_DIR)/ext/moderngpu/src/mgpucontext.cu $(GRAPHBLAST_DIR)/ext/moderngpu/src/mgpuutil.cpp -lboost_program_options -lcublas -lcusparse -lcurand 19 | 20 | MATRIX_SUM_FLAGS = -I/autofs/nccs-svm1_home2/b2v/src/research/matrix_sum 21 | NSPARSE_FLAGS = -I/ccs/home/b2v/src/research/nsparse/cuda-cpp/inc -I$(CUDAPATH)/samples/common/inc 22 | 23 | CUSPARSE_FLAGS = -lcusparse -lcublas 24 | 25 | CUSP_FLAGS = -I/autofs/nccs-svm1_home2/b2v/pkg/cusplibrary 26 | 27 | NVSHMEM_FLAGS=-DNVSHMEM_TARGET -arch=sm_70 -ccbin g++ -I$(CUDA_HOME)/include -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi_ibm -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil -lcublas_static -lculibos -lcublasLt_static -I$(OLCF_ESSL_ROOT)/include -L$(OLCF_ESSL_ROOT)/lib64 -lessl 28 | 29 | SHMEM_FLAGS=-lpthread -L/autofs/nccs-svm1_sw/summit/.swci/1-compute/opt/spack/20180914/linux-rhel7-ppc64le/gcc-8.1.1/spectrum-mpi-10.3.1.2-20200121-chae23sgwacfeot7vxkpfboz6wao2c33/lib -loshmem -lmpi_ibm 30 | 31 | CXXFLAGS = -std=c++14 -O3 -DSHMEM 
$(SHMEM_FLAGS) -I$(BCL_HOME) --expt-extended-lambda $(NVSHMEM_FLAGS) $(CUSPARSE_FLAGS) $(CUSP_FLAGS) 32 | 33 | all: $(TARGETS) 34 | 35 | %: %.cu 36 | $(CXX) -o $@ $^ $(CXXFLAGS) 37 | 38 | clean: 39 | rm -fv $(TARGETS) 40 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/matrix/analyze_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char** argv) { 4 | size_t num_args = 1; 5 | 6 | if (argc != num_args+1) { 7 | fprintf(stderr, "usage: ./analyze_matrix [input file]\n"); 8 | fprintf(stderr, "Analyze matrix.\n"); 9 | fprintf(stderr, "Automatically infers type from extensions (.mtx or .binary).\n"); 10 | return 1; 11 | } 12 | 13 | std::string input_fname(argv[1]); 14 | 15 | auto format = BCL::matrix_io::detect_file_type(input_fname); 16 | assert(format == BCL::FileFormat::Binary); 17 | 18 | using index_type = int; 19 | 20 | fprintf(stderr, "Reading in \"%s\"\n", input_fname.c_str()); 21 | BCL::CSRMatrix matrix(input_fname); 22 | 23 | fprintf(stderr, "Dimensions: %lu x %lu, NNZ: %lu\n", matrix.shape()[0], matrix.shape()[1], matrix.nnz()); 24 | 25 | std::vector nnzs(10*10, 0); 26 | 27 | size_t m_s = (matrix.shape()[0] + 10 - 1) / 10; 28 | size_t n_s = (matrix.shape()[1] + 10 - 1) / 10; 29 | 30 | for (size_t i = 0; i < matrix.m(); i++) { 31 | for (index_type j_ptr = matrix.rowptr_data()[i]; j_ptr < matrix.rowptr_data()[i+1]; j_ptr++) { 32 | index_type j = matrix.colind_data()[j_ptr]; 33 | 34 | size_t tile_i = i / m_s; 35 | size_t tile_j = j / n_s; 36 | 37 | nnzs[tile_i*10 + tile_j]++; 38 | } 39 | } 40 | 41 | for (size_t i = 0; i < 10; i++) { 42 | for (size_t j = 0; j < 10; j++) { 43 | // fprintf(stderr, "(%lu, %lu): %lu nnz\n", i, j, nnzs[i*10 + j]); 44 | } 45 | } 46 | 47 | size_t total = 0; 48 | size_t max = 0; 49 | for (const auto& nnz : nnzs) { 50 | total += nnz; 51 | max = std::max(max, nnz); 52 | } 53 | 54 | double avg = 
total / (10.0*10.0); 55 | double load_imbalance = max / avg; 56 | 57 | fprintf(stderr, "Avg NNZs %lf, max %lu, load imbalance %lf\n", 58 | avg, max, load_imbalance); 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/matrix/nsparse_util.hpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #pragma once 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace BCL { 12 | 13 | namespace cuda { 14 | 15 | template 16 | CSR convert_to_nsparse(CudaCSRMatrix& mat) { 17 | CSR mat_nsparse(mat.m(), mat.n(), mat.nnz(), false); 18 | mat_nsparse.d_rpt = mat.rowptr_data(); 19 | mat_nsparse.d_colids = mat.colind_data(); 20 | mat_nsparse.d_values = mat.values_data(); 21 | return mat_nsparse; 22 | } 23 | 24 | template 25 | CudaCSRMatrix 26 | convert_to_csr(CSR mat) { 27 | return CudaCSRMatrix({mat.nrow, mat.ncolumn}, mat.nnz, 28 | mat.d_values, mat.d_rpt, mat.d_colids); 29 | } 30 | 31 | template 32 | CudaCSRMatrix 33 | spgemm_nsparse(CudaCSRMatrix& a, 34 | CudaCSRMatrix& b) { 35 | auto a_nsparse = convert_to_nsparse(a); 36 | auto b_nsparse = convert_to_nsparse(b); 37 | CSR c_nsparse; 38 | 39 | SpGEMM_Hash(a_nsparse, b_nsparse, c_nsparse); 40 | 41 | return convert_to_csr(c_nsparse); 42 | } 43 | 44 | } // end cuda 45 | } // end BCL -------------------------------------------------------------------------------- /examples/experimental/nvshmem/merbench/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | 5 | # Makefile for some simple examples in this directory. 6 | # On bridges, you need to run `source prep.sh` to set 7 | # your environment before invoking this Makefile.
8 | 9 | BCL_HOME=$(PWD)/../../../../ 10 | 11 | SOURCES += $(wildcard *.cu) 12 | TARGETS := $(patsubst %.cu, %, $(SOURCES)) 13 | 14 | CXX=nvcc 15 | 16 | # NVSHMEM_FLAGS=-DNVSHMEM_TARGET -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 17 | NVSHMEM_FLAGS=-DNVSHMEM_TARGET -arch=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 18 | 19 | CXXFLAGS = -std=c++14 -O3 -I$(BCL_HOME) --expt-extended-lambda $(NVSHMEM_FLAGS) 20 | 21 | all: $(TARGETS) 22 | 23 | %: %.cu 24 | $(CXX) -o $@ $^ $(CXXFLAGS) 25 | 26 | clean: 27 | rm -fv $(TARGETS) 28 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/queue/Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | # 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | 5 | # Makefile for some simple examples in this directory. 6 | # On bridges, you need to run `source prep.sh` to set 7 | # your environment before invoking this Makefile.
8 | 9 | BCL_HOME=$(PWD)/../../../../ 10 | 11 | SOURCES += $(wildcard *.cu) 12 | TARGETS := $(patsubst %.cu, %, $(SOURCES)) 13 | 14 | CXX=nvcc 15 | 16 | # NVSHMEM_FLAGS=-DNVSHMEM_TARGET -gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_37,code=sm_37 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_70,code=sm_70 -ccbin g++ -I$(CUDA_HOME)/include/nvprefix -I$(NVSHMEM_HOME)/include/nvprefix -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib/nvprefix -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 17 | NVSHMEM_FLAGS=-DNVSHMEM_TARGET -arch=sm_70 -ccbin g++ -I$(CUDA_HOME)/include -I$(NVSHMEM_HOME)/include -I$(MPI_HOME)/include -DENABLE_MPI_SUPPORT -rdc=true -L$(NVSHMEM_HOME)/lib -lnvshmem -lcuda -L$(CUDA_HOME)/lib64 -lcudart -L$(MPI_HOME)/lib -lmpi_ibm -lopen-rte -lopen-pal -lm -lnuma -ldl -lrt -lutil 18 | 19 | CXXFLAGS = -std=c++14 -O3 -I$(BCL_HOME) --expt-extended-lambda $(NVSHMEM_FLAGS) 20 | 21 | all: $(TARGETS) 22 | 23 | %: %.cu 24 | $(CXX) -o $@ $^ $(CXXFLAGS) 25 | 26 | clean: 27 | rm -fv $(TARGETS) 28 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/queue/duplqueue.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #define NUM_INSERTS 2*1024 13 | 14 | int main(int argc, char** argv) { 15 | BCL::init(16); 16 | 17 | printf("Hello, world! 
I am rank %lu/%lu\n", 18 | BCL::rank(), BCL::nprocs()); 19 | 20 | BCL::cuda::init(16*1024); 21 | 22 | size_t num_inserts = NUM_INSERTS; 23 | size_t insert_size = 64; 24 | 25 | BCL::cuda::DuplQueue queue(0, num_inserts*insert_size); 26 | 27 | BCL::cuda::device_vector values(insert_size); 28 | std::vector values_local(insert_size, BCL::rank()); 29 | values.assign(values_local.begin(), values_local.end()); 30 | 31 | BCL::cuda::barrier(); 32 | auto begin = std::chrono::high_resolution_clock::now(); 33 | 34 | BCL::cuda::global_launch(num_inserts, 35 | [] __device__ (size_t idx, BCL::cuda::DuplQueue& queue, 36 | BCL::cuda::device_vector& values) { 37 | bool success = queue.push(values.data(), values.size()); 38 | if (!success) { 39 | printf("AGH! I have failed!\n"); 40 | } 41 | }, queue, values); 42 | 43 | cudaDeviceSynchronize(); 44 | 45 | fflush(stdout); 46 | fflush(stderr); 47 | BCL::barrier(); 48 | fflush(stdout); 49 | fflush(stderr); 50 | BCL::barrier(); 51 | 52 | BCL::print("Here...\n"); 53 | 54 | BCL::cuda::barrier(); 55 | BCL::print("After barrier...\n"); 56 | auto end = std::chrono::high_resolution_clock::now(); 57 | 58 | BCL::finalize(); 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/queue/duplqueue_warp.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #define NUM_INSERTS 2*8*1024 13 | 14 | int main(int argc, char** argv) { 15 | BCL::init(16); 16 | 17 | printf("Hello, world! I am rank %lu/%lu\n", 18 | BCL::rank(), BCL::nprocs()); 19 | 20 | BCL::cuda::init(14*1024); 21 | 22 | size_t num_inserts = NUM_INSERTS; 23 | size_t insert_size = 8*1024; 24 | 25 | // Round up so each rank has an equal number of inserts. 
26 | size_t inserts_per_rank = (num_inserts + BCL::nprocs() - 1) / BCL::nprocs(); 27 | inserts_per_rank *= BCL::nprocs(); 28 | num_inserts = inserts_per_rank * BCL::nprocs(); 29 | 30 | BCL::cuda::DuplQueue queue(0, num_inserts*insert_size); 31 | 32 | BCL::cuda::device_vector> values(insert_size); 33 | // BCL::cuda::device_vector values(insert_size); 34 | std::vector values_local(insert_size, BCL::rank()); 35 | values.assign(values_local.begin(), values_local.end()); 36 | 37 | BCL::cuda::barrier(); 38 | auto begin = std::chrono::high_resolution_clock::now(); 39 | 40 | BCL::cuda::launch(inserts_per_rank*32, 41 | [] __device__ (size_t idx, BCL::cuda::DuplQueue& queue, 42 | BCL::cuda::device_vector>& values) { 43 | // BCL::cuda::device_vector& values) { 44 | bool success = queue.push_warp(values.data(), values.size()); 45 | if (!success) { 46 | printf("AGH! I have failed!\n"); 47 | } 48 | }, queue, values); 49 | 50 | cudaDeviceSynchronize(); 51 | BCL::cuda::barrier(); 52 | auto end = std::chrono::high_resolution_clock::now(); 53 | 54 | double duration = std::chrono::duration(end - begin).count(); 55 | 56 | double data_moved = num_inserts*insert_size*sizeof(int); 57 | double data_moved_gb = data_moved*1e-9; 58 | 59 | double bw = data_moved / duration; 60 | double bw_gb = bw*1e-9; 61 | 62 | BCL::print("Total %lf s (%lf GB) (%lf GB/s)\n", duration, data_moved_gb, bw_gb); 63 | 64 | if (BCL::rank() == 0) { 65 | BCL::cuda::launch(num_inserts, 66 | [] __device__ (size_t idx, BCL::cuda::DuplQueue& queue) { 67 | int value = 12; 68 | bool success = queue.local_pop(value); 69 | // printf("%lu: %d (%s)\n", idx, value, (success) ? 
"success" : "failure"); 70 | }, queue); 71 | cudaDeviceSynchronize(); 72 | } 73 | BCL::cuda::barrier(); 74 | 75 | BCL::print("Here...\n"); 76 | 77 | BCL::cuda::barrier(); 78 | BCL::print("After barrier...\n"); 79 | 80 | BCL::finalize(); 81 | return 0; 82 | } 83 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/queue/fastqueue-warp.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #define NUM_INSERTS 256*1024*1024 13 | 14 | int main(int argc, char** argv) { 15 | BCL::init(16); 16 | 17 | printf("Hello, world! I am rank %lu/%lu\n", 18 | BCL::rank(), BCL::nprocs()); 19 | 20 | BCL::cuda::init(16*1024); 21 | 22 | size_t num_inserts = NUM_INSERTS; 23 | 24 | BCL::cuda::FastQueue queue(0, num_inserts); 25 | 26 | BCL::cuda::barrier(); 27 | auto begin = std::chrono::high_resolution_clock::now(); 28 | 29 | BCL::cuda::global_launch(num_inserts, 30 | [] __device__ (size_t idx, BCL::cuda::FastQueue& queue) { 31 | size_t warp_id = idx % 32; 32 | int values[32]; 33 | bool success = queue.push_warp(values, 32); 34 | if (!success) { 35 | printf("AGH! 
I have failed!\n"); 36 | } 37 | }, queue); 38 | 39 | cudaDeviceSynchronize(); 40 | 41 | BCL::cuda::barrier(); 42 | auto end = std::chrono::high_resolution_clock::now(); 43 | 44 | double duration = std::chrono::duration(end - begin).count(); 45 | 46 | BCL::print("Finished in %lf s (%lf megapushes/s)\n", duration, (num_inserts / duration) / (1024*1024)); 47 | 48 | BCL::print("Queue has %lu items (out of %lu)\n", queue.size(), num_inserts); 49 | 50 | BCL::cuda::global_launch(num_inserts, 51 | [] __device__ (size_t idx, BCL::cuda::FastQueue& queue) { 52 | int value; 53 | bool success = queue.pop(value); 54 | if (!success || value < 0 || value > NUM_INSERTS) { 55 | printf("AGH! I have failed (popping)!\n"); 56 | } 57 | }, queue); 58 | cudaDeviceSynchronize(); 59 | BCL::cuda::barrier(); 60 | 61 | BCL::print("Queue has %lu items (out of %lu)\n", queue.size(), num_inserts); 62 | 63 | BCL::finalize(); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /examples/experimental/nvshmem/queue/fastqueue.cu: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #define NUM_INSERTS 256*1024*1024 13 | 14 | int main(int argc, char** argv) { 15 | BCL::init(16); 16 | 17 | printf("Hello, world! I am rank %lu/%lu\n", 18 | BCL::rank(), BCL::nprocs()); 19 | 20 | BCL::cuda::init(16*1024); 21 | 22 | size_t num_inserts = NUM_INSERTS; 23 | 24 | BCL::cuda::FastQueue queue(0, num_inserts); 25 | 26 | BCL::cuda::barrier(); 27 | auto begin = std::chrono::high_resolution_clock::now(); 28 | 29 | BCL::cuda::global_launch(num_inserts, 30 | [] __device__ (size_t idx, BCL::cuda::FastQueue& queue) { 31 | bool success = queue.push(idx); 32 | if (!success) { 33 | printf("AGH! 
I have failed!\n"); 34 | } 35 | }, queue); 36 | 37 | cudaDeviceSynchronize(); 38 | 39 | BCL::cuda::barrier(); 40 | auto end = std::chrono::high_resolution_clock::now(); 41 | 42 | double duration = std::chrono::duration(end - begin).count(); 43 | 44 | BCL::print("Finished in %lf s (%lf megapushes/s)\n", duration, (num_inserts / duration) / (1024*1024)); 45 | 46 | BCL::print("Queue has %lu items (out of %lu)\n", queue.size(), num_inserts); 47 | 48 | BCL::cuda::global_launch(num_inserts, 49 | [] __device__ (size_t idx, BCL::cuda::FastQueue& queue) { 50 | int value; 51 | bool success = queue.pop(value); 52 | if (!success || value < 0 || value > NUM_INSERTS) { 53 | printf("AGH! I have failed (popping)!\n"); 54 | } 55 | }, queue); 56 | cudaDeviceSynchronize(); 57 | BCL::cuda::barrier(); 58 | 59 | BCL::print("Queue has %lu items (out of %lu)\n", queue.size(), num_inserts); 60 | 61 | BCL::finalize(); 62 | return 0; 63 | } 64 | -------------------------------------------------------------------------------- /examples/fastqueue/Makefile: -------------------------------------------------------------------------------- 1 | 2 | SHELL='bash' 3 | 4 | # XXX: Modify BCLROOT if you move this Makefile 5 | # out of an examples/* directory. 6 | BCLROOT=$(PWD)/../../ 7 | 8 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 9 | 10 | TIMER_CMD=time 11 | 12 | ifeq ($(BACKEND),SHMEM) 13 | BACKEND=SHMEM 14 | BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT) 15 | CXX=oshc++ 16 | 17 | BCL_RUN=oshrun -n 4 18 | else ifeq ($(BACKEND),GASNET_EX) 19 | BACKEND=GASNET_EX 20 | # XXX: Allow selection of conduit. 
21 | include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak 22 | 23 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DBCL_BACKEND_GASNET_EX -I$(BCLROOT) 24 | CXX = mpic++ 25 | 26 | BCL_RUN=mpirun -n 4 27 | else 28 | BACKEND=MPI 29 | BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI 30 | CXX=mpic++ 31 | 32 | BCL_RUN=mpirun -n 4 33 | endif 34 | 35 | CXXFLAGS = -std=gnu++17 $(BCLFLAGS) 36 | 37 | SOURCES += $(wildcard *.cpp) 38 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 39 | 40 | all: $(TARGETS) 41 | 42 | %: %.cpp 43 | @echo "C $@ $(BACKEND)" 44 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 45 | 46 | test: all 47 | @for target in $(TARGETS) ; do \ 48 | echo "R $$target $(BACKEND)" ;\ 49 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 50 | done 51 | 52 | clean: 53 | @rm -f $(TARGETS) 54 | -------------------------------------------------------------------------------- /examples/fastqueue/bucket_count.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | template 8 | std::vector bucket_count(const Range& data, size_t bucket_width, size_t bucket_start) { 9 | using T = typename Range::value_type; 10 | std::vector counts(bucket_width, 0); 11 | for (auto it = data.begin().local(); it != data.end().local(); it++) { 12 | const auto val = *it; 13 | // assert(val >= bucket_start); 14 | // assert(val < bucket_start+bucket_width); 15 | counts[val - bucket_start] += 1; 16 | } 17 | return counts; 18 | } 19 | 20 | template 21 | size_t total_count(std::vector& vals) { 22 | size_t count = 0; 23 | for (const auto& val : vals) { 24 | count += val; 25 | } 26 | 27 | size_t total_count = BCL::allreduce(count, std::plus{}); 28 | 29 | return total_count; 30 | } 31 | -------------------------------------------------------------------------------- /examples/fastqueue/single_insert.cpp: 
-------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | int main(int argc, char** argv) { 11 | BCL::init(); 12 | 13 | size_t n_to_insert = 100; 14 | 15 | size_t queue_size = n_to_insert*2; 16 | 17 | std::vector> queues; 18 | 19 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 20 | queues.push_back(BCL::FastQueue(rank, queue_size)); 21 | } 22 | 23 | srand48(BCL::rank()); 24 | 25 | for (size_t i = 0; i < n_to_insert; i++) { 26 | size_t dst_rank = lrand48() % BCL::nprocs(); 27 | queues[dst_rank].push(lrand48()); 28 | } 29 | 30 | BCL::barrier(); 31 | 32 | // Sort local queue in place 33 | std::sort(queues[BCL::rank()].begin().local(), queues[BCL::rank()].end().local()); 34 | 35 | // Pop out of queue 36 | size_t count = 0; 37 | while (!queues[BCL::rank()].empty()) { 38 | int value; 39 | bool success = queues[BCL::rank()].pop(value); 40 | 41 | if (success) { 42 | count++; 43 | } 44 | } 45 | 46 | size_t total_count = BCL::allreduce(count, std::plus{}); 47 | 48 | BCL::print("Popped %lu values total.\n", total_count); 49 | 50 | BCL::finalize(); 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /examples/fastqueue/vector_insert.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | int main(int argc, char** argv) { 11 | BCL::init(); 12 | 13 | size_t n_to_insert = 100; 14 | size_t n_each_insert = 10; 15 | 16 | size_t queue_size = n_to_insert*n_each_insert*2; 17 | 18 | std::vector> queues; 19 | 20 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 21 | queues.push_back(BCL::FastQueue(rank, queue_size)); 22 | } 23 | 24 | srand48(BCL::rank()); 
25 | 26 | for (size_t i = 0; i < n_to_insert; i++) { 27 | size_t dst_rank = lrand48() % BCL::nprocs(); 28 | std::vector values(n_each_insert, BCL::rank()); 29 | queues[dst_rank].push(values); 30 | } 31 | 32 | BCL::barrier(); 33 | 34 | // Sort local queue in place 35 | std::sort(queues[BCL::rank()].begin().local(), queues[BCL::rank()].end().local()); 36 | 37 | // Pop out of queue 38 | size_t count = 0; 39 | while (!queues[BCL::rank()].empty()) { 40 | int value; 41 | bool success = queues[BCL::rank()].pop(value); 42 | 43 | if (success) { 44 | count++; 45 | } 46 | } 47 | 48 | size_t total_count = BCL::allreduce(count, std::plus{}); 49 | 50 | BCL::print("Popped %lu values total.\n", total_count); 51 | 52 | BCL::finalize(); 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /examples/hashmap/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | add_bcl_test(buffered_inserts buffered_inserts.cpp) 6 | add_bcl_test(insert_find insert_find.cpp) 7 | 8 | -------------------------------------------------------------------------------- /examples/hashmap/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DSHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
20 | include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak 21 | 22 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT) 23 | CXX = mpic++ 24 | 25 | BCL_RUN=mpirun -n 4 26 | else 27 | BACKEND=MPI 28 | BCLFLAGS = -I$(BCLROOT) 29 | CXX=mpic++ 30 | 31 | BCL_RUN=mpirun -n 4 32 | endif 33 | 34 | CXXFLAGS = -std=gnu++17 $(BCLFLAGS) 35 | 36 | SOURCES += $(wildcard *.cpp) 37 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 38 | 39 | all: $(TARGETS) 40 | 41 | %: %.cpp 42 | @echo "C $@ $(BACKEND)" 43 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 44 | 45 | test: all 46 | @for target in $(TARGETS) ; do \ 47 | echo "R $$target $(BACKEND)" ;\ 48 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 49 | done 50 | 51 | clean: 52 | @rm -f $(TARGETS) 53 | -------------------------------------------------------------------------------- /examples/hashmap/buffered_inserts.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | // Every rank inserts n_to_insert (key, value) pairs through a HashMapBuffer and flushes it; rank 0 then looks up and prints every key. 12 | int main(int argc, char** argv) { 13 | BCL::init(); 14 | 15 | size_t n_to_insert = 10; 16 | 17 | double load_factor = 0.5; 18 | BCL::HashMap map(n_to_insert*BCL::nprocs() / load_factor); 19 | 20 | BCL::HashMapBuffer buffer(map, 21 | n_to_insert*2, 22 | std::max(n_to_insert/BCL::nprocs(), 1)); 23 | 24 | for (size_t i = 0; i < n_to_insert; i++) { 25 | bool success = buffer.insert(std::to_string(n_to_insert*BCL::rank() + i), i); 26 | assert(success); 27 | } 28 | 29 | buffer.flush(); 30 | 31 | if (BCL::rank() == 0) { 32 | for (size_t i = 0; i < BCL::nprocs(); i++) { 33 | // Keys written by rank i are n_to_insert*i + j for j in [0, n_to_insert). 34 | for (size_t j = 0; j < n_to_insert; j++) { 35 | auto iter = map.find(std::to_string(n_to_insert*i + j)); 36 | int value = *iter; 37 | printf("%lu: %d\n",
n_to_insert*i + j, value); // label with the key that was actually looked up 38 | } 39 | } 40 | } 41 | 42 | BCL::finalize(); 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /examples/hashmap/insert_find.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int main(int argc, char** argv) { 12 | BCL::init(); 13 | 14 | BCL::HashMap map(1000); 15 | 16 | map[std::to_string(BCL::rank())] = BCL::rank(); 17 | 18 | BCL::barrier(); 19 | 20 | if (BCL::rank() == 0) { 21 | for (size_t i = 0; i < BCL::nprocs(); i++) { 22 | int value = map[std::to_string(i)]; 23 | 24 | printf("%lu: %d\n", i, value); 25 | } 26 | } 27 | 28 | BCL::finalize(); 29 | return 0; 30 | } 31 | -------------------------------------------------------------------------------- /examples/matrix/Makefile: -------------------------------------------------------------------------------- 1 | # NOTE: uncomment these three lines to use the GASNet-EX backend. 2 | # Note that the environment variable `gasnet_prefix` 3 | # must be set. 4 | # (You also need to set `BACKEND` to `-DBCL_BACKEND_GASNET_EX` below. 5 | # include $(gasnet_prefix)/include/aries-conduit/aries-par.mak 6 | # GASNET_FLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) 7 | # GASNET_BACKEND_LIB_FLAGS = $(GASNET_LDFLAGS) $(GASNET_LIBS) 8 | 9 | # It may be a good idea to set a `BCL_HOME` global environment variable. 10 | # That way this Makefile would work in any directory. 11 | BCL_HOME ?= $(PWD)/../../ 12 | 13 | # Try to pick out whether `CC` exists (for NERSC systems) 14 | # If not, try to use mpic++. 15 | ifeq (, $(shell which CC)) 16 | CXX = mpic++ 17 | else 18 | CXX = CC 19 | endif 20 | 21 | SOURCES += $(wildcard *.cpp) 22 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 23 | 24 | # SHMEM and GASNet-EX tend to perform the best.
// Element-wise callback handed to DMatrix::apply_inplace(): returns twice
// its argument.
float multiply_by_two(float x) {
  const float doubled = x * 2.0f;
  return doubled;
}
// Prints the elements of `v` on a single line, each preceded by a space and
// formatted with two decimals, followed by a newline.
//
// The vector is taken by const reference so const vectors and temporaries
// can be printed as well. Each element is converted to double explicitly
// because "%.2lf" requires a double argument; passing an arbitrary T
// through the varargs would be undefined for non-floating-point T.
template <typename T>
void print_vec(const std::vector<T>& v) {
  for (const T& element : v) {
    printf(" %.2lf", static_cast<double>(element));
  }
  printf("\n");
}
43 | for (size_t i = 0; i < matrix.shape()[0]; i++) { 44 | size_t tile_num = i / matrix.tile_shape()[0]; 45 | size_t row_num = i % matrix.tile_shape()[0]; 46 | printf("Getting row %lu, which should be row %lu within tile %lu:\n", 47 | i, row_num, tile_num); 48 | std::vector row = matrix.get_tile_row({tile_num, 0}, row_num); 49 | print_vec(row); 50 | } 51 | } 52 | 53 | BCL::finalize(); 54 | 55 | return 0; 56 | } 57 | -------------------------------------------------------------------------------- /examples/matrix/matrix_getrow_async.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | template 10 | void print_vec(std::vector& v) { 11 | for (size_t i = 0; i < v.size(); i++) { 12 | printf(" %.2lf", v[i]); 13 | } 14 | printf("\n"); 15 | } 16 | 17 | int main(int argc, char** argv) { 18 | // How big to make each process' shared segment, in MB. 19 | size_t segment_size = 1024; 20 | 21 | BCL::init(segment_size); 22 | 23 | // Create a distributed matrix of size 1024 x 1024 24 | BCL::DMatrix matrix({32, 32}, BCL::BlockRow()); 25 | 26 | // Print information about tile size and distribution. 27 | if (BCL::rank() == 0) { 28 | printf("Just created a %lu x %lu matrix. Here's some info:\n", 29 | matrix.shape()[0], matrix.shape()[1]); 30 | matrix.print_info(); 31 | } 32 | 33 | // Apply the matrix to random values. 34 | srand48(BCL::rank()); 35 | matrix.apply_inplace([](float a) { return drand48(); }); 36 | 37 | // Barrier necessary to ensure all processes are finished 38 | // before rank 0 reads. 39 | BCL::barrier(); 40 | 41 | if (BCL::rank() == 0) { 42 | // Let's get each row of the matrix. 
43 | for (size_t i = 0; i < matrix.shape()[0]; i++) { 44 | size_t tile_num = i / matrix.tile_shape()[0]; 45 | size_t row_num = i % matrix.tile_shape()[0]; 46 | printf("Getting row %lu, which should be row %lu within tile %lu:\n", 47 | i, row_num, tile_num); 48 | auto row_buf = matrix.arget_tile_row(tile_num, 0, row_num); 49 | auto row = row_buf.get(); 50 | print_vec(row); 51 | } 52 | } 53 | 54 | BCL::finalize(); 55 | 56 | return 0; 57 | } -------------------------------------------------------------------------------- /examples/matrix/matrix_row.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | int main(int argc, char** argv) { 9 | // How big to make each process' shared segment, in MB. 10 | size_t segment_size = 1024; 11 | 12 | BCL::init(segment_size); 13 | 14 | // Create a distributed matrix of size 1024 x 1024 15 | BCL::DMatrix matrix({1024, 1024}, BCL::BlockRow()); 16 | // This would mean each row block has a fixed height of 10 17 | // BCL::DMatrix matrix({1024, 1024}, BCL::BlockRow({10, BCL::Tile::div})); 18 | 19 | // Print information about tile size and distribution. 20 | if (BCL::rank() == 0) { 21 | printf("Just created a %lu x %lu matrix. 
Here's some info:\n", 22 | matrix.shape()[0], matrix.shape()[1]); 23 | matrix.print_info(); 24 | } 25 | 26 | BCL::finalize(); 27 | 28 | return 0; 29 | } -------------------------------------------------------------------------------- /examples/matrix/rw_gemm.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "generate_spmat.hpp" 12 | 13 | #include 14 | 15 | int main(int argc, char** argv) { 16 | // How big to make each process' shared segment, in MB. 17 | size_t segment_size = 2048; 18 | 19 | BCL::init(segment_size); 20 | 21 | size_t m, k, n; 22 | 23 | // parameters: [number of samples] [number of categories] [embedding size] [nnz_row] [alpha] 24 | 25 | if (argc >= 4) { 26 | m = std::atoi(argv[1]); 27 | k = std::atoi(argv[2]); 28 | n = std::atoi(argv[3]); 29 | } else { 30 | BCL::finalize(); 31 | return 1; 32 | } 33 | 34 | size_t nnz_per_row = 100; 35 | double alpha = 0.0; 36 | 37 | if (argc >= 5) { 38 | nnz_per_row = std::atoi(argv[4]); 39 | } 40 | 41 | if (argc >= 6) { 42 | alpha = std::atof(argv[5]); 43 | } 44 | 45 | using value_type = float; 46 | using index_type = long long int; 47 | 48 | auto blocks = BCL::block_matmul(m, n, k); 49 | 50 | srand48(BCL::rank()); 51 | BCL::print("Generating matrix (%lu x %lu), alpha %lf, nnz_per_row %lu\n", 52 | m, k, alpha, nnz_per_row); 53 | auto a = BCL::generate_matrix(m, k, nnz_per_row, alpha, std::move(blocks[0])); 54 | 55 | BCL::print("Generated A (%lu x %lu matrix) with %lu nnz\n", 56 | a.shape()[0], a.shape()[1], a.nnz()); 57 | 58 | BCL::DMatrix b({k, n}, std::move(blocks[1])); 59 | BCL::DMatrix c({m, n}, std::move(blocks[2])); 60 | 61 | BCL::print("Multipyling by B (%lu x %lu dense matrix)\n", 62 | b.shape()[0], b.shape()[1]); 63 | 64 | BCL::print("To produce C (%lu x %lu dense matrix)\n", 65 | 
c.shape()[0], c.shape()[1]); 66 | 67 | /* 68 | if (BCL::rank() == 0) { 69 | printf("A:\n"); 70 | a.print_details(); 71 | printf("B:\n"); 72 | b.print_details(); 73 | printf("C:\n"); 74 | c.print_details(); 75 | } 76 | */ 77 | 78 | b = 1; 79 | c = 0; 80 | 81 | BCL::barrier(); 82 | auto begin = std::chrono::high_resolution_clock::now(); 83 | BCL::gemm(a, b, c); 84 | BCL::barrier(); 85 | auto end = std::chrono::high_resolution_clock::now(); 86 | double duration = std::chrono::duration(end - begin).count(); 87 | 88 | BCL::barrier(); 89 | 90 | BCL::print("Matrix Multiply took %lf s\n", duration); 91 | 92 | BCL::print("Comm/comp %lf / %lf\n", BCL::row_comm, duration - BCL::row_comm); 93 | 94 | BCL::print("Sum is %lf\n", c.sum()); 95 | 96 | BCL::finalize(); 97 | 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /examples/matrix/spmm.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | int main(int argc, char** argv) { 12 | // How big to make each process' shared segment, in MB. 
13 | size_t segment_size = 2048; 14 | 15 | BCL::init(segment_size); 16 | 17 | // Number of columns in the matrix B 18 | size_t num_columns = 1024; 19 | 20 | std::string fname = ""; 21 | if (argc >= 2) { 22 | fname = argv[1]; 23 | } else { 24 | BCL::finalize(); 25 | return 1; 26 | } 27 | 28 | if (argc >= 3) { 29 | num_columns = std::atoi(argv[2]); 30 | } 31 | 32 | using value_type = float; 33 | using index_type = long long int; 34 | 35 | BCL::print("Multiplying matrix \"%s\" by %lu columns\n", 36 | fname.c_str(), num_columns); 37 | 38 | 39 | auto matrix_shape = BCL::matrix_io::matrix_info(fname); 40 | size_t m = matrix_shape.shape[0]; 41 | size_t k = matrix_shape.shape[1]; 42 | size_t n = num_columns; 43 | 44 | auto blocks = BCL::block_matmul(m, n, k); 45 | 46 | BCL::SPMatrix a(fname, std::move(blocks[0])); 47 | 48 | BCL::DMatrix b({k, n}, std::move(blocks[1])); 49 | BCL::DMatrix c({k, n}, std::move(blocks[2])); 50 | 51 | b = 1; 52 | c = 0; 53 | 54 | BCL::print("Multiplying...\n"); 55 | BCL::barrier(); 56 | auto begin = std::chrono::high_resolution_clock::now(); 57 | BCL::gemm(a, b, c); 58 | BCL::barrier(); 59 | auto end = std::chrono::high_resolution_clock::now(); 60 | double duration = std::chrono::duration(end - begin).count(); 61 | 62 | BCL::print("Matrix Multiply took %lf s\n", duration); 63 | 64 | BCL::print("Sum is %lf\n", c.sum()); 65 | 66 | BCL::finalize(); 67 | 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /examples/ranges/Makefile: -------------------------------------------------------------------------------- 1 | # NOTE: uncomment these three lines to use the GASNet-EX backend. 2 | # Note that the environment variable `gasnet_prefix` 3 | # must be set. 4 | # (You also need to set `BACKEND` to `-DBCL_BACKEND_GASNET_EX` below. 
5 | # include $(gasnet_prefix)/include/aries-conduit/aries-par.mak 6 | # GASNET_FLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) 7 | # GASNET_BACKEND_LIB_FLAGS = $(GASNET_LDFLAGS) $(GASNET_LIBS) 8 | 9 | # It may be a good idea to set a `BCL_HOME` global environment variable. 10 | # That way this Makefile would work in any directory. 11 | BCL_HOME ?= $(PWD)/../../ 12 | 13 | # Try to pick out whether `CC` exists (for NERSC systems) 14 | # If not, fall back to g++; the MPI compile and link flags are taken from `mpic++ -showme` in the build rule below. 15 | ifeq (, $(shell which CC)) 16 | CXX = g++ 17 | else 18 | CXX = CC 19 | endif 20 | 21 | SOURCES += $(wildcard *.cpp) 22 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 23 | 24 | # SHMEM and GASNet-EX tend to perform the best. 25 | # MPI is default 26 | BACKEND = -DBCL_BACKEND_MPI 27 | # BACKEND = -DBCL_BACKEND_SHMEM 28 | # BACKEND = -DBCL_BACKEND_GASNET_EX $(GASNET_FLAGS) 29 | LD_FLAGS = $(GASNET_BACKEND_LIB_FLAGS) 30 | 31 | CXXFLAGS = -std=c++17 -O3 $(BACKEND) -I$(BCL_HOME) 32 | 33 | all: $(TARGETS) 34 | 35 | %: %.cpp 36 | $(CXX) $(CXXFLAGS) -o $@ $^ $(LD_FLAGS) `mpic++ -showme:compile` `mpic++ -showme:link` 37 | 38 | clean: 39 | rm -fv $(TARGETS) 40 | -------------------------------------------------------------------------------- /examples/rpc/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -lpthread -DBCL_BACKEND_SHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit.
// Non-blocking poll: returns true only if every future in `futures`
// reports std::future_status::ready for a zero-length wait. Stops at the
// first future that is not ready.
template <typename T>
bool ready(std::vector<T>& futures) {
  const auto no_wait = std::chrono::seconds(0);
  for (size_t idx = 0; idx < futures.size(); ++idx) {
    const bool done =
        futures[idx].wait_for(no_wait) == std::future_status::ready;
    if (!done) {
      return false;
    }
  }
  return true;
}
-> int { 39 | return a * b; 40 | }; 41 | int a = 7; 42 | int b = 7; 43 | 44 | 45 | using rv = decltype(BCL::buffered_rpc(0, fn, a, b)); 46 | std::vector futures; 47 | 48 | srand48(BCL::rank()); 49 | auto begin = std::chrono::high_resolution_clock::now(); 50 | size_t rpcs = 1000; 51 | for (size_t i = 0 ; i < rpcs; i++) { 52 | size_t rand_proc = lrand48() % BCL::nprocs(); 53 | auto f = BCL::buffered_rpc(rand_proc, fn, a, b); 54 | futures.push_back(std::move(f)); 55 | } 56 | 57 | BCL::flush_signal(); 58 | 59 | future_barrier(futures); 60 | auto end = std::chrono::high_resolution_clock::now(); 61 | double duration = std::chrono::duration(end - begin).count(); 62 | 63 | BCL::print("%lu buffered RPCs serviced in %lf s\n", rpcs*BCL::nprocs(), duration); 64 | 65 | for (auto& f : futures) { 66 | int val = f.get(); 67 | assert(val == a*b); 68 | } 69 | 70 | BCL::finalize_rpc(); 71 | BCL::finalize(); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /examples/rpc/buffered_rpc_circular.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | template 13 | bool ready(std::vector& futures) { 14 | for (auto& future : futures) { 15 | if (future.wait_for(std::chrono::seconds(0)) != std::future_status::ready) { 16 | return false; 17 | } 18 | } 19 | return true; 20 | } 21 | 22 | template 23 | void future_barrier(std::vector& futures) { 24 | bool success = false; 25 | do { 26 | BCL::flush_rpc(); 27 | size_t success_count = ready(futures); 28 | success_count = BCL::allreduce(success_count, std::plus{}); 29 | success = success_count == BCL::nprocs(); 30 | } while (!success); 31 | } 32 | 33 | int main(int argc, char** argv) { 34 | BCL::init(); 35 | BCL::init_rpc(); 36 | 37 | 38 | auto fn = [](int a, int b) -> int { 
39 | return a * b; 40 | }; 41 | int a = 7; 42 | int b = 7; 43 | 44 | 45 | using rv = decltype(BCL::buffered_rpc(0, fn, a, b)); 46 | std::vector futures; 47 | 48 | srand48(BCL::rank()); 49 | auto begin = std::chrono::high_resolution_clock::now(); 50 | size_t rpcs = 1000; 51 | for (size_t i = 0 ; i < rpcs; i++) { 52 | size_t rand_proc = lrand48() % BCL::nprocs(); 53 | auto f = BCL::buffered_rpc(rand_proc, fn, a, b); 54 | futures.push_back(std::move(f)); 55 | } 56 | 57 | BCL::flush_signal(); 58 | 59 | future_barrier(futures); 60 | auto end = std::chrono::high_resolution_clock::now(); 61 | double duration = std::chrono::duration(end - begin).count(); 62 | 63 | BCL::print("%lu buffered RPCs serviced in %lf s\n", rpcs*BCL::nprocs(), duration); 64 | 65 | for (auto& f : futures) { 66 | int val = f.get(); 67 | assert(val == a*b); 68 | } 69 | 70 | BCL::finalize_rpc(); 71 | BCL::finalize(); 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /examples/simple/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | ## 3 | ## SPDX-License-Identifier: BSD-3-Clause 4 | 5 | add_bcl_test(hello_world hello_world.cpp) 6 | add_bcl_test(global_ptr global_ptr.cpp) 7 | -------------------------------------------------------------------------------- /examples/simple/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DBCL_BACKEND_SHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
20 | include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak 21 | 22 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DBCL_BACKEND_GASNET_EX -I$(BCLROOT) 23 | CXX = mpic++ 24 | 25 | BCL_RUN=mpirun -n 4 26 | else 27 | BACKEND=MPI 28 | BCLFLAGS = -I$(BCLROOT) -DBCL_BACKEND_MPI 29 | CXX=mpic++ 30 | 31 | BCL_RUN=mpirun -n 4 32 | endif 33 | 34 | CXXFLAGS = -std=gnu++17 $(BCLFLAGS) 35 | 36 | SOURCES += $(wildcard *.cpp) 37 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 38 | 39 | all: $(TARGETS) 40 | 41 | %: %.cpp 42 | @echo "C $@ $(BACKEND)" 43 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 44 | 45 | test: all 46 | @for target in $(TARGETS) ; do \ 47 | echo "R $$target $(BACKEND)" ;\ 48 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 49 | done 50 | 51 | clean: 52 | @rm -f $(TARGETS) 53 | -------------------------------------------------------------------------------- /examples/simple/global_ptr.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | The point of this example is to demonstrate basic mechanics 10 | of global pointers. It shows off a couple different methods 11 | for reading and writing to remote pointers: 12 | 13 | 1) Dereferencing a global pointer - ptr[12]. 14 | * Can read and write. 15 | * int local_value = ptr[12]; 16 | * ptr[12] = my_value; 17 | 18 | 2) Calling ptr.local() to get a local pointer. 19 | 20 | 3) BCL::memcpy() 21 | * Works like POSIX memcpy, except one 22 | of the arguments is a global pointer. 23 | 24 | 4) rput and rget 25 | * Fairly similar to UPC++ rget and rput. 26 | * Like memcpy, but typed. 27 | * See /bcl/core/comm.hpp for definitions. 28 | */ 29 | 30 | int main(int argc, char** argv) { 31 | BCL::init(); 32 | 33 | // Need to run this with at least two processes. 
34 | assert(BCL::nprocs() >= 2); 35 | 36 | BCL::GlobalPtr ptr = nullptr; 37 | 38 | if (BCL::rank() == 0) { 39 | ptr = BCL::alloc(BCL::nprocs()); 40 | } 41 | 42 | ptr = BCL::broadcast(ptr, 0); 43 | 44 | // 1) Using pointer dereference 45 | // to write a value. 46 | ptr[BCL::rank()] = BCL::rank(); 47 | 48 | BCL::barrier(); 49 | 50 | if (BCL::rank() == 0) { 51 | // 2) Calling .local() to get a local pointer. 52 | int* local = ptr.local(); 53 | 54 | printf("Rank 0 Sees:\n"); 55 | for (size_t i = 0; i < BCL::nprocs(); i++) { 56 | printf("%lu: %d\n", i, local[i]); 57 | } 58 | } 59 | 60 | BCL::barrier(); 61 | 62 | if (BCL::rank() == 1) { 63 | std::vector zeros(BCL::nprocs(), 0); 64 | // 3) Using BCL::memcpy. 65 | BCL::memcpy(ptr, zeros.data(), sizeof(int)*zeros.size()); 66 | } 67 | 68 | BCL::barrier(); 69 | 70 | // 4) Using rput 71 | BCL::rput((int) BCL::rank(), ptr + BCL::rank()); 72 | 73 | BCL::barrier(); 74 | 75 | if (BCL::rank() == 1) { 76 | printf("Rank 1 Sees:\n"); 77 | for (size_t i = 0; i < BCL::nprocs(); i++) { 78 | // 4) Using rget 79 | printf("%lu: %d\n", i, BCL::rget(ptr + i)); 80 | } 81 | } 82 | 83 | BCL::finalize(); 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /examples/simple/hello_world.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | 7 | int main(int argc, char** argv) { 8 | BCL::init(); 9 | for (std::size_t i = 0; i < BCL::nprocs(); i++) { 10 | if (BCL::rank() == i) { 11 | printf("Hello, BCL! 
SHELL='bash'

# XXX: Modify BCLROOT if you move this Makefile
#      out of a tests/* directory.
BCLROOT=$(PWD)/../../

# Backend selected through the BCL_BACKEND environment variable
# (case-insensitive); anything other than SHMEM or GASNET_EX means MPI.
BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]')

TIMER_CMD=time

ifeq ($(BACKEND),SHMEM)
  BACKEND=SHMEM
  BCLFLAGS = -DSHMEM -I$(BCLROOT)
  CXX=oshc++

  BCL_RUN=oshrun -n 4
else ifeq ($(BACKEND),GASNET_EX)
  BACKEND=GASNET_EX
  # XXX: Allow selection of conduit.
  include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

  BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
  CXX = mpic++

  BCL_RUN=mpirun -n 4
else
  BACKEND=MPI
  BCLFLAGS = -I$(BCLROOT)
  CXX=mpic++

  BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

# These targets do not name files; keep make from being confused by a
# stray file called `all`, `test` or `clean`.
.PHONY: all test clean

all: $(TARGETS)

# A failed build is reported but deliberately does not stop make, so the
# remaining targets still get built.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# Run every test binary and stop at the first failure. The failure branch
# uses a `{ ...; }` group rather than a `( ... )` subshell: `exit 1` inside
# a subshell only ends the subshell, so the loop would carry on and
# `make test` would report the status of the last test only.
test: all
	@for target in $(TARGETS) ; do \
	  echo "R $$target $(BACKEND)" ;\
	  time $(BCL_RUN) ./$$target || { echo "$$target $(BACKEND) FAIL $$?"; exit 1; } ;\
	done

clean:
	@rm -f $(TARGETS)
| 5 | #include 6 | #include 7 | #include 8 | 9 | int main(int argc, char** argv) { 10 | BCL::init(); 11 | BCL::GlobalPtr ptr = nullptr; 12 | 13 | constexpr size_t n_repetitions = 10; 14 | 15 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 16 | if (BCL::rank() == rank) { 17 | ptr = BCL::alloc(BCL::nprocs()); 18 | for (size_t i = 0; i < BCL::nprocs(); i++) { 19 | ptr[i] = 0; 20 | } 21 | } 22 | ptr = BCL::broadcast(ptr, rank); 23 | 24 | for (size_t k = 0; k < n_repetitions; k++) { 25 | for (size_t i = 0; i < BCL::nprocs(); i++) { 26 | int rv = std::numeric_limits::max(); 27 | while (rv != BCL::nprocs()*k + BCL::rank()) { 28 | rv = BCL::compare_and_swap(ptr + i, BCL::nprocs()*k + BCL::rank(), 29 | BCL::nprocs()*k + BCL::rank()+1); 30 | } 31 | } 32 | } 33 | 34 | BCL::barrier(); 35 | 36 | if (BCL::rank() == rank) { 37 | BCL::dealloc(ptr); 38 | } 39 | } 40 | 41 | BCL::finalize(); 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /tests/atomics/fetch_and_add.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | 7 | #include 8 | 9 | int main(int argc, char ** argv) { 10 | BCL::init(); 11 | size_t n_adds = 100; 12 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 13 | BCL::GlobalPtr ptr; 14 | 15 | if (BCL::rank() == rank) { 16 | ptr = BCL::alloc(1); 17 | *ptr = 0; 18 | } 19 | 20 | ptr = BCL::broadcast(ptr, rank); 21 | 22 | for (size_t i = 0; i < n_adds; i++) { 23 | BCL::fetch_and_op(ptr, BCL::rank(), BCL::plus{}); 24 | } 25 | 26 | BCL::barrier(); 27 | 28 | if (BCL::rank() == 0) { 29 | int final_val = *ptr; 30 | 31 | int computed_val = 0; 32 | for (size_t i = 0; i < BCL::nprocs(); i++) { 33 | computed_val += n_adds*i; 34 | } 35 | 36 | assert(computed_val == final_val); 37 | } 38 | 39 | if (BCL::rank() == rank) { 40 | BCL::dealloc(ptr); 41 | } 42 | } 43 
| BCL::finalize(); 44 | return 0; 45 | } 46 | -------------------------------------------------------------------------------- /tests/comm/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DSHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 20 | include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak 21 | 22 | BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT) 23 | CXX = mpic++ 24 | 25 | BCL_RUN=mpirun -n 4 26 | else 27 | BACKEND=MPI 28 | BCLFLAGS = -I$(BCLROOT) 29 | CXX=mpic++ 30 | 31 | BCL_RUN=mpirun -n 4 32 | endif 33 | 34 | CXXFLAGS = -std=gnu++17 $(BCLFLAGS) 35 | 36 | SOURCES += $(wildcard *.cpp) 37 | TARGETS := $(patsubst %.cpp, %, $(SOURCES)) 38 | 39 | all: $(TARGETS) 40 | 41 | %: %.cpp 42 | @echo "C $@ $(BACKEND)" 43 | @time $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL" 44 | 45 | test: all 46 | @for target in $(TARGETS) ; do \ 47 | echo "R $$target $(BACKEND)" ;\ 48 | time $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\ 49 | done 50 | 51 | clean: 52 | @rm -f $(TARGETS) 53 | -------------------------------------------------------------------------------- /tests/comm/arget01.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | int main(int argc, char** argv) { 9 | BCL::init(); 10 | 11 | size_t vec_size = 1000; 12 | 13 | std::vector> 
ptrs(BCL::nprocs()); 14 | 15 | ptrs[BCL::rank()] = BCL::alloc(vec_size); 16 | 17 | for (size_t i = 0; i < vec_size; i++) { 18 | ptrs[BCL::rank()].local()[i] = i; 19 | } 20 | 21 | for (size_t i = 0; i < BCL::nprocs(); i++) { 22 | ptrs[i] = BCL::broadcast(ptrs[i], i); 23 | } 24 | 25 | for (size_t i_ = 0; i_ < BCL::nprocs(); i_++) { 26 | size_t i = (i_ + BCL::rank()) % BCL::nprocs(); 27 | 28 | auto f = BCL::arget(ptrs[i], vec_size); 29 | 30 | auto vec = f.get(); 31 | 32 | assert(vec.size() == vec_size); 33 | 34 | for (size_t j = 0; j < vec.size(); j++) { 35 | assert(vec[j] == j); 36 | } 37 | 38 | for (size_t j = 0; j < vec_size; j++) { 39 | auto f = BCL::arget(&ptrs[i][j]); 40 | assert(f.get() == j); 41 | } 42 | } 43 | 44 | BCL::finalize(); 45 | return 0; 46 | } 47 | -------------------------------------------------------------------------------- /tests/comm/arput01.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | int main(int argc, char** argv) { 10 | BCL::init(); 11 | 12 | size_t vec_size = 1000; 13 | 14 | std::vector> ptrs(BCL::nprocs()); 15 | 16 | ptrs[BCL::rank()] = BCL::alloc(vec_size); 17 | 18 | for (size_t i = 0; i < BCL::nprocs(); i++) { 19 | ptrs[i] = BCL::broadcast(ptrs[i], i); 20 | } 21 | 22 | for (size_t i_ = 0; i_ < BCL::nprocs(); i_++) { 23 | size_t i = (i_ + BCL::rank()) % BCL::nprocs(); 24 | 25 | std::vector vec(vec_size); 26 | 27 | std::iota(vec.begin(), vec.end(), i_); 28 | 29 | auto fut = arput(ptrs[i], std::move(vec)); 30 | 31 | vec = fut.get(); 32 | 33 | BCL::barrier(); 34 | 35 | assert(vec.size() == vec_size); 36 | 37 | for (size_t j = 0; j < vec_size; j++) { 38 | assert(ptrs[BCL::rank()].local()[j] == vec[j]); 39 | } 40 | 41 | BCL::barrier(); 42 | } 43 | 44 | BCL::finalize(); 45 | return 0; 46 | } 47 | 
-------------------------------------------------------------------------------- /tests/comm/eventually_visible01.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | bool is_set(BCL::GlobalPtr ptr) { 9 | for (size_t i = 0; i < BCL::nprocs(); i++) { 10 | int value = ptr[i]; 11 | if (value != i) { 12 | return false; 13 | } 14 | } 15 | return true; 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | BCL::init(); 20 | BCL::GlobalPtr ptr = nullptr; 21 | 22 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 23 | if (BCL::rank() == rank) { 24 | ptr = BCL::alloc(BCL::nprocs()); 25 | } 26 | ptr = BCL::broadcast(ptr, rank); 27 | 28 | ptr[BCL::rank()] = BCL::rank(); 29 | 30 | while (!is_set(ptr)) {} 31 | 32 | BCL::barrier(); 33 | } 34 | 35 | BCL::finalize(); 36 | return 0; 37 | } 38 | -------------------------------------------------------------------------------- /tests/comm/eventually_visible02.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | bool is_set(BCL::GlobalPtr ptr) { 9 | for (size_t i = 0; i < BCL::nprocs(); i++) { 10 | int value = ptr[i]; 11 | if (value != i) { 12 | return false; 13 | } 14 | } 15 | return true; 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | BCL::init(); 20 | BCL::GlobalPtr ptr = nullptr; 21 | 22 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 23 | if (BCL::rank() == rank) { 24 | ptr = BCL::alloc(BCL::nprocs()); 25 | } 26 | ptr = BCL::broadcast(ptr, rank); 27 | 28 | ptr[BCL::rank()] = BCL::rank(); 29 | 30 | if (BCL::rank() == rank) { 31 | while (!is_set(ptr)) {} 32 | } 33 | 34 | BCL::barrier(); 35 | } 36 | 37 | BCL::finalize(); 38 | return 0; 39 | } 40 | 
-------------------------------------------------------------------------------- /tests/comm/eventually_visible03.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | bool is_set(BCL::GlobalPtr ptr, size_t size) { 9 | int* ptr_ = ptr.local(); 10 | for (size_t i = 0; i < BCL::nprocs()*size; i++) { 11 | if (ptr_[i] != i / size) { 12 | return false; 13 | } 14 | } 15 | return true; 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | BCL::init(); 20 | BCL::GlobalPtr ptr = nullptr; 21 | 22 | size_t size = 8*1024; 23 | 24 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 25 | if (BCL::rank() == rank) { 26 | ptr = BCL::alloc(BCL::nprocs() * size); 27 | } 28 | ptr = BCL::broadcast(ptr, rank); 29 | 30 | for (size_t i = size*BCL::rank(); i < size*(BCL::rank()+1); i++) { 31 | ptr[i] = BCL::rank(); 32 | } 33 | 34 | if (BCL::rank() == rank) { 35 | while (!is_set(ptr, size)) {} 36 | } 37 | 38 | BCL::barrier(); 39 | } 40 | 41 | BCL::finalize(); 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /tests/comm/rput01.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | int main(int argc, char** argv) { 9 | BCL::init(); 10 | BCL::GlobalPtr ptr = nullptr; 11 | 12 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 13 | if (BCL::rank() == rank) { 14 | ptr = BCL::alloc(BCL::nprocs()); 15 | } 16 | ptr = BCL::broadcast(ptr, rank); 17 | 18 | BCL::rput(BCL::rank(), ptr + BCL::rank()); 19 | BCL::barrier(); 20 | 21 | if (BCL::rank() == rank) { 22 | for (size_t i = 0; i < BCL::nprocs(); i++) { 23 | size_t recvd = BCL::rget(ptr + i); 24 | size_t recvd_local = *(ptr.local() + i); 25 | 
assert(recvd == i); 26 | assert(recvd_local == i); 27 | } 28 | BCL::dealloc(ptr); 29 | } 30 | } 31 | BCL::finalize(); 32 | return 0; 33 | } 34 | -------------------------------------------------------------------------------- /tests/containers/ChecksumQueue/ChecksumQueue01.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | // XXX: Designed to test phasal queue pushes/pops. 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::ChecksumQueue queue(rank, n_pushes * BCL::nprocs() + 1); 19 | 20 | for (size_t i = 0; i < n_pushes; i++) { 21 | bool success = queue.push(BCL::rank()); 22 | assert(success); 23 | } 24 | 25 | BCL::barrier(); 26 | 27 | std::unordered_map counts; 28 | if (BCL::rank() == rank) { 29 | while (!queue.empty()) { 30 | int val; 31 | bool success = queue.pop(val); 32 | if (success) { 33 | // recovered val 34 | assert(val < BCL::nprocs() && val >= 0); 35 | counts[val]++; 36 | if (counts[val] > n_pushes) { 37 | throw std::runtime_error("BCL::ChecksumQueue01: too many " + 38 | std::to_string(val) + "s"); 39 | } 40 | } 41 | } 42 | 43 | for (auto& c : counts) { 44 | assert(c.second == n_pushes); 45 | } 46 | } 47 | } 48 | BCL::finalize(); 49 | return 0; 50 | } 51 | -------------------------------------------------------------------------------- /tests/containers/ChecksumQueue/ChecksumQueue02.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | // XXX: Designed to test simultaneous multiple pushes and single pops of value. 
12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::ChecksumQueue queue(rank, n_pushes * BCL::nprocs()); 19 | 20 | if (BCL::rank() != rank) { 21 | for (size_t i = 0; i < n_pushes; i++) { 22 | bool success = queue.push(BCL::rank()); 23 | assert(success); 24 | } 25 | } 26 | 27 | std::unordered_map counts; 28 | 29 | if (BCL::rank() == rank) { 30 | for (size_t i = 0; i < (BCL::nprocs() - 1)*n_pushes; i++) { 31 | bool success; 32 | int val; 33 | do { 34 | success = queue.pop(val); 35 | } while (!success); 36 | assert(val < BCL::nprocs() && val >= 0); 37 | counts[val]++; 38 | if (counts[val] > n_pushes) { 39 | throw std::runtime_error("BCL::ChecksumQueue02: " + std::to_string(rank) 40 | + " saw too many " + 41 | std::to_string(val) + "s"); 42 | } 43 | } 44 | 45 | for (auto& c : counts) { 46 | if (c.second != n_pushes) { 47 | throw std::runtime_error("BCL::ChecksumQueue02: found " + 48 | std::to_string(c.second) + " != " + 49 | std::to_string(n_pushes) + " pushes for " + 50 | std::to_string(c.first)); 51 | } 52 | } 53 | } 54 | BCL::barrier(); 55 | } 56 | 57 | BCL::finalize(); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /tests/containers/ChecksumQueue/ChecksumQueue03.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | // XXX: Designed to test simultaneous multiple pushes and single pops of vectors. 
14 | 15 | int main(int argc, char** argv) { 16 | BCL::init(); 17 | 18 | size_t n_pushes = 10000; 19 | size_t push_size = 10; 20 | 21 | constexpr bool print_verbose = false; 22 | 23 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 24 | if (print_verbose) { 25 | BCL::print("Rank %lu\n", rank); 26 | } 27 | BCL::ChecksumQueue queue(rank, 100); 28 | 29 | if (BCL::rank() != rank) { 30 | for (size_t i = 0; i < n_pushes; i++) { 31 | std::vector vec(push_size, BCL::rank()); 32 | BCL::Backoff backoff; 33 | while (!queue.push(vec, true)) { 34 | // backoff.backoff(); 35 | } 36 | } 37 | } 38 | 39 | std::unordered_map counts; 40 | 41 | if (BCL::rank() == rank) { 42 | for (size_t i = 0; i < (BCL::nprocs() - 1)*n_pushes*push_size; i++) { 43 | int val; 44 | BCL::Backoff backoff; 45 | while (!queue.pop(val)) { 46 | // backoff.backoff(); 47 | } 48 | assert(val < BCL::nprocs() && val >= 0); 49 | counts[val]++; 50 | if (counts[val] > n_pushes*push_size) { 51 | throw std::runtime_error("BCL::ChecksumQueue03: " + std::to_string(rank) 52 | + " saw too many " + 53 | std::to_string(val) + "s"); 54 | } 55 | } 56 | 57 | for (auto& c : counts) { 58 | if (c.second != n_pushes*push_size) { 59 | throw std::runtime_error("BCL::ChecksumQueue03: found " + 60 | std::to_string(c.second) + " != " + 61 | std::to_string(n_pushes) + " pushes for " + 62 | std::to_string(c.first)); 63 | } 64 | } 65 | } 66 | 67 | if (print_verbose) { 68 | fprintf(stderr, "(%lu) DONE\n", BCL::rank()); 69 | } 70 | BCL::barrier(); 71 | } 72 | 73 | BCL::finalize(); 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /tests/containers/ChecksumQueue/ChecksumQueue04.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | // XXX: Designed to test simultaneous multiple 
async_pushes and single pops. 12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::ChecksumQueue queue(rank, n_pushes * BCL::nprocs()); 19 | 20 | if (BCL::rank() != rank) { 21 | for (size_t i = 0; i < n_pushes; i++) { 22 | auto future = queue.async_push({int(BCL::rank())}); 23 | BCL::Backoff backoff; 24 | while (!future.is_ready()) { 25 | backoff.backoff(); 26 | } 27 | } 28 | } 29 | 30 | std::unordered_map counts; 31 | 32 | if (BCL::rank() == rank) { 33 | for (size_t i = 0; i < (BCL::nprocs() - 1)*n_pushes; i++) { 34 | bool success; 35 | int val; 36 | do { 37 | success = queue.pop(val); 38 | } while (!success); 39 | assert(val < BCL::nprocs() && val >= 0); 40 | counts[val]++; 41 | if (counts[val] > n_pushes) { 42 | throw std::runtime_error("BCL::ChecksumQueue04: " + std::to_string(rank) 43 | + " saw too many " + 44 | std::to_string(val) + "s"); 45 | } 46 | } 47 | 48 | for (auto& c : counts) { 49 | if (c.second != n_pushes) { 50 | throw std::runtime_error("BCL::ChecksumQueue04: found " + 51 | std::to_string(c.second) + " != " + 52 | std::to_string(n_pushes) + " pushes for " + 53 | std::to_string(c.first)); 54 | } 55 | } 56 | } 57 | BCL::barrier(); 58 | } 59 | 60 | BCL::finalize(); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /tests/containers/ChecksumQueue/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # XXX: Modify BCLROOT if you move this Makefile 3 | # out of an examples/* directory. 4 | BCLROOT=$(PWD)/../../../ 5 | 6 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 7 | 8 | TIMER_CMD=time 9 | 10 | ifeq ($(BACKEND),SHMEM) 11 | BACKEND=SHMEM 12 | BCLFLAGS = -DSHMEM -I$(BCLROOT) 13 | CXX=oshc++ 14 | 15 | BCL_RUN=oshrun -n 4 16 | else ifeq ($(BACKEND),GASNET_EX) 17 | BACKEND=GASNET_EX 18 | # XXX: Allow selection of conduit. 
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
CXX = mpic++

BCL_RUN=mpirun -n 4
else
# Any other (or unset) backend falls back to MPI.
BACKEND=MPI
BCLFLAGS = -I$(BCLROOT)
CXX=mpic++

BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

# One executable per .cpp file in this directory.
SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

all: $(TARGETS)

# A failed build only prints a "BUILD FAIL" marker, so the remaining targets
# are still built.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@$(TIMER_CMD) $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# Run every test under the backend's launcher, printing a "FAIL" marker for
# each one that exits non-zero.
test: all
	@for target in $(TARGETS) ; do \
	  echo "R $$target $(BACKEND)" ;\
	  $(TIMER_CMD) $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\
	done

clean:
	@rm -f $(TARGETS)
--------------------------------------------------------------------------------
/tests/containers/CircularQueue/CircularQueue01.cpp:
--------------------------------------------------------------------------------
// SPDX-FileCopyrightText: 2021 Benjamin Brock
//
// SPDX-License-Identifier: BSD-3-Clause

// NOTE(review): #include targets were lost in extraction and are
// reconstructed here from usage - confirm against upstream.
#include <cassert>
#include <unordered_map>

#include <bcl/bcl.hpp>
#include <bcl/containers/CircularQueue.hpp>

// XXX: Designed to test phasal queue pushes/pops.
12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::CircularQueue queue(rank, n_pushes * BCL::nprocs()); 19 | 20 | for (size_t i = 0; i < n_pushes; i++) { 21 | queue.push(BCL::rank(), BCL::CircularQueueAL::push); 22 | } 23 | 24 | BCL::barrier(); 25 | 26 | std::unordered_map counts; 27 | if (BCL::rank() == rank) { 28 | while (!queue.empty()) { 29 | int val; 30 | bool success = queue.pop(val, BCL::CircularQueueAL::none); 31 | if (success) { 32 | // recovered val 33 | assert(val < BCL::nprocs() && val >= 0); 34 | counts[val]++; 35 | if (counts[val] > n_pushes) { 36 | throw std::runtime_error("BCL::CircularQueue01: too many " + 37 | std::to_string(val) + "s"); 38 | } 39 | } 40 | } 41 | 42 | for (auto& c : counts) { 43 | assert(c.second == n_pushes); 44 | } 45 | } 46 | } 47 | BCL::finalize(); 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /tests/containers/CircularQueue/CircularQueue02.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | // XXX: Designed to test simultaneous multiple pushes and single pops of value. 
12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::CircularQueue queue(rank, n_pushes * BCL::nprocs()); 19 | 20 | if (BCL::rank() != rank) { 21 | for (size_t i = 0; i < n_pushes; i++) { 22 | bool success = queue.push(BCL::rank()); 23 | assert(success); 24 | } 25 | } 26 | 27 | std::unordered_map counts; 28 | 29 | if (BCL::rank() == rank) { 30 | for (size_t i = 0; i < (BCL::nprocs() - 1)*n_pushes; i++) { 31 | bool success; 32 | int val; 33 | do { 34 | success = queue.pop(val); 35 | } while (!success); 36 | assert(val < BCL::nprocs() && val >= 0); 37 | counts[val]++; 38 | if (counts[val] > n_pushes) { 39 | throw std::runtime_error("BCL::CircularQueue02: " + std::to_string(rank) 40 | + " saw too many " + 41 | std::to_string(val) + "s"); 42 | } 43 | } 44 | 45 | for (auto& c : counts) { 46 | if (c.second != n_pushes) { 47 | throw std::runtime_error("BCL::CircularQueue02: found " + 48 | std::to_string(c.second) + " != " + 49 | std::to_string(n_pushes) + " pushes for " + 50 | std::to_string(c.first)); 51 | } 52 | } 53 | } 54 | BCL::barrier(); 55 | } 56 | 57 | BCL::finalize(); 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /tests/containers/CircularQueue/CircularQueue04.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | // XXX: Designed to test simultaneous multiple async_pushes and single pops. 
12 | 13 | int main(int argc, char** argv) { 14 | BCL::init(); 15 | 16 | size_t n_pushes = 1000; 17 | for (size_t rank = 0; rank < BCL::nprocs(); rank++) { 18 | BCL::CircularQueue queue(rank, n_pushes * BCL::nprocs()); 19 | 20 | if (BCL::rank() != rank) { 21 | for (size_t i = 0; i < n_pushes; i++) { 22 | auto future = queue.async_push({int(BCL::rank())}); 23 | BCL::Backoff backoff; 24 | while (!future.is_ready()) { 25 | backoff.backoff(); 26 | } 27 | } 28 | } 29 | 30 | std::unordered_map counts; 31 | 32 | if (BCL::rank() == rank) { 33 | for (size_t i = 0; i < (BCL::nprocs() - 1)*n_pushes; i++) { 34 | bool success; 35 | int val; 36 | do { 37 | success = queue.pop(val); 38 | } while (!success); 39 | assert(val < BCL::nprocs() && val >= 0); 40 | counts[val]++; 41 | if (counts[val] > n_pushes) { 42 | throw std::runtime_error("BCL::CircularQueue04: " + std::to_string(rank) 43 | + " saw too many " + 44 | std::to_string(val) + "s"); 45 | } 46 | } 47 | 48 | for (auto& c : counts) { 49 | if (c.second != n_pushes) { 50 | throw std::runtime_error("BCL::CircularQueue04: found " + 51 | std::to_string(c.second) + " != " + 52 | std::to_string(n_pushes) + " pushes for " + 53 | std::to_string(c.first)); 54 | } 55 | } 56 | } 57 | BCL::barrier(); 58 | } 59 | 60 | BCL::finalize(); 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /tests/containers/CircularQueue/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DSHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
CXX = mpic++

BCL_RUN=mpirun -n 4
else
# Any other (or unset) backend falls back to MPI.
BACKEND=MPI
BCLFLAGS = -I$(BCLROOT)
CXX=mpic++

BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

# One executable per .cpp file in this directory.
SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

all: $(TARGETS)

# A failed build only prints a "BUILD FAIL" marker, so the remaining targets
# are still built.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@$(TIMER_CMD) $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# Run every test under the backend's launcher, printing a "FAIL" marker for
# each one that exits non-zero.
test: all
	@for target in $(TARGETS) ; do \
	  echo "R $$target $(BACKEND)" ;\
	  $(TIMER_CMD) $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\
	done

clean:
	@rm -f $(TARGETS)
--------------------------------------------------------------------------------
/tests/containers/CircularQueue/run_many_times.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# SPDX-FileCopyrightText: 2021 Benjamin Brock
#
# SPDX-License-Identifier: BSD-3-Clause

# Runs the CircularQueue03 test ten times in a row on 4 processes.
# NOTE(review): no CircularQueue03.cpp is visible next to this script in the
# listing seen here - confirm the binary name.
for i in {1..10}
do
  echo run ${i}
  mpirun -n 4 ./CircularQueue03
  echo
done
--------------------------------------------------------------------------------
/tests/containers/CircularQueue/simplify_mpi_issue.cpp:
--------------------------------------------------------------------------------
// SPDX-FileCopyrightText: 2021 Benjamin Brock
//
// SPDX-License-Identifier: BSD-3-Clause

// NOTE(review): #include targets were lost in extraction and are
// reconstructed here from usage - confirm against upstream.
#include <cassert>
#include <unordered_map>

#include <bcl/bcl.hpp>
#include <bcl/containers/experimental/ChecksumQueue.hpp>

#include <bcl/core/util/Backoff.hpp>
/*
 * Be problematic only when rank=0
 * CircularQueue and SafeChecksumQueue both have the issue.
15 | */ 16 | int main(int argc, char** argv) { 17 | BCL::init(); 18 | fprintf(stderr, "I am rank %lu out of %lu procs; tag1\n", BCL::rank(), BCL::nprocs()); 19 | 20 | size_t n_pushes = 10; 21 | 22 | size_t rank = 0; 23 | fprintf(stderr, "I am rank %lu out of %lu procs; tag2\n", BCL::rank(), BCL::nprocs()); 24 | BCL::ChecksumQueue queue(0, 100); 25 | fprintf(stderr, "I am rank %lu out of %lu procs; tag3\n", BCL::rank(), BCL::nprocs()); 26 | 27 | fprintf(stderr, "I am rank %lu out of %lu procs; tag4\n", BCL::rank(), BCL::nprocs()); 28 | 29 | if (BCL::rank() != rank) { 30 | for (size_t i = 0; i < n_pushes; i++) { 31 | int val = 1; 32 | bool success = queue.push(val, true); 33 | assert(success); 34 | } 35 | } 36 | 37 | // BCL::barrier(); // adding this barrier fixes the issue 38 | 39 | if (BCL::rank() == rank) { 40 | std::unordered_map counts; 41 | 42 | for (size_t i = 0; i < (BCL::nprocs()-1)*n_pushes; i++) { 43 | int val; 44 | BCL::Backoff backoff; // it doesn't relate to the issue 45 | while (!queue.pop(val)) { 46 | backoff.backoff(); 47 | } 48 | assert(val == 1); 49 | } 50 | } 51 | 52 | fprintf(stderr, "rank %lu done\n", BCL::rank()); 53 | BCL::finalize(); 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /tests/containers/HashMap/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -DSHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
CXX = mpic++

BCL_RUN=mpirun -n 4
else
# Any other (or unset) backend falls back to MPI.
BACKEND=MPI
BCLFLAGS = -I$(BCLROOT)
CXX=mpic++

BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

# One executable per .cpp file in this directory.
SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

all: $(TARGETS)

# A failed build only prints a "BUILD FAIL" marker, so the remaining targets
# are still built.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@$(TIMER_CMD) $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# Run every test under the backend's launcher, printing a "FAIL" marker for
# each one that exits non-zero.
test: all
	@for target in $(TARGETS) ; do \
	  echo "R $$target $(BACKEND)" ;\
	  $(TIMER_CMD) $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\
	done

clean:
	@rm -f $(TARGETS)
--------------------------------------------------------------------------------
/tests/containers/HashMap/find_atomic.cpp:
--------------------------------------------------------------------------------
// SPDX-FileCopyrightText: 2021 Benjamin Brock
//
// SPDX-License-Identifier: BSD-3-Clause

// NOTE(review): #include targets and template arguments were lost in
// extraction and are reconstructed here from usage - confirm against upstream.
#include <array>
#include <cassert>
#include <cstdlib>

#include <bcl/bcl.hpp>
#include <bcl/containers/HashMap.hpp>

// Every rank stores an array filled with its own rank under its rank as key.
// After a barrier each rank looks up every key, checks that the array it gets
// back is uniform (all elements equal), and then overwrites that key with its
// own array.
int main(int argc, char** argv) {
  BCL::init();

  srand48(BCL::rank());

  // NOTE(review): the std::array arguments were lost in extraction. The element
  // type follows from fill(BCL::rank()); the extent below is a PLACEHOLDER -
  // restore the upstream value.
  using array_t = std::array<int, 1024>;

  BCL::HashMap<int, array_t> map(1000);

  array_t my_array;

  my_array.fill(BCL::rank());

  auto result = map.insert_or_assign(BCL::rank(), my_array);
  bool success = result.second;
  assert(success);

  BCL::barrier();

  for (size_t i = 0; i < BCL::nprocs(); i++) {
    auto result = map.find(static_cast<int>(i));
    assert(result != map.end());
    array_t remote_array = *result;

    // Inner index renamed so it no longer shadows the outer loop's `i`.
    for (size_t j = 0; j < remote_array.size(); j++) {
      if (j > 0) {
        // NOTE(review): when asserts are enabled this aborts before the
        // diagnostic below can be printed.
        assert(remote_array[j] == remote_array[j-1]);
        if (remote_array[j] != remote_array[j-1]) {
          fprintf(stderr, "Remote
array not consistent.\n"); 41 | } 42 | } 43 | } 44 | 45 | success = map.insert_or_assign(i, my_array).second; 46 | assert(success); 47 | } 48 | 49 | BCL::finalize(); 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /tests/containers/HashMap/insert_find.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | int main(int argc, char** argv) { 12 | BCL::init(); 13 | 14 | BCL::HashMap map(1000); 15 | 16 | auto result = map.insert_or_assign(std::to_string(BCL::rank()), BCL::rank()); 17 | bool success = result.second; 18 | assert(success); 19 | 20 | BCL::barrier(); 21 | 22 | if (BCL::rank() == 0) { 23 | for (size_t i = 0; i < BCL::nprocs(); i++) { 24 | int value; 25 | auto val = map.find(std::to_string(i)); 26 | assert(success); 27 | } 28 | } 29 | 30 | BCL::finalize(); 31 | return 0; 32 | } 33 | -------------------------------------------------------------------------------- /tests/containers/experimental/rpc/Makefile: -------------------------------------------------------------------------------- 1 | SHELL='bash' 2 | 3 | # XXX: Modify BCLROOT if you move this Makefile 4 | # out of an examples/* directory. 5 | BCLROOT=$(PWD)/../../../../ 6 | 7 | BACKEND = $(shell echo $(BCL_BACKEND) | tr '[:lower:]' '[:upper:]') 8 | 9 | TIMER_CMD=time 10 | 11 | ifeq ($(BACKEND),SHMEM) 12 | BACKEND=SHMEM 13 | BCLFLAGS = -lpthread -DSHMEM -I$(BCLROOT) 14 | CXX=oshc++ 15 | 16 | BCL_RUN=oshrun -n 4 17 | else ifeq ($(BACKEND),GASNET_EX) 18 | BACKEND=GASNET_EX 19 | # XXX: Allow selection of conduit. 
include $(gasnet_prefix)/include/mpi-conduit/mpi-par.mak

BCLFLAGS = -lpthread $(GASNET_CXXCPPFLAGS) $(GASNET_CXXFLAGS) $(GASNET_LDFLAGS) $(GASNET_LIBS) -DGASNET_EX -I$(BCLROOT)
CXX = mpic++

BCL_RUN=mpirun -n 4
else
# Any other (or unset) backend falls back to MPI.
BACKEND=MPI
BCLFLAGS = -lpthread -I$(BCLROOT)
CXX=mpic++

BCL_RUN=mpirun -n 4
endif

CXXFLAGS = -std=gnu++17 $(BCLFLAGS)

# One executable per .cpp file in this directory.
SOURCES += $(wildcard *.cpp)
TARGETS := $(patsubst %.cpp, %, $(SOURCES))

all: $(TARGETS)

# A failed build only prints a "BUILD FAIL" marker, so the remaining targets
# are still built.
%: %.cpp
	@echo "C $@ $(BACKEND)"
	@$(TIMER_CMD) $(CXX) -o $@ $^ $(CXXFLAGS) || echo "$@ $(BACKEND) BUILD FAIL"

# The RPC tests are built for every backend but deliberately not run under MPI.
ifeq ($(BACKEND),MPI)
test: all
	@echo "Not running RPC tests with MPI"
else
test: all
	@for target in $(TARGETS) ; do \
	  echo "R $$target $(BACKEND)" ;\
	  $(TIMER_CMD) $(BCL_RUN) ./$$target || (echo "$$target $(BACKEND) FAIL $$?"; exit 1) ;\
	done
endif

clean:
	@rm -f $(TARGETS)
--------------------------------------------------------------------------------
/tests/containers/experimental/rpc/rpc1.cpp:
--------------------------------------------------------------------------------
// SPDX-FileCopyrightText: 2021 Benjamin Brock
//
// SPDX-License-Identifier: BSD-3-Clause

// NOTE(review): #include targets and template arguments were lost in
// extraction and are reconstructed here from usage - confirm against upstream.
#include <cassert>
#include <chrono>
#include <vector>

#include <bcl/bcl.hpp>
#include <bcl/containers/experimental/rpc.hpp>

// Returns true when every future in `futures` is already ready (polled with a
// zero timeout, so this never blocks). An empty vector counts as ready.
template <typename Future>
bool ready(std::vector<Future>& futures) {
  for (auto& future : futures) {
    if (future.wait_for(std::chrono::seconds(0)) != std::future_status::ready) {
      return false;
    }
  }
  return true;
}

// Collective: keeps flushing buffered RPCs until every rank reports that all
// of its own futures are ready.
template <typename Future>
void future_barrier(std::vector<Future>& futures) {
  bool success = false;
  do {
    BCL::flush_rpc();
    // 1 if this rank's futures are all ready, else 0; summed over all ranks.
    size_t success_count = ready(futures);
    // NOTE(review): the std::plus template argument is reconstructed - confirm.
    success_count = BCL::allreduce(success_count, std::plus<size_t>{});
    success = success_count == BCL::nprocs();
  } while (!success);
}

int main(int argc, char** argv) {
  BCL::init();
BCL::init_rpc(); 36 | 37 | 38 | auto fn = [](int a, int b) -> int { 39 | return a * b; 40 | }; 41 | int a = 7; 42 | int b = 7; 43 | 44 | 45 | using rv = decltype(BCL::buffered_rpc(0, fn, a, b)); 46 | std::vector futures; 47 | if (BCL::rank() == 1) { 48 | for (int i = 0 ; i < 1000; i++) { 49 | auto f = BCL::buffered_rpc(0, fn, a, b); 50 | futures.push_back(std::move(f)); 51 | } 52 | } 53 | 54 | BCL::flush_signal(); 55 | 56 | future_barrier(futures); 57 | 58 | for (auto& f : futures) { 59 | int val = f.get(); 60 | assert(val == a*b); 61 | } 62 | 63 | BCL::finalize_rpc(); 64 | BCL::finalize(); 65 | return 0; 66 | } 67 | -------------------------------------------------------------------------------- /tests/containers/experimental/rpc/rpc_checksum1.cpp: -------------------------------------------------------------------------------- 1 | // SPDX-FileCopyrightText: 2021 Benjamin Brock 2 | // 3 | // SPDX-License-Identifier: BSD-3-Clause 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | template 14 | bool ready(std::vector& futures) { 15 | for (auto& future : futures) { 16 | if (future.wait_for(std::chrono::seconds(0)) != std::future_status::ready) { 17 | return false; 18 | } 19 | } 20 | return true; 21 | } 22 | 23 | template 24 | void future_barrier(std::vector& futures) { 25 | bool success = false; 26 | do { 27 | BCL::flush_rpc(); 28 | size_t success_count = ready(futures); 29 | success_count = BCL::allreduce(success_count, std::plus{}); 30 | success = success_count == BCL::nprocs(); 31 | } while (!success); 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | BCL::init(); 36 | BCL::init_rpc(); 37 | 38 | 39 | auto fn = [](int a, int b) -> int { 40 | return a * b; 41 | }; 42 | int a = 7; 43 | int b = 7; 44 | 45 | 46 | using rv = decltype(BCL::buffered_rpc(0, fn, a, b)); 47 | std::vector futures; 48 | if (BCL::rank() == 1) { 49 | for (int i = 0 ; i < 1000; i++) { 50 | auto f = BCL::buffered_rpc(0, fn, a, b); 51 | 
futures.push_back(std::move(f)); 52 | } 53 | } 54 | 55 | BCL::flush_signal(); 56 | 57 | future_barrier(futures); 58 | 59 | for (auto& f : futures) { 60 | int val = f.get(); 61 | assert(val == a*b); 62 | } 63 | 64 | BCL::finalize_rpc(); 65 | BCL::finalize(); 66 | 67 | return 0; 68 | } 69 | -------------------------------------------------------------------------------- /tests/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | base_dir=$PWD 4 | 5 | user_backends=${BASH_ARGV[*]} 6 | 7 | if hash gmake 2> /dev/null 8 | then 9 | make_cmd='gmake' 10 | else 11 | make_cmd='make' 12 | fi 13 | 14 | if [ "$user_backends" == 'all' ] || [ "$user_backends" == '' ] 15 | then 16 | backends='mpi shmem gasnet_ex' 17 | else 18 | backends=$user_backends 19 | fi 20 | 21 | echo "Testing backends $backends" 22 | 23 | rv=0 24 | 25 | for backend in $backends 26 | do 27 | for i in $(find . -iname "Makefile") 28 | do 29 | cd $(dirname $i) 30 | BCL_BACKEND=$backend $make_cmd clean test | tee log.dat 31 | grep "FAIL" log.dat 32 | 33 | return_value=$? 34 | 35 | if [ $return_value -ne 1 ] 36 | then 37 | echo "Just FAIL'D test." 38 | rv=1 39 | fi 40 | rm log.dat 41 | cd $base_dir 42 | done 43 | done 44 | 45 | exit $rv 46 | --------------------------------------------------------------------------------