├── python └── bagua_core │ ├── version.py │ ├── __init__.py │ └── _environment.py ├── bagua-core-internal ├── third_party │ └── cub-1.8.0 │ │ ├── tune │ │ ├── .gitignore │ │ └── Makefile │ │ ├── experimental │ │ ├── .gitignore │ │ ├── spmv_script.sh │ │ ├── histogram │ │ │ ├── histogram_cub.h │ │ │ ├── histogram_gmem_atomics.h │ │ │ └── histogram_smem_atomics.h │ │ └── Makefile │ │ ├── .settings │ │ ├── .gitignore │ │ ├── org.eclipse.cdt.ui.prefs │ │ └── org.eclipse.core.runtime.prefs │ │ ├── test │ │ ├── .gitignore │ │ ├── link_main.cpp │ │ ├── link_a.cu │ │ ├── link_b.cu │ │ ├── test_grid_barrier.cu │ │ └── mersenne.h │ │ ├── examples │ │ ├── block │ │ │ ├── .gitignore │ │ │ ├── reduce_by_key.cu │ │ │ └── Makefile │ │ └── device │ │ │ ├── .gitignore │ │ │ ├── example_device_reduce.cu │ │ │ └── example_device_scan.cu │ │ ├── .project │ │ ├── LICENSE.TXT │ │ ├── cub │ │ ├── util_namespace.cuh │ │ ├── block │ │ │ ├── specializations │ │ │ │ └── block_histogram_atomic.cuh │ │ │ └── block_raking_layout.cuh │ │ ├── cub.cuh │ │ ├── util_macro.cuh │ │ ├── grid │ │ │ ├── grid_mapping.cuh │ │ │ └── grid_barrier.cuh │ │ ├── host │ │ │ └── mutex.cuh │ │ ├── thread │ │ │ ├── thread_search.cuh │ │ │ └── thread_reduce.cuh │ │ ├── util_debug.cuh │ │ ├── util_arch.cuh │ │ └── iterator │ │ │ ├── discard_output_iterator.cuh │ │ │ └── counting_input_iterator.cuh │ │ └── README.md ├── src │ ├── cuda_utils.rs │ ├── comm_ops │ │ ├── mod.rs │ │ ├── python_ffi_op.rs │ │ ├── centralized_full_precision_synchronous.rs │ │ ├── centralized_low_precision_synchronous.rs │ │ ├── decentralized_full_precision_synchronous.rs │ │ ├── decentralized_full_precision_asynchronous.rs │ │ └── decentralized_low_precision_synchronous.rs │ ├── events.rs │ ├── resource_pool │ │ └── mod.rs │ └── kernels │ │ └── mod.rs ├── cpp │ └── include │ │ └── bagua_utils.h ├── Cargo.toml └── build.rs ├── Cargo.toml ├── .gitignore ├── .gitmodules ├── pyproject.toml ├── README.md ├── .github ├── workflows │ ├── 
check-package-install.yml │ ├── rustfmt.yml │ └── pypi-publish.yml └── dependabot.yml ├── MANIFEST.in ├── bagua-opentelemetry ├── Cargo.toml └── src │ ├── lib.rs │ └── exporter │ ├── agent.rs │ └── mod.rs ├── bagua-core-c ├── Cargo.toml └── src │ └── lib.rs ├── bagua-core-py └── Cargo.toml ├── LICENSE ├── CHANGELOG.md └── setup.py /python/bagua_core/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "dev" 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/tune/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /language.settings.xml 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /link_main.obj 3 | /dummy/ 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | workspace = { members = [ 2 | "bagua-core-internal", 3 | "bagua-core-py", 4 | "bagua-opentelemetry", 5 | ], exclude = [] } 6 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /Release 4 | /cuda55.sdf 5 | /cuda55.suo 6 | /cuda60.sdf 7 | /cuda60.suo 8 | -------------------------------------------------------------------------------- /python/bagua_core/__init__.py: -------------------------------------------------------------------------------- 1 | from . import _environment 2 | _environment._preload_libraries() 3 | from .version import __version__ 4 | from .bagua_core import * 5 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_B40C 3 | formatter_settings_version=1 4 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /ipch 4 | /Release 5 | /cuda55.sdf 6 | /cuda55.suo 7 | /cuda60.sdf 8 | /cuda60.suo 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | /data/ 4 | scratchpad.org 5 | build.sh 6 | push.sh 7 | __pycache__/ 8 | *.egg-info/ 9 | /dist/ 10 | /.eggs/ 11 | /build/ 12 | .data/ 13 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bagua-core-internal/third_party/Aluminum"] 2 | path = bagua-core-internal/third_party/Aluminum 3 | url = https://github.com/BaguaSys/Aluminum.git 4 | branch = bagua 5 | -------------------------------------------------------------------------------- 
/bagua-core-internal/third_party/cub-1.8.0/test/link_main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | extern void a(); 4 | extern void b(); 5 | 6 | int main() 7 | { 8 | printf("hello world\n"); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /python/bagua_core/_environment.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import os 3 | 4 | 5 | def _preload_libraries(): 6 | cwd = os.path.dirname(os.path.abspath(__file__)) 7 | libnccl_path = os.path.join(cwd, ".data", "lib", "libnccl.so") 8 | ctypes.CDLL(libnccl_path) 9 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs: -------------------------------------------------------------------------------- 1 | content-types/enabled=true 2 | content-types/org.eclipse.cdt.core.cxxHeader/file-extensions=cuh 3 | content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu 4 | eclipse.preferences.version=1 5 | -------------------------------------------------------------------------------- /bagua-core-internal/src/cuda_utils.rs: -------------------------------------------------------------------------------- 1 | pub unsafe fn cuda_memcpy_device_to_host_sync(host_ptr: u64, device_ptr: u64, num_bytes: i32) { 2 | cpp::cpp!([host_ptr as "void*", device_ptr as "void*", num_bytes as "int"] 3 | { 4 | CUDACHECK(cudaMemcpy(host_ptr, device_ptr, num_bytes, cudaMemcpyDeviceToHost)); 5 | }); 6 | } 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 43.0.0", "wheel", "setuptools-rust", "colorama", "tqdm", "setuptools_scm[toml]>=6.0"] 3 | build-backend = 
'setuptools.build_meta' 4 | 5 | [tool.setuptools_scm] 6 | local_scheme = "no-local-version" 7 | write_to = "python/bagua_core/version.py" 8 | write_to_template = "__version__ = \"{version}\"" 9 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/link_a.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void a() 4 | { 5 | printf("a() called\n"); 6 | 7 | cub::DoubleBuffer d_keys; 8 | cub::DoubleBuffer d_values; 9 | size_t temp_storage_bytes = 0; 10 | cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); 11 | } 12 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/link_b.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void b() 4 | { 5 | printf("b() called\n"); 6 | 7 | cub::DoubleBuffer d_keys; 8 | cub::DoubleBuffer d_values; 9 | size_t temp_storage_bytes = 0; 10 | cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); 11 | } 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `bagua-core` has been merged into [Bagua](https://github.com/BaguaSys/bagua)! 2 | ===== 3 | 4 | [![PyPI version](https://badge.fury.io/py/bagua-core.svg)](https://badge.fury.io/py/bagua-core) [![GitHub license](https://img.shields.io/github/license/BaguaSys/bagua-core)](https://github.com/BaguaSys/bagua-core/blob/master/LICENSE) 5 | 6 | The core communication logic for [Bagua](https://github.com/BaguaSys/bagua). 
7 | 8 | * PyPI: https://pypi.org/project/bagua-core/ 9 | -------------------------------------------------------------------------------- /.github/workflows/check-package-install.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: check package install 10 | 11 | jobs: 12 | check: 13 | runs-on: ubuntu-latest 14 | container: ikzk/bagua-ci:latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | with: 18 | submodules: recursive 19 | - run: rustup default stable 20 | - run: pip install . 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/rustfmt.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: Rustfmt 10 | 11 | jobs: 12 | format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions-rs/toolchain@v1 17 | with: 18 | toolchain: stable 19 | components: rustfmt 20 | override: true 21 | - uses: mbrobbel/rustfmt-check@master 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Cargo.toml 2 | include bagua-core-internal/Cargo.toml 3 | include bagua-core-internal/build.rs 4 | recursive-include bagua-core-internal/src * 5 | recursive-include bagua-core-internal/kernels * 6 | recursive-include bagua-core-internal/cpp * 7 | recursive-include bagua-core-internal/third_party * 8 | include bagua-core-py/Cargo.toml 9 | include bagua-core-py/build.rs 10 | recursive-include bagua-core-py/src * 11 | include bagua-core-c/Cargo.toml 12 | include bagua-core-c/build.rs 13 | recursive-include bagua-core-c/src * 14 | recursive-include python * 15 | exclude bagua-core-internal/third_party/Aluminum/.git 16 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod centralized_full_precision_synchronous; 2 | pub mod centralized_low_precision_synchronous; 3 | pub mod decentralized_full_precision_asynchronous; 4 | pub mod decentralized_full_precision_synchronous; 5 | pub mod decentralized_low_precision_synchronous; 6 | pub mod python_ffi_op; 7 | 8 | use crate::datatypes::BaguaBucket; 9 | use crate::{BaguaCommOpChannels, BaguaCoreError}; 10 | use std::fmt::Debug; 11 | use std::sync::Arc; 12 | 13 | pub trait CommOpTrait: Debug { 14 | fn execute_background_communication( 15 | &self, 16 | bucket: Arc, 17 | comm_channels: &BaguaCommOpChannels, 18 | ); 19 | } 20 | -------------------------------------------------------------------------------- /bagua-opentelemetry/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-opentelemetry" 3 | version = "0.1.0" 4 | edition = "2018" 5 | publish = ["private"] 6 | 7 | [dependencies] 8 | tracing = "0.1" 9 | async-std 
= { version = "1.10", features = ["attributes", "tokio1"] } 10 | async-trait = { version = "0.1" } 11 | hyper = { version = "0.14", features = ["full"] } 12 | opentelemetry = { version = "0.15", default-features = false, features = [ 13 | "trace", 14 | "rt-async-std", 15 | ] } 16 | serde = { version = "1.0", features = ["derive"] } 17 | serde_json = "1.0" 18 | reqwest = { version = "0.11", features = ["json"] } 19 | tokio = { version = "1", features = ["full"] } 20 | futures = { version = "0.3" } 21 | 22 | tokio-stream = { version = "0.1" } 23 | -------------------------------------------------------------------------------- /bagua-core-c/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-c" 3 | version = "0.1.2" 4 | edition = "2018" 5 | 6 | [lib] 7 | name = "bagua_comm_core_c" 8 | crate-type = ["dylib"] 9 | path = "src/lib.rs" 10 | 11 | [dependencies] 12 | bagua-core-internal = { path = "../bagua-core-internal" } 13 | tracing = "0.1" 14 | tracing-subscriber = "0.2" 15 | anyhow = "1.0" 16 | color-eyre = "0.5" 17 | numpy = "0.14.1" 18 | parking_lot = { version = "0.11", features = ["deadlock_detection"] } 19 | openssl-sys = { version = "*", features = ["vendored"] } 20 | 21 | [dependencies.pyo3] 22 | version = "0.14.1" 23 | features = ["extension-module"] 24 | 25 | [build-dependencies] 26 | shadow-rs = "0.6" 27 | cpp_build = "0.5" 28 | cc = "1.0" 29 | cmd_lib = "1.0" 30 | which = "4.2" 31 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/python_ffi_op.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::datatypes::BaguaBucket; 3 | use crate::BaguaCommOpChannels; 4 | use pyo3::Python; 5 | use std::sync::Arc; 6 | 7 | #[derive(Debug)] 8 | pub struct PythonFFIOp { 9 | pub py_callable: pyo3::Py, 10 | } 11 | 12 | impl CommOpTrait for PythonFFIOp 
{ 13 | fn execute_background_communication( 14 | &self, 15 | bucket: Arc, 16 | _comm_op_channels: &BaguaCommOpChannels, 17 | ) { 18 | Python::with_gil(|python| { 19 | let result = self.py_callable.call1(python, (bucket.name.as_str(),)); 20 | if let Err(e) = result { 21 | tracing::error!("python ffi op error: {:?}", e); 22 | } 23 | }); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod exporter; 2 | 3 | use crate::exporter::agent::AgentAsyncClientHTTP; 4 | use crate::exporter::Exporter; 5 | use opentelemetry; 6 | use opentelemetry::{global, sdk, trace::Tracer, trace::TracerProvider}; 7 | 8 | pub fn init_tracer(autotune_server_addr: &str) -> impl Tracer { 9 | let exporter = Exporter { 10 | uploader: AgentAsyncClientHTTP::new(autotune_server_addr.to_string()), 11 | }; 12 | 13 | let builder = sdk::trace::TracerProvider::builder() 14 | .with_batch_exporter(exporter, opentelemetry::runtime::AsyncStd); 15 | 16 | let tracer_provider = builder.build(); 17 | let tracer = tracer_provider.get_tracer("bagua-opentelemetry", Some(env!("CARGO_PKG_VERSION"))); 18 | let _ = global::set_tracer_provider(tracer_provider); 19 | 20 | tracer 21 | } 22 | -------------------------------------------------------------------------------- /bagua-core-py/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-py" 3 | version = "0.1.2" 4 | authors = ["Xiangru Lian "] 5 | edition = "2018" 6 | publish = ["private"] 7 | 8 | [lib] 9 | name = "bagua_core_py" 10 | crate-type = ["cdylib"] 11 | path = "src/lib.rs" 12 | 13 | [dependencies] 14 | bagua-core-internal = { path = "../bagua-core-internal" } 15 | ndarray = "0.15.3" 16 | tracing = "0.1" 17 | tracing-subscriber = "0.2" 18 | anyhow = "1.0" 19 | color-eyre = "0.5" 20 | numpy = "0.14.1" 21 | parking_lot = 
{ version = "0.11", features = ["deadlock_detection"] } 22 | openssl-sys = { version = "*", features = ["vendored"] } 23 | num-traits = "0.2" 24 | num-derive = "0.3" 25 | 26 | [dependencies.pyo3] 27 | version = "0.14.5" 28 | features = ["extension-module"] 29 | 30 | [build-dependencies] 31 | shadow-rs = "0.7" 32 | cpp_build = "0.5" 33 | cc = "1.0" 34 | cmd_lib = "1.0" 35 | which = "4.2" 36 | -------------------------------------------------------------------------------- /bagua-core-internal/src/events.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::Mutex; 2 | use std::sync::Arc; 3 | 4 | #[derive(Clone, Debug)] 5 | pub struct BaguaEventChannel { 6 | pub name: String, 7 | inner: Arc<(Mutex, parking_lot::Condvar)>, 8 | } 9 | 10 | impl BaguaEventChannel { 11 | pub fn new(name: &str) -> Self { 12 | Self { 13 | name: name.to_string(), 14 | inner: Arc::new((Mutex::new(false), parking_lot::Condvar::new())), 15 | } 16 | } 17 | 18 | pub fn finish(&self) { 19 | let &(ref lock, ref cvar) = &*self.inner; 20 | let mut finished = lock.lock(); 21 | *finished = true; 22 | cvar.notify_all(); 23 | } 24 | 25 | pub fn wait(&self) { 26 | let &(ref lock, ref cvar) = &*self.inner; 27 | let mut finished = lock.lock(); 28 | if !*finished { 29 | cvar.wait(&mut finished); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/spmv_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216 4 | do 5 | echo `date`, `$1 --dense=$i $2 $3 $4 $5 $6 $7` 6 | done 7 | 8 | echo 9 | echo 10 | 11 | for i in `ls /home/dumerrill/graphs/spmv/*.mtx` 12 | do 13 | if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
14 | then 15 | echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null` 16 | fi 17 | done 18 | 19 | echo 20 | echo 21 | 22 | for i in `ls /scratch/dumerrill/graphs/mtx/*.mtx` 23 | #for i in `ls /cygdrive/w/Dev/UFget/mtx/*.mtx` 24 | do 25 | if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 26 | then 27 | echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null` 28 | fi 29 | done 30 | 31 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | GIT_CUB 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 24 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 25 | org.eclipse.cdt.core.ccnature 26 | 27 | 28 | -------------------------------------------------------------------------------- /bagua-core-internal/cpp/include/bagua_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __BAGUA_UTILS_HPP__ 3 | #define __BAGUA_UTILS_HPP__ 4 | 5 | #define CUDACHECK(cmd) do { cudaError_t e = cmd; if( e != cudaSuccess ) { printf("Failed: Cuda error %s:%d '%s'\n", __FILE__,__LINE__,cudaGetErrorString(e)); exit(EXIT_FAILURE); } } while(0) 6 | #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) 7 | 8 | #define ALIGN_SIZE(size, align) (((size) + (align) - 1) / (align) * (align)) 9 | #define DIVUP(x, y) (((x)+(y)-1)/(y)) 10 | 11 | #include 12 | 13 | 14 | ncclResult_t ncclAllToAll(void *sendbuf, void *recvbuf, 15 
| size_t count, 16 | ncclDataType_t datatype, 17 | ncclComm_t comm, 18 | int nranks, 19 | int rank, 20 | cudaStream_t stream); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kuaishou AI Platform & DS3 Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bagua-core-internal/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-internal" 3 | version = "0.1.2" 4 | authors = ["Xiangru Lian "] 5 | edition = "2018" 6 | publish = ["private"] 7 | build = "build.rs" 8 | 9 | [dependencies] 10 | tracing = "0.1" 11 | tracing-subscriber = "0.2" 12 | thiserror = "1" 13 | base64 = "0.13" 14 | itertools = "0.10" 15 | shadow-rs = "0.7" 16 | parking_lot = { version = "0.11", features = ["deadlock_detection"] } 17 | hashbrown = "0.11" 18 | flume = "0.10" 19 | derivative = "2.2.0" 20 | oneshot = "0.1" 21 | cpp = "0.5" 22 | sized-object-pool = "0.2" 23 | dynamic-pool = "0.2" 24 | once_cell = "1.7" 25 | ndarray = "0.15.3" 26 | serde = { version = "1", features = ["derive"] } 27 | scheduled-thread-pool = "0.2" 28 | serde_json = "1.0" 29 | ureq = "2.2" 30 | num-traits = "0.2" 31 | num-derive = "0.3" 32 | display_utils = "0.4.0" 33 | opentelemetry = { version = "0.15", features = ["serialize", "metrics"] } 34 | bagua-opentelemetry = { path = "../bagua-opentelemetry" } 35 | 36 | [dependencies.pyo3] 37 | version = "0.14.5" 38 | features = ["auto-initialize"] 39 | 40 | [build-dependencies] 41 | shadow-rs = "0.7" 42 | cmake = "0.1" 43 | cpp_build = "0.5" 44 | cc = "1.0" 45 | cmd_lib = "1.0" 46 | which = "4.2" 47 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: pypi publish 10 | 11 | jobs: 12 | publish: 13 | runs-on: ubuntu-latest 14 | container: ikzk/bagua-ci:latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | with: 18 | fetch-depth: 0 19 | submodules: recursive 20 | - run: env BAGUA_NO_INSTALL_DEPS=1 python -m build -s 21 | - 
name: Publish a Python distribution to PyPI 22 | if: github.ref == 'refs/heads/master' && !startsWith(github.ref, 'refs/tags') 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | user: __token__ 26 | password: ${{ secrets.PYPI_API_TOKEN }} 27 | - name: Publish package 28 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 29 | uses: pypa/gh-action-pypi-publish@release/v1 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | - name: Archive package artifacts 34 | uses: actions/upload-artifact@v2 35 | with: 36 | name: bagua-pypi-package 37 | path: | 38 | dist/* 39 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/exporter/agent.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Serialize, Deserialize, Clone, Debug, Hash)] 4 | pub struct BaguaSpan { 5 | pub trace_id: u128, 6 | pub action: String, 7 | pub tensor_name: String, 8 | pub start_time: u128, 9 | pub end_time: u128, 10 | } 11 | 12 | #[derive(Serialize, Deserialize, Clone, Debug, Hash)] 13 | pub struct BaguaBatch { 14 | pub spans: Vec, 15 | } 16 | 17 | #[derive(Debug)] 18 | pub struct AgentAsyncClientHTTP { 19 | server_addr: String, 20 | client: reqwest::Client, 21 | } 22 | 23 | impl AgentAsyncClientHTTP { 24 | pub fn new(server_addr: String) -> AgentAsyncClientHTTP { 25 | Self { 26 | server_addr: server_addr, 27 | client: reqwest::Client::new(), 28 | } 29 | } 30 | 31 | pub async fn emit_batch( 32 | &mut self, 33 | batch: BaguaBatch, 34 | ) -> Result { 35 | let uri = format!( 36 | "http://{}/api/v1/report_tensor_execution_order", 37 | self.server_addr 38 | ); 39 | 40 | let resp = self.client.post(uri).json(&batch).send().await?; 41 | 42 | Ok(resp) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/LICENSE.TXT: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the NVIDIA CORPORATION nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/reduce_by_key.cu: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | 5 | 6 | template < 7 | int BLOCK_THREADS, ///< Number of CTA threads 8 | typename KeyT, ///< Key type 9 | typename ValueT> ///< Value type 10 | __global__ void Kernel() 11 | { 12 | // Tuple type for scanning (pairs accumulated segment-value with segment-index) 13 | typedef cub::KeyValuePair OffsetValuePairT; 14 | 15 | // Reduce-value-by-segment scan operator 16 | typedef cub::ReduceBySegmentOp ReduceBySegmentOpT; 17 | 18 | // Parameterized BlockDiscontinuity type for setting head flags 19 | typedef cub::BlockDiscontinuity< 20 | KeyT, 21 | BLOCK_THREADS> 22 | BlockDiscontinuityKeysT; 23 | 24 | // Parameterized BlockScan type 25 | typedef cub::BlockScan< 26 | OffsetValuePairT, 27 | BLOCK_THREADS, 28 | cub::BLOCK_SCAN_WARP_SCANS> 29 | BlockScanT; 30 | 31 | // Shared memory 32 | __shared__ union TempStorage 33 | { 34 | typename BlockScanT::TempStorage scan; // Scan storage 35 | typename BlockDiscontinuityKeysT::TempStorage discontinuity; // Discontinuity storage 36 | } temp_storage; 37 | 38 | 39 | // Read data (each thread gets 3 items each, every 9 items is a segment) 40 | KeyT my_keys[3] = {threadIdx.x / 3, threadIdx.x / 3, threadIdx.x / 3}; 41 | ValueT my_values[3] = {1, 1, 1}; 42 | 43 | // Set head segment head flags 44 | int my_flags[3]; 45 | BlockDiscontinuityKeysT(temp_storage.discontinuity).FlagHeads( 46 | my_flags, 47 | my_keys, 48 | cub::Inequality()); 49 | 50 | __syncthreads(); 51 | 52 | 53 | 54 | 55 | 56 | 57 | } 58 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/exporter/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod agent; 2 | 3 | use crate::exporter::agent::{AgentAsyncClientHTTP, 
BaguaBatch, BaguaSpan}; 4 | use async_trait::async_trait; 5 | use opentelemetry::{sdk::export::trace, Key}; 6 | use reqwest::StatusCode; 7 | use std::time::UNIX_EPOCH; 8 | 9 | #[derive(Debug)] 10 | pub struct Exporter { 11 | pub uploader: AgentAsyncClientHTTP, 12 | } 13 | 14 | #[async_trait] 15 | impl trace::SpanExporter for Exporter { 16 | async fn export(&mut self, batch: Vec) -> trace::ExportResult { 17 | let mut bagua_spans = Vec::new(); 18 | for span in batch { 19 | let bagua_span = BaguaSpan { 20 | trace_id: span.span_context.trace_id().to_u128(), 21 | action: span.name.into_owned(), 22 | tensor_name: span 23 | .attributes 24 | .get(&Key::new("tensor_name")) 25 | .unwrap() 26 | .as_str() 27 | .to_string(), 28 | start_time: span 29 | .start_time 30 | .duration_since(UNIX_EPOCH) 31 | .unwrap() 32 | .as_millis(), 33 | end_time: span 34 | .end_time 35 | .duration_since(UNIX_EPOCH) 36 | .unwrap() 37 | .as_millis(), 38 | }; 39 | 40 | bagua_spans.push(bagua_span); 41 | } 42 | 43 | let resp = self 44 | .uploader 45 | .emit_batch(BaguaBatch { spans: bagua_spans }) 46 | .await; 47 | match resp { 48 | Ok(resp) => { 49 | if resp.status() != StatusCode::OK { 50 | tracing::warn!("upload bagua span failed, resp={:?}", resp); 51 | } 52 | } 53 | Err(err) => { 54 | tracing::warn!("upload bagua span failed, err={:?}", err); 55 | } 56 | } 57 | 58 | Ok(()) 59 | } 60 | 61 | fn shutdown(&mut self) {} 62 | } 63 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## [0.4.1] - 2021-08-14 4 | 5 | ### Features 6 | 7 | - Add opentelemetry to report tensor ready order (#42) 8 | 9 | 10 | ## [0.4.0] - 2021-07-23 11 | 12 | ### Features 13 | 14 | - Better debug log including tensor info when executing ops 15 | - Make full precision decentralized op stateless (#36) 16 | 17 | 18 | ## [0.3.1] - 2021-07-01 19 | 20 | ### Bug Fixes 21 | 22 | - Always 
mark bagua padding tensor as ready 23 | 24 | 25 | ## [0.3.0] - 2021-07-01 26 | 27 | ### Bug Fixes 28 | 29 | - Fix decompress incorrect pointer and typo in error msg 30 | - Fix python gil deadlock during getting data ptr 31 | 32 | ### Features 33 | 34 | - Replace NCCL with Aluminum (#7) 35 | - Support creating BaguaTensor by passing torch tensor directly (#19) 36 | - Compatible mode for getting pytorch tensor info with Python interpreter 37 | 38 | 39 | ## [0.2.0] - 2021-06-17 40 | 41 | ### Features 42 | 43 | - Initial support for python op (#2) 44 | 45 | 46 | ## [0.1.3] - 2021-06-17 47 | 48 | ### Bug Fixes 49 | 50 | - Move import bagua_install_library to install library function 51 | - Merge bagua_install_library and setup.py, remove nccl<=2.6 support 52 | 53 | 54 | ## [0.1.2] - 2021-06-17 55 | 56 | ### Features 57 | 58 | - Add version.py placeholder to prevent file not found error 59 | 60 | 61 | ## [0.1.1] - 2021-06-10 62 | 63 | ### Bug Fixes 64 | 65 | - Only run publish once on git tag 66 | 67 | ### Features 68 | 69 | - Install nccl deps in bagua core and add generated __version__ variable 70 | 71 | 72 | ## [0.1.0] - 2021-06-10 73 | 74 | ### Bug Fixes 75 | 76 | - Fix ci pypi versioning 77 | - Remove __init__.py and python __version__, use cargo version 78 | 79 | ### Features 80 | 81 | - Initial commit of bagua core impl 82 | - Add python packaging related files 83 | - Only publish pypi for master commits 84 | - Add __version__ variable 85 | 86 | 87 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /bagua-core-c/src/lib.rs: -------------------------------------------------------------------------------- 1 | use bagua_core_internal::communicators::BaguaSingleCommunicator; 2 | use std::ffi::CStr; 3 | use std::os::raw::c_char; 4 | 5 | pub struct BaguaSingleCommunicatorC { 6 | inner: BaguaSingleCommunicator, 7 | } 8 | 9 | pub extern "C" fn bagua_single_communicator_c_create( 10 | rank: usize, 11 | nranks: usize, 12 | device_id: usize, 13 | stream_ptr: u64, 14 | nccl_unique_id_str: *const c_char, 15 | ) -> *mut BaguaSingleCommunicatorC { 16 | let obj = BaguaSingleCommunicatorC { 17 | inner: bagua_core_internal::communicators::BaguaSingleCommunicator::new( 18 | rank, 19 | nranks, 20 | device_id, 21 | stream_ptr, 22 | unsafe { CStr::from_ptr(nccl_unique_id_str).to_str().unwrap() }, 23 | ), 24 | }; 25 | 26 | // into_raw turns the Box into a *mut, which the borrow checker 27 | // ignores, without calling its destructor. 28 | Box::into_raw(Box::new(obj)) 29 | } 30 | 31 | pub extern "C" fn bagua_single_communicator_c_destroy(ptr: &mut *mut BaguaSingleCommunicatorC) { 32 | // First, we **must** check to see if the pointer is null. 33 | if ptr.is_null() { 34 | // Do nothing. 35 | return; 36 | } 37 | 38 | // Now we know the pointer is non-null, we can continue. from_raw is the 39 | // inverse of into_raw: it turns the *mut Dramatic back into a 40 | // Box. You must only call from_raw once per pointer. 
41 | let obj: Box = unsafe { Box::from_raw(*ptr) }; 42 | 43 | // We don't *have* to do anything else; once obj goes out of scope, it will 44 | // be dropped. I'm going to drop it explicitly, however, for clarity. 45 | drop(obj); 46 | 47 | // I am, however, going to null out the `ptr` we were passed just so the 48 | // calling code is less likely to accidentally re-use the pointer. 49 | *ptr = ::std::ptr::null_mut(); 50 | } 51 | 52 | /// Error code 53 | /// 0: success 54 | /// -1: null pointer 55 | pub extern "C" fn bagua_single_communicator_c_nranks( 56 | ptr: &mut *mut BaguaSingleCommunicatorC, 57 | nranks: *mut usize, 58 | ) -> i32 { 59 | // First, we **must** check to see if the pointer is null. 60 | if ptr.is_null() { 61 | // Do nothing. 62 | return -1; 63 | } 64 | 65 | unsafe { 66 | *nranks = (*(*ptr)).inner.nranks(); 67 | } 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/centralized_full_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::BaguaCommunicator; 3 | use crate::datatypes::{BaguaBucket, BaguaReductionOp, BaguaTensorRaw, RawBaguaTensor}; 4 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 5 | use crate::BaguaCommOpChannels; 6 | use std::sync::Arc; 7 | 8 | #[derive(Debug)] 9 | pub struct CentralizedFullPrecisionSynchronous { 10 | pub communicator: BaguaCommunicator, 11 | /// whether divide world_size after allreduce sum op 12 | pub average: bool, 13 | pub scattergather: bool, 14 | } 15 | 16 | impl CommOpTrait for CentralizedFullPrecisionSynchronous { 17 | fn execute_background_communication( 18 | &self, 19 | bucket: Arc, 20 | _comm_op_channels: &BaguaCommOpChannels, 21 | ) { 22 | let bucket = bucket.inner.lock(); 23 | let stream_ptr = self.communicator.stream_ptr(); 24 | let mut communication_tensor = 
bucket.get_communication_tensor(stream_ptr, false, false); 25 | self.communicator.execute_communication( 26 | &mut communication_tensor, 27 | self.average, 28 | true, 29 | true, 30 | &mut |c, t| { 31 | tracing::debug!("internode communication started"); 32 | if self.scattergather { 33 | tracing::debug!("start alltoall"); 34 | c.alltoall_inplace(&mut t.raw); 35 | tracing::debug!("start reduce_sum"); 36 | if self.average { 37 | t.raw.reduce_mean_inplace(c.nranks, c.rank, c.stream_ptr); 38 | } else { 39 | t.raw.reduce_sum_inplace(c.nranks, c.rank, c.stream_ptr); 40 | } 41 | tracing::debug!("start allgather"); 42 | c.allgather_inplace(&mut t.raw); 43 | tracing::debug!("internode communication done") 44 | } else { 45 | tracing::debug!("start allreduce"); 46 | if self.average { 47 | c.allreduce_inplace(&mut t.raw, BaguaReductionOp::AVG); 48 | } else { 49 | c.allreduce_inplace(&mut t.raw, BaguaReductionOp::SUM); 50 | } 51 | tracing::debug!("internode communication done"); 52 | } 53 | }, 54 | ); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /bagua-core-internal/src/resource_pool/mod.rs: -------------------------------------------------------------------------------- 1 | use dynamic_pool::DynamicPool; 2 | use sized_object_pool::{DynamicReset, SizedAllocatable, SizedPool}; 3 | 4 | #[derive(Debug)] 5 | pub struct CudaMemory { 6 | pub ptr: u64, 7 | pub num_bytes: usize, 8 | } 9 | 10 | impl CudaMemory { 11 | pub fn new(bytes: usize) -> Self { 12 | let ptr = unsafe { 13 | cpp::cpp!([bytes as "size_t"] -> u64 as "void *" 14 | { 15 | int *ptr = 0; 16 | CUDACHECK(cudaMalloc(&ptr, bytes)); 17 | return ptr; 18 | }) 19 | }; 20 | Self { 21 | ptr, 22 | num_bytes: bytes, 23 | } 24 | } 25 | } 26 | 27 | impl Drop for CudaMemory { 28 | fn drop(&mut self) { 29 | let ptr = self.ptr; 30 | unsafe { 31 | cpp::cpp!([ptr as "void *"] 32 | { 33 | CUDACHECK(cudaFree(ptr)); 34 | }) 35 | }; 36 | } 37 | } 38 | 39 | impl SizedAllocatable for 
CudaMemory { 40 | fn new(size: usize) -> Self { 41 | Self::new(size) 42 | } 43 | 44 | fn size(&self) -> usize { 45 | self.num_bytes 46 | } 47 | } 48 | 49 | impl DynamicReset for CudaMemory { 50 | fn reset(&mut self) {} 51 | } 52 | 53 | pub static CUDA_DEVICE_MEMORY_POOL: once_cell::sync::Lazy>> = 54 | once_cell::sync::Lazy::new(|| { 55 | let mut pools = Vec::new(); 56 | for _ in 0..64 { 57 | pools.push(SizedPool::::new(0, 40, 2048)) 58 | } 59 | pools 60 | }); 61 | 62 | #[derive(Debug)] 63 | pub struct CudaEvent { 64 | pub event: u64, 65 | } 66 | 67 | impl CudaEvent { 68 | pub fn new() -> Self { 69 | let event = unsafe { 70 | cpp::cpp!([] -> u64 as "cudaEvent_t" 71 | { 72 | cudaEvent_t event = 0; 73 | CUDACHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); 74 | return event; 75 | }) 76 | }; 77 | Self { event } 78 | } 79 | } 80 | 81 | impl Drop for CudaEvent { 82 | fn drop(&mut self) { 83 | let event = self.event; 84 | unsafe { 85 | cpp::cpp!([event as "cudaEvent_t"] 86 | { 87 | CUDACHECK(cudaEventDestroy(event)); 88 | }) 89 | }; 90 | } 91 | } 92 | 93 | impl DynamicReset for CudaEvent { 94 | fn reset(&mut self) {} 95 | } 96 | 97 | pub static CUDA_EVENT_POOL: once_cell::sync::Lazy> = 98 | once_cell::sync::Lazy::new(|| DynamicPool::new(0, 10, CudaEvent::new)); 99 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/centralized_low_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::BaguaCommunicator; 3 | use crate::datatypes::{BaguaBucket, BaguaTensorRaw, RawBaguaTensor, TensorCompressionMethod}; 4 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 5 | use crate::BaguaCommOpChannels; 6 | use std::sync::Arc; 7 | 8 | #[derive(Debug)] 9 | pub struct CentralizedLowPrecisionSynchronous { 10 | pub communicator: BaguaCommunicator, 11 | /// whether divide world_size after 
allreduce sum op 12 | pub average: bool, 13 | pub compression_method: TensorCompressionMethod, 14 | } 15 | 16 | impl CommOpTrait for CentralizedLowPrecisionSynchronous { 17 | fn execute_background_communication( 18 | &self, 19 | bucket: Arc, 20 | _comm_op_channels: &BaguaCommOpChannels, 21 | ) { 22 | let bucket = bucket.inner.lock(); 23 | let stream_ptr = self.communicator.stream_ptr(); 24 | let mut communication_tensor = bucket.get_communication_tensor(stream_ptr, false, false); 25 | self.communicator.execute_communication( 26 | &mut communication_tensor, 27 | self.average, 28 | true, 29 | true, 30 | &mut |c, t| { 31 | tracing::debug!("start compress"); 32 | let mut compressed_tensor = t 33 | .raw 34 | .compress(&self.compression_method, c.nranks, c.stream_ptr, -1) 35 | .expect("cannot compress tensor"); 36 | tracing::debug!("start alltoall"); 37 | c.alltoall_inplace(compressed_tensor.as_mut()); 38 | tracing::debug!("start decompress"); 39 | t.raw.decompress_from( 40 | &self.compression_method, 41 | c.nranks, 42 | compressed_tensor.as_ref(), 43 | c.stream_ptr, 44 | ); 45 | tracing::debug!("start reduce_sum"); 46 | if self.average { 47 | t.raw.reduce_mean_inplace(c.nranks, c.rank, c.stream_ptr); 48 | } else { 49 | t.raw.reduce_sum_inplace(c.nranks, c.rank, c.stream_ptr); 50 | } 51 | tracing::debug!("start compress"); 52 | let mut compressed_tensor = t 53 | .raw 54 | .compress( 55 | &self.compression_method, 56 | c.nranks, 57 | c.stream_ptr, 58 | c.rank as _, 59 | ) 60 | .expect("cannot compress tensor"); 61 | tracing::debug!("start allgather"); 62 | c.allgather_inplace(compressed_tensor.as_mut()); 63 | tracing::debug!("start decompress"); 64 | t.raw.decompress_from( 65 | &self.compression_method, 66 | c.nranks, 67 | compressed_tensor.as_ref(), 68 | c.stream_ptr, 69 | ); 70 | tracing::debug!("internode communication done"); 71 | }, 72 | ); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- 
/bagua-core-internal/third_party/cub-1.8.0/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. 
All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include 
"iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 
1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_full_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::{BaguaCommunicator, BaguaHierarchicalCommunicator, NCCLGroupGuard}; 3 | use crate::datatypes::{ 4 | BaguaBucket, BaguaReductionOp, BaguaTensor, BaguaTensorRaw, RawBaguaTensor, 5 | }; 6 | use crate::events::BaguaEventChannel; 7 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 8 | use crate::{BaguaCommOpChannels, BaguaScheduledCommOp}; 9 | use parking_lot::Mutex; 10 | use std::sync::Arc; 11 | 12 | #[derive(Clone, Debug)] 13 | pub enum PeerSelectionMode { 14 | All, 15 | ShiftOne, 16 | Ring, 17 | } 18 | 19 | #[derive(Debug)] 20 | pub struct DecentralizedFullPrecisionSynchronous { 21 | pub communicator: BaguaCommunicator, 22 | pub peer_selection_mode: PeerSelectionMode, 23 | pub step: Mutex, 24 | pub peer_weight: BaguaTensor, 25 | } 26 | 27 | impl CommOpTrait for DecentralizedFullPrecisionSynchronous { 28 | fn execute_background_communication( 29 | &self, 30 | bucket: Arc, 31 | comm_op_channels: &BaguaCommOpChannels, 32 | ) { 33 | let bucket_guard = bucket.inner.lock(); 34 | let stream_ptr = self.communicator.stream_ptr(); 35 | 36 | let mut communication_tensor = match &self.communicator { 37 | BaguaCommunicator::SingleCommunicator(_) => { 38 | bucket_guard.get_communication_tensor(stream_ptr, false, false) 39 | } 40 | BaguaCommunicator::HierarchicalCommunicator(x) => match x { 41 | BaguaHierarchicalCommunicator::Leader(_) => { 42 | bucket_guard.get_communication_tensor(stream_ptr, true, true) 43 | } 44 | BaguaHierarchicalCommunicator::Worker(_) => { 45 | bucket_guard.get_communication_tensor(stream_ptr, false, 
false) 46 | } 47 | }, 48 | }; 49 | 50 | let peer_mode = &self.peer_selection_mode; 51 | 52 | let mut peer_guard = self.peer_weight.inner.write(); 53 | let mut peer_tensor = peer_guard.raw.as_mut(); 54 | let step = { *self.step.lock() } as i64; 55 | 56 | self.communicator.execute_communication( 57 | &mut communication_tensor, 58 | true, 59 | true, 60 | false, 61 | &mut |c, t| { 62 | match peer_mode { 63 | PeerSelectionMode::All => { 64 | { 65 | peer_tensor.clone_from(&t.raw, c.stream_ptr); 66 | let _guard = NCCLGroupGuard::new(); 67 | c.allreduce_inplace(peer_tensor, BaguaReductionOp::AVG); 68 | } 69 | } 70 | PeerSelectionMode::ShiftOne => { 71 | assert_eq!( 72 | c.nranks % 2, 73 | 0, 74 | "you cannot use decentralized algorithm with average_all off when there are odd number of ranks, current n_ranks {}", 75 | c.nranks 76 | ); 77 | let rank = c.rank as i64; 78 | let nranks = c.nranks as i64; 79 | let peer_rank = if c.rank < c.nranks / 2 { 80 | ((step + rank) % ((nranks + 1) / 2)) + (nranks / 2) 81 | } else { 82 | (rank - (nranks / 2) - step).rem_euclid(nranks / 2) 83 | } as i32; 84 | tracing::debug!("rank {} peer_rank {}", c.rank, peer_rank); 85 | { 86 | let _guard = NCCLGroupGuard::new(); 87 | c.send(&t.raw, peer_rank); 88 | c.recv(peer_tensor, peer_rank); 89 | } 90 | peer_tensor.average_inplace(&t.raw, c.stream_ptr); 91 | }, 92 | PeerSelectionMode::Ring => { 93 | unimplemented!() 94 | }, 95 | } 96 | }, 97 | ); 98 | 99 | *self.step.lock() += 1; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /bagua-core-internal/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let nvcc_path = which::which("nvcc") 3 | .expect("Cannot find nvcc, please install CUDA Toolkit and make sure nvcc is in your PATH first. 
See https://developer.nvidia.com/cuda-downloads"); 4 | let cuda_home = nvcc_path 5 | .parent() 6 | .expect("cannot find nvcc parent directory") 7 | .parent() 8 | .expect("cannot find nvcc parent directory") 9 | .display(); 10 | let supported_sms = cmd_lib::run_fun!( 11 | bash -c "nvcc --help | sed -n -e '/gpu-architecture /,/gpu-code / p' | sed -n -e '/Allowed values/,/gpu-code / p' | grep -i sm_ | grep -Eo 'sm_[0-9]+' | sed -e s/sm_//g | sort -g -u | tr '\n' ' '" 12 | ).unwrap(); 13 | let supported_sms = supported_sms.strip_suffix(' ').unwrap().split(' '); 14 | let mut cuda_cc = cc::Build::new(); 15 | cuda_cc 16 | .cuda(true) 17 | .include("cpp/include") 18 | .include("third_party/cub-1.8.0") 19 | .include("../python/bagua_core/.data/include") 20 | .flag("-std=c++14") 21 | .flag("-cudart=shared"); 22 | 23 | if std::env::var("PROFILE").unwrap() == "release" { 24 | for sm in supported_sms { 25 | cuda_cc 26 | .flag("-gencode") 27 | .flag(format!("arch=compute_{},code=sm_{}", sm, sm).as_str()); 28 | } 29 | } 30 | cuda_cc 31 | .file("kernels/bagua_kernels.cu") 32 | .compile("libbagua_kernels.a"); 33 | 34 | let third_party_path = std::env::current_dir().unwrap(); 35 | let bagua_data_path = std::env::current_dir().unwrap(); 36 | let third_party_path = third_party_path.join("third_party"); 37 | let bagua_data_path = bagua_data_path.join("../python/bagua_core/.data"); 38 | let _al_builder = cmake::Config::new("third_party/Aluminum") 39 | .define("ALUMINUM_ENABLE_NCCL", "YES") 40 | .define("CUB_INCLUDE_PATH", third_party_path.join("cub-1.8.0")) 41 | .define("NCCL_LIBRARY", bagua_data_path.join("lib/libnccl.so")) 42 | .define("NCCL_INCLUDE_PATH", bagua_data_path.join("include")) 43 | .define("BUILD_SHARED_LIBS", "off") 44 | .out_dir(bagua_data_path.as_path().to_str().unwrap()) 45 | .always_configure(true) 46 | .build(); 47 | 48 | let mut cpp_builder = cpp_build::Config::new(); 49 | cpp_builder.include(format!("{}/include", cuda_home)); 50 | 
cpp_builder.include("cpp/include"); 51 | let mpi_include_dirs = cmd_lib::run_fun!(bash -c "mpicxx --showme:incdirs").unwrap(); 52 | let mpi_include_dirs: Vec<&str> = mpi_include_dirs.split(' ').collect(); 53 | for mpi_include_dir in mpi_include_dirs.iter() { 54 | cpp_builder.include(mpi_include_dir); 55 | } 56 | cpp_builder.include(third_party_path.join("cub-1.8.0")); 57 | cpp_builder.include(bagua_data_path.join("include")); 58 | cpp_builder.build("src/lib.rs"); 59 | 60 | let mpi_lib_dirs = cmd_lib::run_fun!(bash -c "mpicxx --showme:libdirs").unwrap(); 61 | let mpi_lib_dirs: Vec<&str> = mpi_lib_dirs.split(' ').collect(); 62 | for mpi_lib_dir in mpi_lib_dirs.iter() { 63 | println!("cargo:rustc-link-search={}", mpi_lib_dir); 64 | } 65 | println!( 66 | "cargo:rustc-link-search=native={}", 67 | format!("{}/lib64", cuda_home) 68 | ); 69 | println!( 70 | "cargo:rustc-link-search={}", 71 | bagua_data_path.join("lib").as_path().to_str().unwrap() 72 | ); 73 | println!( 74 | "cargo:rustc-link-search={}", 75 | bagua_data_path.join("lib64").as_path().to_str().unwrap() 76 | ); 77 | println!("cargo:rustc-link-lib=static=Al"); 78 | println!("cargo:rustc-link-lib=mpi"); 79 | println!("cargo:rustc-link-lib=nccl"); 80 | println!("cargo:rustc-link-lib=cudart"); 81 | println!("cargo:rustc-link-lib=nvrtc"); 82 | println!("cargo:rustc-link-lib=cuda"); 83 | println!("cargo:rerun-if-env-changed=CUDA_HOME"); 84 | println!("cargo:rerun-if-changed=src/"); 85 | println!("cargo:rerun-if-changed=kernels/"); 86 | println!("cargo:rerun-if-changed=build.rs"); 87 | 88 | // bindgen --allowlist-type '.*TensorImpl.*' --enable-cxx-namespaces --ignore-functions --ignore-methods --size_t-is-usize --default-enum-style=rust --opaque-type 'std.*' --opaque-type 'c10::optional.*' wrapper.h -- -x c++ -std=c++14 > src/torch_ffi.rs 89 | shadow_rs::new().unwrap(); 90 | } 91 | -------------------------------------------------------------------------------- /bagua-core-internal/src/kernels/mod.rs: 
--------------------------------------------------------------------------------
//! Raw FFI declarations for the CUDA kernels compiled from
//! `kernels/bagua_kernels.cu` and linked in as the static library
//! `bagua_kernels` (see the crate's build.rs).
//!
//! Conventions shared by every declaration below:
//! - data buffers are type-erased `c_void` pointers; the `_f32` / `_f16`
//!   marker in each symbol name indicates the element type the kernel
//!   operates on;
//! - `N` and `*_num_element` count elements, `*_size_bytes` count bytes;
//! - `stream` is an opaque handle forwarded to the kernel launch
//!   (presumably a `cudaStream_t` — TODO confirm against bagua_kernels.cu).
//!
//! NOTE(review): per-function summaries are inferred from symbol and
//! parameter names; confirm exact semantics against the .cu implementations.
use std::ffi::c_void;

#[link(name = "bagua_kernels", kind = "static")]
extern "C" {
    /// In-place `x[i] /= D_` over `N` f32 elements.
    pub fn divide_inplace_f32_host(x: *mut c_void, D_: f32, N: i32, stream: *const c_void);
    /// f16 variant of the in-place divide (the divisor `D_` stays f32).
    pub fn divide_inplace_f16_host(x: *mut c_void, D_: f32, N: i32, stream: *const c_void);
    /// In-place average of `x` with `y` over `N` f32 elements.
    pub fn average_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place average.
    pub fn average_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    // NOTE: "substract" [sic] — the misspelling is part of the exported C
    // symbol name and must match the kernel side; do not rename it here alone.
    /// In-place subtraction of `y` from `x` over `N` f32 elements.
    pub fn substract_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place subtraction.
    pub fn substract_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// In-place addition of `y` into `x` over `N` f32 elements.
    pub fn add_inplace_f32_host(x: *mut c_void, y: *const c_void, N: i32, stream: *const c_void);
    /// f16 variant of the in-place addition.
    pub fn add_inplace_f16_host(x: *mut c_void, y: *const c_void, N: i32, stream: *const c_void);
    /// In-place add of `y` scaled by `factor` into `x` (f32).
    pub fn addmul_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        factor: f32,
        stream: *const c_void,
    );
    /// f16 variant of the scaled in-place add (`factor` stays f32).
    pub fn addmul_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        factor: f32,
        stream: *const c_void,
    );
    /// In-place mean reduction over `num_chunks` chunks of `chunk_size` f32
    /// elements, targeting chunk index `target_chunk`.
    pub fn reduce_mean_f32_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place mean reduction.
    pub fn reduce_mean_f16_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// In-place sum reduction over `num_chunks` chunks of `chunk_size` f32
    /// elements, targeting chunk index `target_chunk`.
    pub fn reduce_sum_f32_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place sum reduction.
    pub fn reduce_sum_f16_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// temp_buffer size is the same as decompressed tensor
    /// target_chunk = -1 means compressing all chunks
    pub fn compress_f32_to_uint8_host(
        input: *mut c_void,
        input_num_element: i32,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        output_size_bytes: usize,
        temp_buffer: *mut c_void,
        temp_buffer_size_bytes: usize,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// Inverse of `compress_f32_to_uint8_host`: expands uint8 data back to f32.
    pub fn decompress_uint8_to_f32_host(
        input: *mut c_void,
        input_size_bytes: usize,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        stream: *const c_void,
    );
    /// f16 analogue of `compress_f32_to_uint8_host` (same buffer contract:
    /// `temp_buffer` sized like the decompressed tensor, `target_chunk = -1`
    /// compresses all chunks).
    pub fn compress_f16_to_uint8_host(
        input: *mut c_void,
        input_num_element: i32,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        output_size_bytes: usize,
        temp_buffer: *mut c_void,
        temp_buffer_size_bytes: usize,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// Inverse of `compress_f16_to_uint8_host`: expands uint8 data back to f16.
    pub fn decompress_uint8_to_f16_host(
        input: *mut c_void,
        input_size_bytes: usize,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        stream: *const c_void,
    );
    /// Min/max pass over `input_num_element` f32 values; returns a size in
    /// bytes (presumably the scratch size needed — confirm against the .cu).
    pub fn array_min_max_size_f32_host(
        input: *mut c_void,
        input_num_element: i32,
        output: *mut c_void,
        stream: *const c_void,
    ) -> usize;
    /// f16 variant of `array_min_max_size_f32_host`.
    pub fn array_min_max_size_f16_host(
        input: *mut c_void,
        input_num_element: i32,
        output: *mut c_void,
        stream: *const c_void,
    ) -> usize;
    /// Asynchronous model-average update combining `tensor` with the reduced
    /// and local copies, scaled by `nranks`, over `N` elements.
    pub fn async_model_average_host(
        tensor: *mut c_void,
        reduced_tensor_copy: *const c_void,
        tensor_copy: *const c_void,
        nranks: f32,
        N: i32,
        stream: *const c_void,
    );
}
--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_cub.h:
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | using namespace cub; 31 | 32 | template < 33 | int NUM_CHANNELS, 34 | int ACTIVE_CHANNELS, 35 | int NUM_BINS, 36 | typename PixelType> 37 | double run_cub_histogram( 38 | PixelType *d_image, 39 | int width, 40 | int height, 41 | unsigned int *d_hist, 42 | bool is_warmup) 43 | { 44 | enum { 45 | is_float = Equals::VALUE, 46 | }; 47 | 48 | typedef typename If::Type SampleT; // Sample type 49 | typedef typename If::Type LevelT; // Level type (uint32 for uchar) 50 | 51 | // Setup data structures 52 | unsigned int* d_histogram[ACTIVE_CHANNELS]; 53 | int num_levels[ACTIVE_CHANNELS]; ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. 54 | LevelT lower_level[ACTIVE_CHANNELS]; ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 55 | LevelT upper_level[ACTIVE_CHANNELS]; ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 56 | 57 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 58 | { 59 | d_histogram[CHANNEL] = d_hist + (CHANNEL * NUM_BINS); 60 | num_levels[CHANNEL] = NUM_BINS + 1; 61 | lower_level[CHANNEL] = 0; 62 | upper_level[CHANNEL] = (is_float) ? 
1 : 256; 63 | } 64 | 65 | // Allocate temporary storage 66 | size_t temp_storage_bytes = 0; 67 | void *d_temp_storage = NULL; 68 | 69 | SampleT* d_image_samples = (SampleT*) d_image; 70 | 71 | // Get amount of temporary storage needed 72 | DeviceHistogram::MultiHistogramEven( 73 | d_temp_storage, 74 | temp_storage_bytes, 75 | d_image_samples, 76 | d_histogram, 77 | num_levels, 78 | lower_level, 79 | upper_level, 80 | width * height, 81 | (cudaStream_t) 0, 82 | is_warmup); 83 | 84 | cudaMalloc(&d_temp_storage, temp_storage_bytes); 85 | 86 | GpuTimer gpu_timer; 87 | gpu_timer.Start(); 88 | 89 | // Compute histogram 90 | DeviceHistogram::MultiHistogramEven( 91 | d_temp_storage, 92 | temp_storage_bytes, 93 | d_image_samples, 94 | d_histogram, 95 | num_levels, 96 | lower_level, 97 | upper_level, 98 | width * height, 99 | (cudaStream_t) 0, 100 | is_warmup); 101 | 102 | gpu_timer.Stop(); 103 | float elapsed_millis = gpu_timer.ElapsedMillis(); 104 | 105 | cudaFree(d_temp_storage); 106 | 107 | return elapsed_millis; 108 | } 109 | 110 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 
 */

#pragma once

#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule
 * @{
 */


/******************************************************************************
 * Mapping policies
 *****************************************************************************/


/**
 * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 */
enum GridMappingStrategy
{
    /**
     * \brief A "raking" access pattern in which each thread block is
     * assigned a consecutive sequence of input tiles
     *
     * \par Overview
     * The input is evenly partitioned into \p p segments, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each segment is comprised of
     * consecutive tiles, where a tile is a small, constant-sized unit of input
     * to be processed to completion before the thread block terminates or
     * obtains more work.  The kernel invokes \p p thread blocks, each
     * of which iteratively consumes a segment of n/p elements
     * in tile-size increments.
     */
    GRID_MAPPING_RAKE,

    /**
     * \brief A "strip mining" access pattern in which the input tiles assigned
     * to each thread block are separated by a stride equal to the extent of
     * the grid.
     *
     * \par Overview
     * The input is evenly partitioned into \p p sets, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each set is comprised of
     * data tiles separated by stride \p tiles, where a tile is a small,
     * constant-sized unit of input to be processed to completion before the
     * thread block terminates or obtains more work.  The kernel invokes \p p
     * thread blocks, each of which iteratively consumes a segment of
     * n/p elements in tile-size increments.
     */
    GRID_MAPPING_STRIP_MINE,

    /**
     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
     *
     * \par Overview
     * The input is treated as a queue to be dynamically consumed by a grid of
     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
     * unit of input to be processed to completion before the thread block
     * terminates or obtains more work.  The grid size \p p is constant,
     * loosely corresponding to the number of thread blocks that may actively
     * reside on the target device.
     */
    GRID_MAPPING_DYNAMIC,
};


/** @} */       // end group GridModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/host/mutex.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | #else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 
| } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 
12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *
#******************************************************************************/

#-------------------------------------------------------------------------------
#
# Makefile usage
#
# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>]
#
#-------------------------------------------------------------------------------

include ../common.mk

#-------------------------------------------------------------------------------
# Commandline Options
#-------------------------------------------------------------------------------

# [mkl=<0|1>] compile against Intel MKL
ifeq ($(mkl), 1)
	DEFINES += -DCUB_MKL

	ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
		LIBS += mkl_intel_lp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib
		NVCCFLAGS += -Xcompiler /openmp
	else
		LIBS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm
		NVCCFLAGS += -Xcompiler -fopenmp

	endif

endif


#-------------------------------------------------------------------------------
# Compiler and compilation platform
#-------------------------------------------------------------------------------

# Includes
INC += -I$(CUB_DIR) -I$(CUB_DIR)test

# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

#-------------------------------------------------------------------------------
# Dependency Lists
#-------------------------------------------------------------------------------

# NOTE(review): exp_rwildcard recurses via "rwildcard" (not itself) —
# presumably rwildcard is defined in ../common.mk; confirm before relying on
# exp_rwildcard being used anywhere.
exp_rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))

EXP_DEPS = 	$(call rwildcard, ./,*.cuh) \
			$(call rwildcard, ./,*.h)

DEPS = 		$(CUB_DEPS) \
			$(EXP_DEPS) \
			$(CUB_DIR)test/Makefile \
			$(CUB_DIR)test/test_util.h \
			$(CUB_DIR)test/mersenne.h \



#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------

default:


#-------------------------------------------------------------------------------
# make clean
#-------------------------------------------------------------------------------

clean :
	rm -f bin/*$(CPU_ARCH_SUFFIX)*
	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o



#-------------------------------------------------------------------------------
# make histogram_compare
#-------------------------------------------------------------------------------

histogram_compare: bin/histogram_compare_$(BIN_SUFFIX)

bin/histogram_compare_$(BIN_SUFFIX) : histogram_compare.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/histogram_compare_$(BIN_SUFFIX) histogram_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3



#-------------------------------------------------------------------------------
# make spmv_compare
#-------------------------------------------------------------------------------

spmv_compare: bin/spmv_compare_$(BIN_SUFFIX)

bin/spmv_compare_$(BIN_SUFFIX) : spmv_compare.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/spmv_compare_$(BIN_SUFFIX) spmv_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse $(MKL_LIBS) -O3


--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/thread/thread_search.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011,
Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if 
(input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | #ifdef CUB_STDERR 76 | if (error) 77 | { 78 | #if (CUB_PTX_ARCH == 0) 79 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 80 | fflush(stderr); 81 | #elif (CUB_PTX_ARCH >= 200) 82 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 83 | #endif 84 | } 85 | #endif 86 | return error; 87 | } 88 | 89 | 90 | /** 91 | * \brief Debug macro 92 | */ 93 | #ifndef CubDebug 94 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 95 | #endif 96 | 97 | 98 | /** 99 | * \brief Debug macro with exit 100 | */ 101 | #ifndef CubDebugExit 102 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 103 | #endif 104 | 105 | 106 | /** 107 | * \brief Log macro for printf statements. 108 | */ 109 | #if !defined(_CubLog) 110 | #if !(defined(__clang__) && defined(__CUDA__)) 111 | #if (CUB_PTX_ARCH == 0) 112 | #define _CubLog(format, ...) 
printf(format,__VA_ARGS__); 113 | #elif (CUB_PTX_ARCH >= 200) 114 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 115 | #endif 116 | #else 117 | // XXX shameless hack for clang around variadic printf... 118 | // Compiles w/o supplying -std=c++11 but shows warning, 119 | // so we silence them :) 120 | #pragma clang diagnostic ignored "-Wc++11-extensions" 121 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 122 | template <class... Args> 123 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 124 | { 125 | #ifdef __CUDA_ARCH__ 126 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 127 | #else 128 | printf(format, args...); 129 | #endif 130 | } 131 | #ifndef __CUDA_ARCH__ 132 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 133 | #else 134 | #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 135 | #endif 136 | #endif 137 | #endif 138 | 139 | 140 | 141 | 142 | /** @} */ // end group UtilMgmt 143 | 144 | } // CUB namespace 145 | CUB_NS_POSTFIX // Optional outer namespace(s) 146 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | # * 27 | #******************************************************************************/ 28 | 29 | #------------------------------------------------------------------------------- 30 | # 31 | # Makefile usage 32 | # 33 | # make [sm=] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] 34 | # 35 | #------------------------------------------------------------------------------- 36 | 37 | include ../../common.mk 38 | 39 | 40 | #------------------------------------------------------------------------------- 41 | # Includes 42 | #------------------------------------------------------------------------------- 43 | 44 | INC += -I$(CUB_DIR) -I$(CUB_DIR)test 45 | 46 | 47 | 48 | #------------------------------------------------------------------------------- 49 | # Dependency Lists 50 | #------------------------------------------------------------------------------- 51 | 52 | rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) 53 | 54 | DEPS = $(CUB_DEPS) \ 55 | $(CUB_DIR)test/Makefile \ 56 | $(CUB_DIR)test/test_util.h \ 57 | $(CUB_DIR)test/mersenne.h \ 58 | 59 | ALL = example_block_radix_sort \ 60 | example_block_reduce \ 61 | example_block_scan 62 | 63 | 64 | 65 | #------------------------------------------------------------------------------- 66 | # make default 67 | #------------------------------------------------------------------------------- 68 | 69 | default: 70 | 71 | 72 | #------------------------------------------------------------------------------- 73 | # make clean 74 | #------------------------------------------------------------------------------- 75 | 76 | clean : 77 | rm -f bin/*$(CPU_ARCH_SUFFIX)* 78 | rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o 79 | 80 | 81 | #------------------------------------------------------------------------------- 82 | # make all 83 | #------------------------------------------------------------------------------- 84 | 85 | 
all : $(ALL) 86 | 87 | #------------------------------------------------------------------------------- 88 | # make run 89 | #------------------------------------------------------------------------------- 90 | 91 | run : 92 | for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done 93 | 94 | 95 | 96 | 97 | #------------------------------------------------------------------------------- 98 | # make example_block_reduce 99 | #------------------------------------------------------------------------------- 100 | 101 | example_block_reduce: bin/example_block_reduce_$(BIN_SUFFIX) 102 | 103 | bin/example_block_reduce_$(BIN_SUFFIX) : example_block_reduce.cu $(DEPS) 104 | mkdir -p bin 105 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_reduce_$(BIN_SUFFIX) example_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 106 | 107 | 108 | #------------------------------------------------------------------------------- 109 | # make example_block_scan 110 | #------------------------------------------------------------------------------- 111 | 112 | example_block_scan: bin/example_block_scan_$(BIN_SUFFIX) 113 | 114 | bin/example_block_scan_$(BIN_SUFFIX) : example_block_scan.cu $(DEPS) 115 | mkdir -p bin 116 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_scan_$(BIN_SUFFIX) example_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 117 | 118 | 119 | #------------------------------------------------------------------------------- 120 | # make example_block_radix_sort 121 | #------------------------------------------------------------------------------- 122 | 123 | example_block_radix_sort: bin/example_block_radix_sort_$(BIN_SUFFIX) 124 | 125 | bin/example_block_radix_sort_$(BIN_SUFFIX) : example_block_radix_sort.cu $(DEPS) 126 | mkdir -p bin 127 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_radix_sort_$(BIN_SUFFIX) example_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 128 | 129 | 
-------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/test_grid_barrier.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Test evaluation for software global barrier throughput 31 | ******************************************************************************/ 32 | 33 | // Ensure printing of CUDA runtime errors to console 34 | #define CUB_STDERR 35 | 36 | #include 37 | 38 | #include 39 | 40 | #include "test_util.h" 41 | 42 | using namespace cub; 43 | 44 | 45 | //--------------------------------------------------------------------- 46 | // Test kernels 47 | //--------------------------------------------------------------------- 48 | 49 | /** 50 | * Kernel that iterates through the specified number of software global barriers 51 | */ 52 | __global__ void Kernel( 53 | GridBarrier global_barrier, 54 | int iterations) 55 | { 56 | for (int i = 0; i < iterations; i++) 57 | { 58 | global_barrier.Sync(); 59 | } 60 | } 61 | 62 | 63 | //--------------------------------------------------------------------- 64 | // Main 65 | //--------------------------------------------------------------------- 66 | 67 | /** 68 | * Main 69 | */ 70 | int main(int argc, char** argv) 71 | { 72 | cudaError_t retval = cudaSuccess; 73 | 74 | // Defaults 75 | int iterations = 10000; 76 | int block_size = 128; 77 | int grid_size = -1; 78 | 79 | // Initialize command line 80 | CommandLineArgs args(argc, 
argv); 81 | 82 | // Get args 83 | args.GetCmdLineArgument("i", iterations); 84 | args.GetCmdLineArgument("grid-size", grid_size); 85 | args.GetCmdLineArgument("block-size", block_size); 86 | 87 | // Print usage 88 | if (args.CheckCmdLineFlag("help")) 89 | { 90 | printf("%s " 91 | "[--device=]" 92 | "[--i=]" 93 | "[--grid-size]" 94 | "[--block-size]" 95 | "\n", argv[0]); 96 | exit(0); 97 | } 98 | 99 | // Initialize device 100 | CubDebugExit(args.DeviceInit()); 101 | 102 | // Get device ordinal 103 | int device_ordinal; 104 | CubDebugExit(cudaGetDevice(&device_ordinal)); 105 | 106 | // Get device SM version 107 | int sm_version; 108 | CubDebugExit(SmVersion(sm_version, device_ordinal)); 109 | 110 | // Get SM properties 111 | int sm_count, max_block_threads, max_sm_occupancy; 112 | CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); 113 | CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal)); 114 | CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel, 32)); 115 | 116 | // Compute grid size and occupancy 117 | int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy); 118 | 119 | if (grid_size == -1) 120 | { 121 | grid_size = occupancy * sm_count; 122 | } 123 | else 124 | { 125 | occupancy = grid_size / sm_count; 126 | } 127 | 128 | printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n", 129 | grid_size, block_size, occupancy); 130 | fflush(stdout); 131 | 132 | // Init global barrier 133 | GridBarrierLifetime global_barrier; 134 | global_barrier.Setup(grid_size); 135 | 136 | // Time kernel 137 | GpuTimer gpu_timer; 138 | gpu_timer.Start(); 139 | Kernel<<>>(global_barrier, iterations); 140 | gpu_timer.Stop(); 141 | 142 | retval = CubDebug(cudaThreadSynchronize()); 143 | 144 | // Output timing results 145 | float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations); 146 | printf("%d iterations, %f total elapsed 
millis, %f avg elapsed millis\n", 147 | iterations, 148 | gpu_timer.ElapsedMillis(), 149 | avg_elapsed); 150 | 151 | return retval; 152 | } 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/mersenne.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | #include <stdio.h> 45 | 46 | namespace mersenne { 47 | 48 | /* Period parameters */ 49 | const unsigned int N = 624; 50 | const unsigned int M = 397; 51 | const unsigned int MATRIX_A = 0x9908b0df; /* constant vector a */ 52 | const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */ 53 | const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */ 54 | 55 | static unsigned int mt[N]; /* the array for the state vector */ 56 | static int mti = N + 1; /* mti==N+1 means mt[N] is not initialized */ 57 | 58 | /* initializes mt[N] with a seed */ 59 | void init_genrand(unsigned int s) 60 | { 61 | mt[0] = s & 0xffffffff; 62 | for (mti = 1; mti < N; mti++) 63 | { 64 | mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti); 65 | 66 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 67 | /* In the previous versions, MSBs of the seed affect */ 68 | /* only MSBs of the array mt[].
*/ 69 | /* 2002/01/09 modified by Makoto Matsumoto */ 70 | 71 | mt[mti] &= 0xffffffff; 72 | /* for >32 bit machines */ 73 | } 74 | } 75 | 76 | /* initialize by an array with array-length */ 77 | /* init_key is the array for initializing keys */ 78 | /* key_length is its length */ 79 | /* slight change for C++, 2004/2/26 */ 80 | void init_by_array(unsigned int init_key[], int key_length) 81 | { 82 | int i, j, k; 83 | init_genrand(19650218); 84 | i = 1; 85 | j = 0; 86 | k = (N > key_length ? N : key_length); 87 | for (; k; k--) 88 | { 89 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525)) 90 | + init_key[j] + j; /* non linear */ 91 | mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 92 | i++; 93 | j++; 94 | if (i >= N) 95 | { 96 | mt[0] = mt[N - 1]; 97 | i = 1; 98 | } 99 | if (j >= key_length) j = 0; 100 | } 101 | for (k = N - 1; k; k--) 102 | { 103 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */ 104 | mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 105 | i++; 106 | if (i >= N) 107 | { 108 | mt[0] = mt[N - 1]; 109 | i = 1; 110 | } 111 | } 112 | 113 | mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */ 114 | } 115 | 116 | /* generates a random number on [0,0xffffffff]-interval */ 117 | unsigned int genrand_int32(void) 118 | { 119 | unsigned int y; 120 | static unsigned int mag01[2] = { 0x0, MATRIX_A }; 121 | 122 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 123 | 124 | if (mti >= N) 125 | { /* generate N words at one time */ 126 | int kk; 127 | 128 | if (mti == N + 1) /* if init_genrand() has not been called, */ 129 | init_genrand(5489); /* a default initial seed is used */ 130 | 131 | for (kk = 0; kk < N - M; kk++) 132 | { 133 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 134 | mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1]; 135 | } 136 | for (; kk < N - 1; kk++) 137 | { 138 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 139 | mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y
& 0x1]; 140 | } 141 | y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); 142 | mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1]; 143 | 144 | mti = 0; 145 | } 146 | 147 | y = mt[mti++]; 148 | 149 | /* Tempering */ 150 | y ^= (y >> 11); 151 | y ^= (y << 7) & 0x9d2c5680; 152 | y ^= (y << 15) & 0xefc60000; 153 | y ^= (y >> 18); 154 | 155 | return y; 156 | } 157 | 158 | 159 | 160 | } // namespace mersenne 161 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/README.md: -------------------------------------------------------------------------------- 1 |
2 |

About CUB

3 | 4 | Current release: v1.8.0 (02/16/2018) 5 | 6 | We recommend the [CUB Project Website](http://nvlabs.github.com/cub) for further information and examples. 7 | 8 | CUB provides state-of-the-art, reusable software components for every layer 9 | of the CUDA programming model: 10 | - [Device-wide primitives] (https://nvlabs.github.com/cub/group___device_module.html) 11 | - Sort, prefix scan, reduction, histogram, etc. 12 | - Compatible with CUDA dynamic parallelism 13 | - [Block-wide "collective" primitives] (https://nvlabs.github.com/cub/group___block_module.html) 14 | - I/O, sort, prefix scan, reduction, histogram, etc. 15 | - Compatible with arbitrary thread block sizes and types 16 | - [Warp-wide "collective" primitives] (https://nvlabs.github.com/cub/group___warp_module.html) 17 | - Warp-wide prefix scan, reduction, etc. 18 | - Safe and architecture-specific 19 | - [Thread and resource utilities](https://nvlabs.github.com/cub/group___thread_module.html) 20 | - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. 21 | 22 | ![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png) 23 | 24 |

25 |

A Simple Example

26 | 27 | ```C++ 28 | #include 29 | 30 | // Block-sorting CUDA kernel 31 | __global__ void BlockSortKernel(int *d_in, int *d_out) 32 | { 33 | using namespace cub; 34 | 35 | // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 36 | // owning 16 integer items each 37 | typedef BlockRadixSort BlockRadixSort; 38 | typedef BlockLoad BlockLoad; 39 | typedef BlockStore BlockStore; 40 | 41 | // Allocate shared memory 42 | __shared__ union { 43 | typename BlockRadixSort::TempStorage sort; 44 | typename BlockLoad::TempStorage load; 45 | typename BlockStore::TempStorage store; 46 | } temp_storage; 47 | 48 | int block_offset = blockIdx.x * (128 * 16); // OffsetT for this block's ment 49 | 50 | // Obtain a segment of 2048 consecutive keys that are blocked across threads 51 | int thread_keys[16]; 52 | BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys); 53 | __syncthreads(); 54 | 55 | // Collectively sort the keys 56 | BlockRadixSort(temp_storage.sort).Sort(thread_keys); 57 | __syncthreads(); 58 | 59 | // Store the sorted segment 60 | BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys); 61 | } 62 | ``` 63 | 64 | Each thread block uses cub::BlockRadixSort to collectively sort 65 | its own input segment. The class is specialized by the 66 | data type being sorted, by the number of threads per block, by the number of 67 | keys per thread, and implicitly by the targeted compilation architecture. 68 | 69 | The cub::BlockLoad and cub::BlockStore classes are similarly specialized. 70 | Furthermore, to provide coalesced accesses to device memory, these primitives are 71 | configured to access memory using a striped access pattern (where consecutive threads 72 | simultaneously access consecutive items) and then transpose the keys into 73 | a [blocked arrangement](index.html#sec4sec3) of elements across threads. 74 | 75 | Once specialized, these classes expose opaque \p TempStorage member types. 
76 | The thread block uses these storage types to statically allocate the union of 77 | shared memory needed by the thread block. (Alternatively these storage types 78 | could be aliased to global memory allocations). 79 | 80 |

81 |

Stable Releases

82 | 83 | CUB releases are labeled using version identifiers having three fields: 84 | *epoch.feature.update*. The *epoch* field corresponds to support for 85 | a major change in the CUDA programming model. The *feature* field 86 | corresponds to a stable set of features, functionality, and interface. The 87 | *update* field corresponds to a bug-fix or performance update for that 88 | feature set. At the moment, we do not publicly provide non-stable releases 89 | such as development snapshots, beta releases or rolling releases. (Feel free 90 | to contact us if you would like such things.) See the 91 | [CUB Project Website](http://nvlabs.github.com/cub) for more information. 92 | 93 |

94 |

Contributors

95 | 96 | CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com). The primary contributor is [Duane Merrill](http://github.com/dumerrill). 97 | 98 |

99 |

Open Source License

100 | 101 | CUB is available under the "New BSD" open-source license: 102 | 103 | ``` 104 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 105 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 106 | 107 | Redistribution and use in source and binary forms, with or without 108 | modification, are permitted provided that the following conditions are met: 109 | * Redistributions of source code must retain the above copyright 110 | notice, this list of conditions and the following disclaimer. 111 | * Redistributions in binary form must reproduce the above copyright 112 | notice, this list of conditions and the following disclaimer in the 113 | documentation and/or other materials provided with the distribution. 114 | * Neither the name of the NVIDIA CORPORATION nor the 115 | names of its contributors may be used to endorse or promote products 116 | derived from this software without specific prior written permission. 117 | 118 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 119 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 120 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 121 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 122 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 123 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 124 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 125 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 126 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 127 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
128 | ``` 129 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_full_precision_asynchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::decentralized_full_precision_synchronous::PeerSelectionMode; 2 | use crate::comm_ops::CommOpTrait; 3 | use crate::communicators::BaguaCommunicator; 4 | use crate::datatypes::{BaguaBucket, BaguaReductionOp, BaguaTensorRaw, RawBaguaTensor}; 5 | use crate::events::BaguaEventChannel; 6 | use crate::resource_pool::{CUDA_DEVICE_MEMORY_POOL, CUDA_EVENT_POOL}; 7 | use crate::{BaguaCommOpChannels, BaguaCoreError}; 8 | use std::sync::Arc; 9 | use std::time::Duration; 10 | 11 | #[derive(Debug)] 12 | pub struct DecentralizedFullPrecisionAsynchronous { 13 | pub communicator: BaguaCommunicator, 14 | pub peer_selection_mode: PeerSelectionMode, 15 | pub torch_stream: u64, 16 | } 17 | 18 | impl CommOpTrait for DecentralizedFullPrecisionAsynchronous { 19 | fn execute_background_communication( 20 | &self, 21 | bucket: Arc, 22 | comm_op_channels: &BaguaCommOpChannels, 23 | ) { 24 | let bucket_guard = bucket.inner.lock(); 25 | let comm_stream = self.communicator.stream_ptr(); 26 | 27 | let mut communication_tensor = match &self.communicator { 28 | BaguaCommunicator::SingleCommunicator(_) => { 29 | bucket_guard.get_communication_tensor(comm_stream, false, false) 30 | } 31 | BaguaCommunicator::HierarchicalCommunicator(x) => { 32 | panic!("asynchronous op only accepts non-hierarchical communicator"); 33 | } 34 | }; 35 | 36 | let peer_mode = &self.peer_selection_mode; 37 | 38 | let torch_stream = self.torch_stream; 39 | 40 | self.communicator.execute_communication( 41 | &mut communication_tensor, 42 | false, 43 | false, 44 | false, 45 | &mut |c, t| { 46 | let start_time = std::time::Instant::now(); 47 | tracing::debug!("async model average start"); 48 | 49 | let temp_buf = 
CUDA_DEVICE_MEMORY_POOL[t.raw.device_id()] 50 | .try_pull(t.raw.num_elements_allocated() * t.raw.dtype().bytes()) 51 | .expect("cannot allocate cuda memory"); 52 | 53 | let mut temp_tensor = BaguaTensorRaw { 54 | ptr: temp_buf.ptr, 55 | num_elem_allocated: t.raw.num_elements_allocated(), 56 | dtype: t.raw.dtype().clone(), 57 | num_elem: t.raw.num_elements(), 58 | device_id: t.raw.device_id(), 59 | pool_allocations: vec![Arc::new(temp_buf)], 60 | }; 61 | 62 | let reduced_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id()] 63 | .try_pull(t.raw.num_elements_allocated() * t.raw.dtype().bytes()) 64 | .expect("cannot allocate cuda memory"); 65 | 66 | let mut reduced_tensor = BaguaTensorRaw { 67 | ptr: reduced_buf.ptr, 68 | num_elem_allocated: t.raw.num_elements_allocated(), 69 | dtype: t.raw.dtype().clone(), 70 | num_elem: t.raw.num_elements(), 71 | device_id: t.raw.device_id(), 72 | pool_allocations: vec![Arc::new(reduced_buf)], 73 | }; 74 | 75 | // use default stream to copy weights 76 | temp_tensor.clone_from(&t.raw, torch_stream as u64); 77 | 78 | let src_ready_event = CUDA_EVENT_POOL.take().event; 79 | 80 | unsafe { 81 | cpp::cpp!([ 82 | src_ready_event as "cudaEvent_t", 83 | comm_stream as "cudaStream_t", 84 | torch_stream as "cudaStream_t"] 85 | { 86 | CUDACHECK(cudaEventRecord(src_ready_event, torch_stream)); 87 | CUDACHECK(cudaStreamWaitEvent(comm_stream, src_ready_event , 0)); 88 | }); 89 | } 90 | 91 | if c.check_abort() { 92 | return; 93 | } 94 | 95 | match peer_mode { 96 | PeerSelectionMode::All => { 97 | c.allreduce(&temp_tensor, &mut reduced_tensor, BaguaReductionOp::SUM); 98 | } 99 | PeerSelectionMode::Ring => { 100 | unimplemented!() 101 | } 102 | PeerSelectionMode::ShiftOne => { 103 | unimplemented!() 104 | } 105 | }; 106 | 107 | let comm_ready_event = CUDA_EVENT_POOL.take().event; 108 | 109 | unsafe { 110 | cpp::cpp!([ 111 | comm_ready_event as "cudaEvent_t", 112 | comm_stream as "cudaStream_t"] 113 | { 114 | CUDACHECK(cudaEventRecord(comm_ready_event, 
comm_stream)); 115 | CUDACHECK(cudaEventSynchronize(comm_ready_event)); 116 | }); 117 | } 118 | 119 | if c.check_abort() { 120 | return; 121 | } 122 | 123 | // do we need to wait default stream? 124 | unsafe { 125 | cpp::cpp!([ 126 | src_ready_event as "cudaEvent_t", 127 | comm_stream as "cudaStream_t", 128 | torch_stream as "cudaStream_t"] 129 | { 130 | CUDACHECK(cudaEventRecord(src_ready_event, torch_stream)); 131 | CUDACHECK(cudaStreamWaitEvent(comm_stream, src_ready_event , 0)); 132 | }); 133 | } 134 | 135 | t.raw.async_model_average( 136 | &reduced_tensor, 137 | &temp_tensor, 138 | c.nranks as f32, 139 | comm_stream, 140 | ); 141 | 142 | unsafe { 143 | cpp::cpp!([comm_stream as "cudaStream_t"] 144 | { 145 | CUDACHECK(cudaStreamSynchronize(comm_stream)); 146 | }); 147 | } 148 | 149 | tracing::debug!( 150 | "async model average update cost: {:?}", 151 | start_time.elapsed() 152 | ); 153 | }, 154 | ); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/example_device_reduce.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple example of DeviceReduce::Sum(). 31 | * 32 | * Sums an array of int keys. 33 | * 34 | * To compile using the command line: 35 | * nvcc -arch=sm_XX example_device_reduce.cu -I../.. 
-lcudart -O3 36 | * 37 | ******************************************************************************/ 38 | 39 | // Ensure printing of CUDA runtime errors to console 40 | #define CUB_STDERR 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | 47 | #include "../../test/test_util.h" 48 | 49 | using namespace cub; 50 | 51 | 52 | //--------------------------------------------------------------------- 53 | // Globals, constants and typedefs 54 | //--------------------------------------------------------------------- 55 | 56 | bool g_verbose = false; // Whether to display input/output to console 57 | CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory 58 | 59 | 60 | //--------------------------------------------------------------------- 61 | // Test generation 62 | //--------------------------------------------------------------------- 63 | 64 | /** 65 | * Initialize problem 66 | */ 67 | void Initialize( 68 | int *h_in, 69 | int num_items) 70 | { 71 | for (int i = 0; i < num_items; ++i) 72 | h_in[i] = i; 73 | 74 | if (g_verbose) 75 | { 76 | printf("Input:\n"); 77 | DisplayResults(h_in, num_items); 78 | printf("\n\n"); 79 | } 80 | } 81 | 82 | 83 | /** 84 | * Compute solution 85 | */ 86 | void Solve( 87 | int *h_in, 88 | int &h_reference, 89 | int num_items) 90 | { 91 | for (int i = 0; i < num_items; ++i) 92 | { 93 | if (i == 0) 94 | h_reference = h_in[0]; 95 | else 96 | h_reference += h_in[i]; 97 | } 98 | } 99 | 100 | 101 | //--------------------------------------------------------------------- 102 | // Main 103 | //--------------------------------------------------------------------- 104 | 105 | /** 106 | * Main 107 | */ 108 | int main(int argc, char** argv) 109 | { 110 | int num_items = 150; 111 | 112 | // Initialize command line 113 | CommandLineArgs args(argc, argv); 114 | g_verbose = args.CheckCmdLineFlag("v"); 115 | args.GetCmdLineArgument("n", num_items); 116 | 117 | // Print usage 118 | if (args.CheckCmdLineFlag("help")) 119 | { 
120 | printf("%s " 121 | "[--n= " 122 | "[--device=] " 123 | "[--v] " 124 | "\n", argv[0]); 125 | exit(0); 126 | } 127 | 128 | // Initialize device 129 | CubDebugExit(args.DeviceInit()); 130 | 131 | printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n", 132 | num_items, (int) sizeof(int)); 133 | fflush(stdout); 134 | 135 | // Allocate host arrays 136 | int* h_in = new int[num_items]; 137 | int h_reference; 138 | 139 | // Initialize problem and solution 140 | Initialize(h_in, num_items); 141 | Solve(h_in, h_reference, num_items); 142 | 143 | // Allocate problem device arrays 144 | int *d_in = NULL; 145 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); 146 | 147 | // Initialize device input 148 | CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); 149 | 150 | // Allocate device output array 151 | int *d_out = NULL; 152 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); 153 | 154 | // Request and allocate temporary storage 155 | void *d_temp_storage = NULL; 156 | size_t temp_storage_bytes = 0; 157 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 158 | CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); 159 | 160 | // Run 161 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 162 | 163 | // Check for correctness (and display results, if specified) 164 | int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); 165 | printf("\t%s", compare ? 
"FAIL" : "PASS"); 166 | AssertEquals(0, compare); 167 | 168 | // Cleanup 169 | if (h_in) delete[] h_in; 170 | if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); 171 | if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); 172 | if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); 173 | 174 | printf("\n\n"); 175 | 176 | return 0; 177 | } 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 
76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 
116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = 
(SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load 96 | USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | struct __align__(16) _TempStorage 110 | { 111 | T buff[BlockRakingLayout::GRID_ELEMENTS]; 112 | }; 113 | 114 | /// Alias wrapper allowing storage to be unioned 115 | struct TempStorage : Uninitialized<_TempStorage> {}; 116 | 117 | 118 | /** 119 | * \brief Returns the location for the calling thread to place data into the grid 120 | */ 121 | static __device__ __forceinline__ T* PlacementPtr( 122 | TempStorage &temp_storage, 123 | unsigned int linear_tid) 124 | { 125 | // Offset for partial 126 | unsigned int offset = linear_tid; 127 | 128 | // Add in one padding element for every segment 129 | if (USE_SEGMENT_PADDING > 0) 130 | { 131 | offset += offset / SEGMENT_LENGTH; 132 | } 133 | 134 | // Incorporating a block of padding partials every shared memory segment 135 | return temp_storage.Alias().buff + offset; 136 | } 137 | 138 | 139 | /** 140 | * \brief Returns the location for the calling thread to begin sequential 
raking 141 | */ 142 | static __device__ __forceinline__ T* RakingPtr( 143 | TempStorage &temp_storage, 144 | unsigned int linear_tid) 145 | { 146 | return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); 147 | } 148 | }; 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | CTA_SYNC(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | CTA_SYNC(); 94 | 95 | // Wait for everyone else to report 
in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | CTA_SYNC(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | CTA_SYNC(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/example_device_scan.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, 
Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple example of DeviceScan::ExclusiveSum(). 31 | * 32 | * Computes an exclusive sum of int keys. 
33 | * 34 | * To compile using the command line: 35 | * nvcc -arch=sm_XX example_device_scan.cu -I../.. -lcudart -O3 36 | * 37 | ******************************************************************************/ 38 | 39 | // Ensure printing of CUDA runtime errors to console 40 | #define CUB_STDERR 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | 47 | #include "../../test/test_util.h" 48 | 49 | using namespace cub; 50 | 51 | 52 | //--------------------------------------------------------------------- 53 | // Globals, constants and typedefs 54 | //--------------------------------------------------------------------- 55 | 56 | bool g_verbose = false; // Whether to display input/output to console 57 | CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory 58 | 59 | 60 | //--------------------------------------------------------------------- 61 | // Test generation 62 | //--------------------------------------------------------------------- 63 | 64 | 65 | /** 66 | * Initialize problem 67 | */ 68 | void Initialize( 69 | int *h_in, 70 | int num_items) 71 | { 72 | for (int i = 0; i < num_items; ++i) 73 | h_in[i] = i; 74 | 75 | if (g_verbose) 76 | { 77 | printf("Input:\n"); 78 | DisplayResults(h_in, num_items); 79 | printf("\n\n"); 80 | } 81 | } 82 | 83 | /** 84 | * Solve exclusive-scan problem 85 | */ 86 | int Solve( 87 | int *h_in, 88 | int *h_reference, 89 | int num_items) 90 | { 91 | int inclusive = 0; 92 | int aggregate = 0; 93 | 94 | for (int i = 0; i < num_items; ++i) 95 | { 96 | h_reference[i] = inclusive; 97 | inclusive += h_in[i]; 98 | aggregate += h_in[i]; 99 | } 100 | 101 | return aggregate; 102 | } 103 | 104 | 105 | 106 | //--------------------------------------------------------------------- 107 | // Main 108 | //--------------------------------------------------------------------- 109 | 110 | /** 111 | * Main 112 | */ 113 | int main(int argc, char** argv) 114 | { 115 | int num_items = 150; 116 | 117 | // Initialize command line 
118 | CommandLineArgs args(argc, argv); 119 | g_verbose = args.CheckCmdLineFlag("v"); 120 | args.GetCmdLineArgument("n", num_items); 121 | 122 | // Print usage 123 | if (args.CheckCmdLineFlag("help")) 124 | { 125 | printf("%s " 126 | "[--n= " 127 | "[--device=] " 128 | "[--v] " 129 | "\n", argv[0]); 130 | exit(0); 131 | } 132 | 133 | // Initialize device 134 | CubDebugExit(args.DeviceInit()); 135 | 136 | printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n", 137 | num_items, (int) sizeof(int)); 138 | fflush(stdout); 139 | 140 | // Allocate host arrays 141 | int* h_in = new int[num_items]; 142 | int* h_reference = new int[num_items]; 143 | 144 | // Initialize problem and solution 145 | Initialize(h_in, num_items); 146 | Solve(h_in, h_reference, num_items); 147 | 148 | // Allocate problem device arrays 149 | int *d_in = NULL; 150 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); 151 | 152 | // Initialize device input 153 | CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); 154 | 155 | // Allocate device output array 156 | int *d_out = NULL; 157 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); 158 | 159 | // Allocate temporary storage 160 | void *d_temp_storage = NULL; 161 | size_t temp_storage_bytes = 0; 162 | CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 163 | CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); 164 | 165 | // Run 166 | CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 167 | 168 | // Check for correctness (and display results, if specified) 169 | int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); 170 | printf("\t%s", compare ? 
"FAIL" : "PASS"); 171 | AssertEquals(0, compare); 172 | 173 | // Cleanup 174 | if (h_in) delete[] h_in; 175 | if (h_reference) delete[] h_reference; 176 | if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); 177 | if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); 178 | if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); 179 | 180 | printf("\n\n"); 181 | 182 | return 0; 183 | } 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_low_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::decentralized_full_precision_synchronous::PeerSelectionMode; 2 | use crate::comm_ops::CommOpTrait; 3 | use crate::communicators::{BaguaCommunicator, BaguaHierarchicalCommunicator, NCCLGroupGuard}; 4 | use crate::datatypes::{ 5 | BaguaBucket, BaguaTensor, BaguaTensorRaw, RawBaguaTensor, TensorCompressionMethod, 6 | }; 7 | use crate::events::BaguaEventChannel; 8 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 9 | use crate::{BaguaCommOpChannels, BaguaScheduledCommOp}; 10 | use parking_lot::Mutex; 11 | use std::sync::Arc; 12 | 13 | #[derive(Debug)] 14 | pub struct DecentralizedLowPrecisionSynchronous { 15 | pub communicator: BaguaCommunicator, 16 | pub peer_selection_mode: PeerSelectionMode, 17 | pub compression_method: TensorCompressionMethod, 18 | pub weight: BaguaTensor, 19 | pub left_peer_weight: BaguaTensor, 20 | pub right_peer_weight: BaguaTensor, 21 | } 22 | 23 | impl CommOpTrait for DecentralizedLowPrecisionSynchronous { 24 | fn execute_background_communication( 25 | &self, 26 | bucket: Arc, 27 | _comm_op_channels: &BaguaCommOpChannels, 28 | ) { 29 | let bucket_guard = bucket.inner.lock(); 30 | let stream_ptr = self.communicator.stream_ptr(); 31 | 32 | let mut communication_tensor = 33 | bucket_guard.get_communication_tensor(stream_ptr, false, false); 34 | 35 | let 
peer_mode = &self.peer_selection_mode; 36 | 37 | self.communicator.execute_communication( 38 | &mut communication_tensor, 39 | true, 40 | true, 41 | true, 42 | &mut |c, t| { 43 | tracing::debug!("start compress diff"); 44 | 45 | t.raw.addmul_inplace( 46 | self.left_peer_weight.inner.read().raw.as_ref(), 47 | 1.0 / 3.0, 48 | c.stream_ptr, 49 | ); 50 | t.raw.addmul_inplace( 51 | self.right_peer_weight.inner.read().raw.as_ref(), 52 | 1.0 / 3.0, 53 | c.stream_ptr, 54 | ); 55 | 56 | { 57 | let weight_guard = self.weight.inner.read(); 58 | t.raw 59 | .addmul_inplace(weight_guard.raw.as_ref(), -5.0 / 3.0, c.stream_ptr); 60 | } 61 | let compressed_tensor = t 62 | .raw 63 | .compress(&self.compression_method, 1, c.stream_ptr, -1) 64 | .expect("cannot compress tensor"); 65 | 66 | tracing::debug!("start communicate with peers"); 67 | let lrecv_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id] 68 | .try_pull( 69 | compressed_tensor.num_elements_allocated() 70 | * compressed_tensor.dtype().bytes(), 71 | ) 72 | .expect("cannot allocate cuda memory"); 73 | let mut lrecv_tensor = BaguaTensorRaw { 74 | ptr: lrecv_buf.ptr, 75 | num_elem_allocated: compressed_tensor.num_elements_allocated(), 76 | dtype: compressed_tensor.dtype().clone(), 77 | num_elem: compressed_tensor.num_elements(), 78 | device_id: compressed_tensor.device_id(), 79 | pool_allocations: vec![Arc::new(lrecv_buf)], 80 | }; 81 | 82 | let rrecv_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id] 83 | .try_pull( 84 | compressed_tensor.num_elements_allocated() 85 | * compressed_tensor.dtype().bytes(), 86 | ) 87 | .expect("cannot allocate cuda memory"); 88 | let mut rrecv_tensor = BaguaTensorRaw { 89 | ptr: rrecv_buf.ptr, 90 | num_elem_allocated: compressed_tensor.num_elements_allocated(), 91 | dtype: compressed_tensor.dtype().clone(), 92 | num_elem: compressed_tensor.num_elements(), 93 | device_id: compressed_tensor.device_id(), 94 | pool_allocations: vec![Arc::new(rrecv_buf)], 95 | }; 96 | 97 | match peer_mode { 98 | 
PeerSelectionMode::Ring => { 99 | let left_peer_rank = ((c.rank + c.nranks - 1) % c.nranks) as i32; 100 | let right_peer_rank = ((c.rank + 1) % c.nranks) as i32; 101 | 102 | { 103 | let _guard = NCCLGroupGuard::new(); 104 | 105 | tracing::debug!( 106 | "rank: {} left peer: {} right peer: {}", 107 | c.rank, 108 | left_peer_rank, 109 | right_peer_rank 110 | ); 111 | c.send(compressed_tensor.as_ref(), left_peer_rank); 112 | c.send(compressed_tensor.as_ref(), right_peer_rank); 113 | c.recv(&mut lrecv_tensor, left_peer_rank); 114 | c.recv(&mut rrecv_tensor, right_peer_rank); 115 | } 116 | } 117 | PeerSelectionMode::All => { 118 | unimplemented!() 119 | } 120 | PeerSelectionMode::ShiftOne => { 121 | unimplemented!() 122 | } 123 | }; 124 | 125 | tracing::debug!("start decompress diff and update weights"); 126 | t.raw 127 | .decompress_from(&self.compression_method, 1, &lrecv_tensor, c.stream_ptr); 128 | { 129 | let mut weight_guard = self.left_peer_weight.inner.write(); 130 | weight_guard.raw.add_inplace(&t.raw, c.stream_ptr); 131 | } 132 | 133 | t.raw 134 | .decompress_from(&self.compression_method, 1, &rrecv_tensor, c.stream_ptr); 135 | { 136 | let mut weight_guard = self.right_peer_weight.inner.write(); 137 | weight_guard.raw.add_inplace(&t.raw, c.stream_ptr); 138 | } 139 | 140 | t.raw.decompress_from( 141 | &self.compression_method, 142 | 1, 143 | compressed_tensor.as_ref(), 144 | c.stream_ptr, 145 | ); 146 | 147 | { 148 | let mut weight_guard = self.weight.inner.write(); 149 | t.raw.add_inplace(weight_guard.raw.as_ref(), c.stream_ptr); 150 | weight_guard.raw.clone_from(&t.raw, c.stream_ptr); 151 | } 152 | }, 153 | ); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_gmem_atomics.h: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

// Two-pass multi-channel histogram using global-memory atomics:
// pass 1 accumulates per-block partial histograms, pass 2 reduces them.

#include <cub/cub.cuh>

namespace histogram_gmem_atomics
{
    // Decode float4 pixel into bins.
    // NOTE(review): assumes samples are normalized to [0, 1) — a sample of
    // exactly 1.0 would index one past the last bin; confirm against callers.
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        float* samples = reinterpret_cast<float*>(&pixel);

        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
    }

    // Decode uchar4 pixel into bins
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);

        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
    }

    // Decode uchar1 pixel into bins
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        bins[0] = (unsigned int) pixel.x;
    }

    // First-pass histogram kernel (binning into privatized counters).
    // Each block owns a NUM_PARTS-stride slice of `out` and accumulates a
    // partial histogram there with global-memory atomics.
    template <
        int         NUM_PARTS,
        int         ACTIVE_CHANNELS,
        int         NUM_BINS,
        typename    PixelType>
    __global__ void histogram_gmem_atomics(
        const PixelType *in,
        int width,
        int height,
        unsigned int *out)
    {
        // global position and size
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int nx = blockDim.x * gridDim.x;
        int ny = blockDim.y * gridDim.y;

        // threads in workgroup
        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
        int nt = blockDim.x * blockDim.y; // total threads in workgroup

        // group index in 0..ngroups-1
        int g = blockIdx.x + blockIdx.y * gridDim.x;

        // zero this block's slice of the partial-histogram array
        unsigned int *gmem = out + g * NUM_PARTS;
        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS; i += nt)
            gmem[i] = 0;
        __syncthreads();

        // process pixels (updates our group's partial histogram in gmem)
        for (int col = x; col < width; col += nx)
        {
            for (int row = y; row < height; row += ny)
            {
                PixelType pixel = in[row * width + col];

                unsigned int bins[ACTIVE_CHANNELS];
                DecodePixel<NUM_BINS>(pixel, bins);

                #pragma unroll
                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
                    atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
            }
        }
    }

    // Second-pass histogram kernel (accumulation): one thread per final bin
    // sums that bin across all n per-block partial histograms.
    template <
        int         NUM_PARTS,
        int         ACTIVE_CHANNELS,
        int         NUM_BINS>
    __global__ void histogram_gmem_accum(
        const unsigned int *in,
        int n,
        unsigned int *out)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        // FIX: was `i > ACTIVE_CHANNELS * NUM_BINS`, which let the thread with
        // i == ACTIVE_CHANNELS * NUM_BINS read and write one element past the
        // end of the histogram (out-of-bounds on `out`).
        if (i >= ACTIVE_CHANNELS * NUM_BINS)
            return;     // out of range

        unsigned int total = 0;
        for (int j = 0; j < n; j++)
            total += in[i + NUM_PARTS * j];

        out[i] = total;
    }

}   // namespace histogram_gmem_atomics


// Host driver: runs the two-pass global-memory-atomics histogram over a
// width x height image and returns the elapsed GPU time in milliseconds.
// `d_hist` must hold ACTIVE_CHANNELS * NUM_BINS counters.
template <
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
double run_gmem_atomics(
    PixelType *d_image,
    int width,
    int height,
    unsigned int *d_hist,
    bool warmup)
{
    enum
    {
        NUM_PARTS = 1024
    };

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, 0);

    dim3 block(32, 4);
    dim3 grid(16, 16);
    int total_blocks = grid.x * grid.y;

    // allocate partial histogram (one NUM_PARTS slice per first-pass block)
    unsigned int *d_part_hist;
    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));

    dim3 block2(128);
    // FIX: was (3 * NUM_BINS + block.x - 1) / block.x — a hard-coded
    // 3-channel count and the *first-pass* block size. Size the accumulation
    // launch from ACTIVE_CHANNELS and block2 so exactly one thread per output
    // bin is guaranteed for any channel count.
    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block2.x - 1) / block2.x);

    GpuTimer gpu_timer;
    gpu_timer.Start();

    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
        d_image,
        width,
        height,
        d_part_hist);

    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
        d_part_hist,
        total_blocks,
        d_hist);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    cudaFree(d_part_hist);

    return elapsed_millis;
}

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/tune/Makefile:
--------------------------------------------------------------------------------
#/******************************************************************************
# * Copyright (c) 2011, Duane Merrill. All rights reserved.
# * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions are met:
# *     * Redistributions of source code must retain the above copyright
# *       notice, this list of conditions and the following disclaimer.
# *     * Redistributions in binary form must reproduce the above copyright
# *       notice, this list of conditions and the following disclaimer in the
# *       documentation and/or other materials provided with the distribution.
# *     * Neither the name of the NVIDIA CORPORATION nor the
# *       names of its contributors may be used to endorse or promote products
# *       derived from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *
#******************************************************************************/

#-------------------------------------------------------------------------------
# Build script for project
#-------------------------------------------------------------------------------

NVCC = "$(shell which nvcc)"
NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))

# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

#-------------------------------------------------------------------------------
# Libs
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
# Includes
#-------------------------------------------------------------------------------

INC = -I. -I.. -I../test

#-------------------------------------------------------------------------------
# Libs
#-------------------------------------------------------------------------------

LIBS += -lcudart

#-------------------------------------------------------------------------------
# Defines
#-------------------------------------------------------------------------------

DEFINES =

#-------------------------------------------------------------------------------
# SM Arch
#-------------------------------------------------------------------------------

# Target SM architecture; override on the command line with `make sm=350`.
ifdef sm
    SM_ARCH = $(sm)
else
    SM_ARCH = 200
endif

# Only one arch per tuning binary
ifeq (350, $(findstring 350, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_35
    SM_ARCH = 350
endif
ifeq (300, $(findstring 300, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_30
    SM_ARCH = 300
endif
ifeq (200, $(findstring 200, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_20
    SM_ARCH = 200
endif
ifeq (130, $(findstring 130, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_13
    SM_ARCH = 130
endif
ifeq (110, $(findstring 110, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_11
    SM_ARCH = 110
endif
ifeq (100, $(findstring 100, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_10
    SM_ARCH = 100
endif


#-------------------------------------------------------------------------------
# Compiler Flags
#-------------------------------------------------------------------------------

NVCCFLAGS = -Xptxas -v -Xcudafe -\#

# Help the compiler/linker work with huge numbers of kernels on Windows
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
    NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
endif

# 32/64-bit (32-bit device pointers by default)
ifeq ($(force32), 1)
    CPU_ARCH = -m32
    CPU_ARCH_SUFFIX = i386
else
    CPU_ARCH = -m64
    CPU_ARCH_SUFFIX = x86_64
endif

# CUDA ABI enable/disable (enabled by default)
ifneq ($(abi), 0)
    ABI_SUFFIX = abi
else
    NVCCFLAGS += -Xptxas -abi=no
    ABI_SUFFIX = noabi
endif

# NVVM/Open64 middle-end compiler (nvvm by default)
ifeq ($(open64), 1)
    NVCCFLAGS += -open64
    PTX_SUFFIX = open64
else
    PTX_SUFFIX = nvvm
endif

# Verbose toolchain output from nvcc
ifeq ($(verbose), 1)
    NVCCFLAGS += -v
endif

# Keep intermediate compilation artifacts
ifeq ($(keep), 1)
    NVCCFLAGS += -keep
endif

# Data type size to compile a schmoo binary for
ifdef tunesize
    TUNE_SIZE = $(tunesize)
else
    TUNE_SIZE = 4
endif


# Per-configuration binary suffix:
# <tune-size>B_sm<arch>_<middle-end>_<nvcc-version>_<abi>_<cpu-arch>
SUFFIX = $(TUNE_SIZE)B_sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CPU_ARCH_SUFFIX)

#-------------------------------------------------------------------------------
# Dependency Lists
#-------------------------------------------------------------------------------

# Recursive wildcard: $(call rwildcard,dir/,pattern) lists matches in all subdirs
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))

DEPS =	./Makefile \
		../test/test_util.h \
		$(call rwildcard,../cub/,*.cuh)


#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------

# NOTE: default target intentionally builds nothing; invoke a named target.
default:


#-------------------------------------------------------------------------------
# make clean
#-------------------------------------------------------------------------------

clean :
	rm -f bin/*$(CPU_ARCH_SUFFIX)*
	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o



#-------------------------------------------------------------------------------
# make tune_device_reduce
#-------------------------------------------------------------------------------

tune_device_reduce: bin/tune_device_reduce_$(SUFFIX)

bin/tune_device_reduce_$(SUFFIX) : tune_device_reduce.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/tune_device_reduce_$(SUFFIX) tune_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/util_arch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Static architectural properties by SM version.
 */

#pragma once

#include "util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Cooperative groups are available from CUDA 9 onward; allow the user to
// pre-define CUB_USE_COOPERATIVE_GROUPS to force the choice.
#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
    #define CUB_USE_COOPERATIVE_GROUPS
#endif

/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
#ifndef CUB_PTX_ARCH
    #ifndef __CUDA_ARCH__
        #define CUB_PTX_ARCH 0
    #else
        #define CUB_PTX_ARCH __CUDA_ARCH__
    #endif
#endif


/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
#ifndef CUB_RUNTIME_FUNCTION
    // Device-side runtime calls require sm_35+ with relocatable device code.
    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
        #define CUB_RUNTIME_ENABLED
        #define CUB_RUNTIME_FUNCTION __host__ __device__
    #else
        #define CUB_RUNTIME_FUNCTION __host__
    #endif
#endif


/// Number of threads per warp
#ifndef CUB_LOG_WARP_THREADS
    #define CUB_LOG_WARP_THREADS(arch)                      \
        (5)
    #define CUB_WARP_THREADS(arch)                          \
        (1 << CUB_LOG_WARP_THREADS(arch))

    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
#endif


/// Number of smem banks
#ifndef CUB_LOG_SMEM_BANKS
    #define CUB_LOG_SMEM_BANKS(arch)                        \
        ((arch >= 200) ?                                    \
            (5) :                                           \
            (4))
    #define CUB_SMEM_BANKS(arch)                            \
        (1 << CUB_LOG_SMEM_BANKS(arch))

    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
#endif


/// Oversubscription factor
#ifndef CUB_SUBSCRIPTION_FACTOR
    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
        ((arch >= 300) ?                                    \
            (5) :                                           \
            ((arch >= 200) ?                                \
                (3) :                                       \
                (10)))
    #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
#endif


/// Prefer padding overhead vs X-way conflicts greater than this threshold
#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
        ((arch >= 300) ?                                    \
            (1) :                                           \
            (4))
    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
#endif


/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data.  Minimum of two warps.
#ifndef CUB_SCALED_BLOCK_THREADS
    #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)             \
        (CUB_MIN(                                                                       \
            NOMINAL_4B_BLOCK_THREADS,                                                   \
            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                       \
                2,                                                                      \
                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
#endif

/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data.  Minimum 1 item per thread
#ifndef CUB_SCALED_ITEMS_PER_THREAD
    #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                     \
        CUB_MAX(                                                                                                                \
            1,                                                                                                                  \
            (sizeof(T) < 4) ?                                                                                                   \
                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 :  \
                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))
#endif

/// Define both nominal threads-per-block and items-per-thread
// NOTE: scaling here is pinned to the sm_20 configuration (PTX_ARCH 200).
#ifndef CUB_SCALED_GRANULARITIES
    #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)      \
        CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                 \
        CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
#endif



#endif  // Do not document

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_smem_atomics.h:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | namespace histogram_smem_atomics 31 | { 32 | // Decode float4 pixel into bins 33 | template 34 | __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 35 | { 36 | float* samples = reinterpret_cast(&pixel); 37 | 38 | #pragma unroll 39 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 40 | bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS)); 41 | } 42 | 43 | // Decode uchar4 pixel into bins 44 | template 45 | __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 46 | { 47 | unsigned char* samples = reinterpret_cast(&pixel); 48 | 49 | #pragma unroll 50 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 51 | bins[CHANNEL] = (unsigned int) (samples[CHANNEL]); 52 | } 53 | 54 | // Decode uchar1 pixel into bins 55 | template 56 | __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 57 | { 58 | bins[0] = (unsigned int) pixel.x; 59 | } 60 | 61 | // First-pass histogram kernel (binning into privatized counters) 62 | template < 63 | int NUM_PARTS, 64 | int ACTIVE_CHANNELS, 65 | int NUM_BINS, 66 | typename PixelType> 67 | __global__ void histogram_smem_atomics( 68 | const PixelType *in, 69 | int width, 70 | int height, 71 | unsigned int *out) 72 | { 73 | // global position and size 74 | int x = blockIdx.x * blockDim.x + threadIdx.x; 75 | int y = blockIdx.y * blockDim.y + threadIdx.y; 76 | int nx = blockDim.x * gridDim.x; 77 | int ny = blockDim.y * gridDim.y; 78 | 79 | // threads in workgroup 80 | int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1 81 | int nt = blockDim.x * blockDim.y; // total threads in workgroup 82 | 83 | // group index in 0..ngroups-1 84 | int g = blockIdx.x + blockIdx.y * gridDim.x; 85 | 86 | // initialize smem 87 | __shared__ 
unsigned int smem[ACTIVE_CHANNELS * NUM_BINS + 3]; 88 | for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS + 3; i += nt) 89 | smem[i] = 0; 90 | __syncthreads(); 91 | 92 | // process pixels 93 | // updates our group's partial histogram in smem 94 | for (int col = x; col < width; col += nx) 95 | { 96 | for (int row = y; row < height; row += ny) 97 | { 98 | PixelType pixel = in[row * width + col]; 99 | 100 | unsigned int bins[ACTIVE_CHANNELS]; 101 | DecodePixel(pixel, bins); 102 | 103 | #pragma unroll 104 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 105 | atomicAdd(&smem[(NUM_BINS * CHANNEL) + bins[CHANNEL] + CHANNEL], 1); 106 | } 107 | } 108 | 109 | __syncthreads(); 110 | 111 | // move to our workgroup's slice of output 112 | out += g * NUM_PARTS; 113 | 114 | // store local output to global 115 | for (int i = t; i < NUM_BINS; i += nt) 116 | { 117 | #pragma unroll 118 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 119 | out[i + NUM_BINS * CHANNEL] = smem[i + NUM_BINS * CHANNEL + CHANNEL]; 120 | } 121 | } 122 | 123 | // Second pass histogram kernel (accumulation) 124 | template < 125 | int NUM_PARTS, 126 | int ACTIVE_CHANNELS, 127 | int NUM_BINS> 128 | __global__ void histogram_smem_accum( 129 | const unsigned int *in, 130 | int n, 131 | unsigned int *out) 132 | { 133 | int i = blockIdx.x * blockDim.x + threadIdx.x; 134 | if (i > ACTIVE_CHANNELS * NUM_BINS) return; // out of range 135 | unsigned int total = 0; 136 | for (int j = 0; j < n; j++) 137 | total += in[i + NUM_PARTS * j]; 138 | out[i] = total; 139 | } 140 | 141 | } // namespace histogram_smem_atomics 142 | 143 | 144 | template < 145 | int ACTIVE_CHANNELS, 146 | int NUM_BINS, 147 | typename PixelType> 148 | double run_smem_atomics( 149 | PixelType *d_image, 150 | int width, 151 | int height, 152 | unsigned int *d_hist, 153 | bool warmup) 154 | { 155 | enum 156 | { 157 | NUM_PARTS = 1024 158 | }; 159 | 160 | cudaDeviceProp props; 161 | cudaGetDeviceProperties(&props, 0); 162 | 163 | 
dim3 block(32, 4); 164 | dim3 grid(16, 16); 165 | int total_blocks = grid.x * grid.y; 166 | 167 | // allocate partial histogram 168 | unsigned int *d_part_hist; 169 | cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int)); 170 | 171 | dim3 block2(128); 172 | dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x); 173 | 174 | GpuTimer gpu_timer; 175 | gpu_timer.Start(); 176 | 177 | histogram_smem_atomics::histogram_smem_atomics<<>>( 178 | d_image, 179 | width, 180 | height, 181 | d_part_hist); 182 | 183 | histogram_smem_atomics::histogram_smem_accum<<>>( 184 | d_part_hist, 185 | total_blocks, 186 | d_hist); 187 | 188 | gpu_timer.Stop(); 189 | float elapsed_millis = gpu_timer.ElapsedMillis(); 190 | 191 | cudaFree(d_part_hist); 192 | 193 | return elapsed_millis; 194 | } 195 | 196 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/iterator/discard_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../util_namespace.cuh" 40 | #include "../util_macro.cuh" 41 | 42 | #if (THRUST_VERSION >= 100700) 43 | // This iterator is compatible with Thrust API 1.7 and newer 44 | #include 45 | #include 46 | #endif // THRUST_VERSION 47 | 48 | 49 | /// Optional outer namespace(s) 50 | CUB_NS_PREFIX 51 | 52 | /// CUB namespace 53 | namespace cub { 54 | 55 | 56 | /** 57 | * \addtogroup UtilIterator 58 | * @{ 59 | */ 60 | 61 | 62 | /** 63 | * \brief A discard iterator 64 | */ 65 | template 66 | class DiscardOutputIterator 67 | { 68 | public: 69 | 70 | // Required iterator traits 71 | typedef DiscardOutputIterator self_type; ///< My own type 72 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 73 | typedef void value_type; ///< The type of the element the iterator can point to 74 | typedef void pointer; ///< The type of a pointer to an element the iterator can point to 75 | typedef void reference; ///< The type of a reference to an element the iterator can point to 76 | 77 | #if (THRUST_VERSION >= 100700) 78 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 79 | typedef typename thrust::detail::iterator_facade_category< 80 | thrust::any_system_tag, 81 | thrust::random_access_traversal_tag, 82 | value_type, 83 | reference 84 | >::type iterator_category; ///< The iterator category 85 | #else 86 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 87 | #endif // THRUST_VERSION 88 | 89 | private: 90 | 91 | OffsetT offset; 92 | 93 | #if defined(_WIN32) || !defined(_WIN64) 94 | // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 95 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 
1))]; 96 | #endif 97 | 98 | public: 99 | 100 | /// Constructor 101 | __host__ __device__ __forceinline__ DiscardOutputIterator( 102 | OffsetT offset = 0) ///< Base offset 103 | : 104 | offset(offset) 105 | {} 106 | 107 | /// Postfix increment 108 | __host__ __device__ __forceinline__ self_type operator++(int) 109 | { 110 | self_type retval = *this; 111 | offset++; 112 | return retval; 113 | } 114 | 115 | /// Prefix increment 116 | __host__ __device__ __forceinline__ self_type operator++() 117 | { 118 | offset++; 119 | return *this; 120 | } 121 | 122 | /// Indirection 123 | __host__ __device__ __forceinline__ self_type& operator*() 124 | { 125 | // return self reference, which can be assigned to anything 126 | return *this; 127 | } 128 | 129 | /// Addition 130 | template 131 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 132 | { 133 | self_type retval(offset + n); 134 | return retval; 135 | } 136 | 137 | /// Addition assignment 138 | template 139 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 140 | { 141 | offset += n; 142 | return *this; 143 | } 144 | 145 | /// Subtraction 146 | template 147 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 148 | { 149 | self_type retval(offset - n); 150 | return retval; 151 | } 152 | 153 | /// Subtraction assignment 154 | template 155 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 156 | { 157 | offset -= n; 158 | return *this; 159 | } 160 | 161 | /// Distance 162 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 163 | { 164 | return offset - other.offset; 165 | } 166 | 167 | /// Array subscript 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator[](Distance n) 170 | { 171 | // return self reference, which can be assigned to anything 172 | return *this; 173 | } 174 | 175 | /// Structure dereference 176 | __host__ __device__ __forceinline__ pointer operator->() 177 | { 
178 | return; 179 | } 180 | 181 | /// Assignment to self (no-op) 182 | __host__ __device__ __forceinline__ void operator=(self_type const& other) 183 | { 184 | offset = other.offset; 185 | } 186 | 187 | /// Assignment to anything else (no-op) 188 | template 189 | __host__ __device__ __forceinline__ void operator=(T const&) 190 | {} 191 | 192 | /// Cast to void* operator 193 | __host__ __device__ __forceinline__ operator void*() const { return NULL; } 194 | 195 | /// Equal to 196 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 197 | { 198 | return (offset == rhs.offset); 199 | } 200 | 201 | /// Not equal to 202 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 203 | { 204 | return (offset != rhs.offset); 205 | } 206 | 207 | /// ostream operator 208 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 209 | { 210 | os << "[" << itr.offset << "]"; 211 | return os; 212 | } 213 | 214 | }; 215 | 216 | 217 | /** @} */ // end group UtilIterator 218 | 219 | } // CUB namespace 220 | CUB_NS_POSTFIX // Optional outer namespace(s) 221 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from distutils.errors import ( 3 | DistutilsPlatformError, 4 | ) 5 | from setuptools import setup, find_packages 6 | from setuptools_rust import Binding, RustExtension 7 | import sys 8 | import platform 9 | import shutil 10 | import sys 11 | import tempfile 12 | import urllib.request 13 | from tqdm import tqdm 14 | 15 | 16 | _nccl_records = [] 17 | library_records = {} 18 | 19 | 20 | class DownloadProgressBar(tqdm): 21 | def update_to(self, b=1, bsize=1, tsize=None): 22 | if tsize is not None: 23 | self.total = tsize 24 | self.update(b * bsize - self.n) 25 | 26 | 27 | def download_url(url, output_path): 28 | with DownloadProgressBar(unit='B', unit_scale=True, 29 | miniters=1, 
desc=url.split('/')[-1]) as t: 30 | urllib.request.urlretrieve( 31 | url, filename=output_path, reporthook=t.update_to) 32 | 33 | 34 | def _make_nccl_url(public_version, filename): 35 | # https://developer.download.nvidia.com/compute/redist/nccl/v2.8/nccl_2.8.4-1+cuda11.2_x86_64.txz 36 | return ( 37 | "https://developer.download.nvidia.com/compute/redist/nccl/" 38 | + "v{}/{}".format(public_version, filename) 39 | ) 40 | 41 | 42 | def _make_nccl_record(cuda_version, full_version, public_version, filename_linux): 43 | return { 44 | "cuda": cuda_version, 45 | "nccl": full_version, 46 | "assets": { 47 | "Linux": { 48 | "url": _make_nccl_url(public_version, filename_linux), 49 | "filename": "libnccl.so.{}".format(full_version), 50 | }, 51 | }, 52 | } 53 | 54 | 55 | _nccl_records.append( 56 | _make_nccl_record("11.4", "2.10.3", "2.10", 57 | "nccl_2.10.3-1+cuda11.4_x86_64.txz") 58 | ) 59 | _nccl_records.append( 60 | _make_nccl_record("11.3", "2.10.3", "2.10", 61 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 62 | ) 63 | _nccl_records.append( 64 | _make_nccl_record("11.2", "2.10.3", "2.10", 65 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 66 | ) 67 | _nccl_records.append( 68 | _make_nccl_record("11.1", "2.10.3", "2.10", 69 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 70 | ) 71 | _nccl_records.append( 72 | _make_nccl_record("11.0", "2.10.3", "2.10", 73 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 74 | ) 75 | _nccl_records.append( 76 | _make_nccl_record("10.2", "2.10.3", "2.10", 77 | "nccl_2.10.3-1+cuda10.2_x86_64.txz") 78 | ) 79 | _nccl_records.append( 80 | _make_nccl_record("10.1", "2.10.3", "2.10", 81 | "nccl_2.10.3-1+cuda10.2_x86_64.txz") 82 | ) 83 | library_records["nccl"] = _nccl_records 84 | 85 | 86 | def install_baguanet(url, destination): 87 | with tempfile.TemporaryDirectory() as tmpdir: 88 | filename = os.path.join(tmpdir, os.path.basename(url)) 89 | print("Downloading {}...".format(url)) 90 | download_url(url, filename) 91 | outdir = os.path.join(tmpdir, "extract") 92 | 
shutil.unpack_archive(filename, outdir) 93 | lib_dir = os.path.join(outdir, 'build') 94 | for filename in os.listdir(lib_dir): 95 | shutil.move(os.path.join(lib_dir, filename), destination) 96 | 97 | 98 | def install_lib(cuda, prefix, library): 99 | record = None 100 | lib_records = library_records 101 | for record in lib_records[library]: 102 | if record["cuda"] == cuda: 103 | break 104 | else: 105 | raise RuntimeError( 106 | """ 107 | The CUDA version({}) specified is not supported. 108 | Should be one of {}.""".format( 109 | cuda, str([x["cuda"] for x in lib_records[library]]) 110 | ) 111 | ) 112 | if prefix is None: 113 | prefix = os.path.expanduser("~/.bagua_core/cuda_lib") 114 | destination = calculate_destination(prefix, cuda, library, record[library]) 115 | 116 | if os.path.exists(destination): 117 | print("The destination directory {} already exists.".format(destination)) 118 | shutil.rmtree(destination) 119 | 120 | target_platform = platform.system() 121 | asset = record["assets"].get(target_platform, None) 122 | if asset is None: 123 | raise RuntimeError( 124 | """ 125 | The current platform ({}) is not supported.""".format( 126 | target_platform 127 | ) 128 | ) 129 | 130 | print( 131 | "Installing {} {} for CUDA {} to: {}".format( 132 | library, record[library], record["cuda"], destination 133 | ) 134 | ) 135 | 136 | url = asset["url"] 137 | print("Downloading {}...".format(url)) 138 | with tempfile.TemporaryDirectory() as tmpdir: 139 | filename = os.path.join(tmpdir, os.path.basename(url)) 140 | download_url(url, filename) 141 | print("Extracting...") 142 | outdir = os.path.join(tmpdir, "extract") 143 | shutil.unpack_archive(filename, outdir) 144 | print("Installing...") 145 | if library == "nccl": 146 | subdir = os.listdir(outdir) 147 | assert len(subdir) == 1 148 | shutil.move(os.path.join(outdir, subdir[0]), destination) 149 | 150 | # Install bagua-net 151 | dst_dir = os.path.join(destination, 'bagua-net') 152 | os.mkdir(dst_dir) 153 | 
install_baguanet( 154 | "https://github.com/BaguaSys/bagua-net/releases/download/v0.1.1/bagua-net_refs.tags.v0.1.1_x86_64.tar.gz", 155 | dst_dir) 156 | else: 157 | assert False 158 | print("Cleaning up...") 159 | print("Done!") 160 | 161 | 162 | def calculate_destination(prefix, cuda, lib, lib_ver): 163 | """Calculates the installation directory.""" 164 | return os.path.join(prefix, ".data") 165 | 166 | 167 | def check_torch_version(): 168 | try: 169 | import torch 170 | except ImportError: 171 | print("import torch failed, is it installed?") 172 | 173 | version = torch.__version__ 174 | if version is None: 175 | raise DistutilsPlatformError( 176 | "Unable to determine PyTorch version from the version string '%s'" 177 | % torch.__version__ 178 | ) 179 | return version 180 | 181 | 182 | def install_dependency_library(): 183 | nvcc_version = ( 184 | os.popen( 185 | "nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'" 186 | ) 187 | .read() 188 | .strip() 189 | ) 190 | print("nvcc_version: ", nvcc_version) 191 | install_lib(nvcc_version, os.path.join(cwd, "python/bagua_core"), "nccl") 192 | 193 | 194 | if __name__ == "__main__": 195 | import colorama 196 | colorama.init(autoreset=True) 197 | cwd = os.path.dirname(os.path.abspath(__file__)) 198 | 199 | if int(os.getenv("BAGUA_NO_INSTALL_DEPS", 0)) == 0 and \ 200 | len(sys.argv) > 1 and sys.argv[1] in ["install", "bdist_wheel"]: 201 | print( 202 | colorama.Fore.BLACK 203 | + colorama.Back.CYAN 204 | + "Bagua is automatically installing some system dependencies like NCCL, to disable set env variable BAGUA_NO_INSTALL_DEPS=1", 205 | ) 206 | install_dependency_library() 207 | 208 | setup( 209 | name="bagua-core", 210 | use_scm_version={"local_scheme": "no-local-version"}, 211 | setup_requires=["setuptools_scm"], 212 | url="https://github.com/BaguaSys/bagua-core", 213 | python_requires=">=3.6", 214 | description="Core communication lib for Bagua.", 215 | package_dir={"": "python/"}, 216 | 
packages=find_packages("python/"), 217 | package_data={"": [".data/lib/libnccl.so", 218 | ".data/bagua-net/libbagua_net.so", 219 | ".data/bagua-net/libnccl-net.so"]}, 220 | rust_extensions=[ 221 | RustExtension( 222 | "bagua_core.bagua_core", 223 | path="bagua-core-py/Cargo.toml", 224 | binding=Binding.PyO3, 225 | native=True, 226 | ) 227 | ], 228 | author="Kuaishou AI Platform & DS3 Lab", 229 | author_email="admin@mail.xrlian.com", 230 | install_requires=[ 231 | "setuptools_rust", 232 | "colorama", 233 | ], 234 | zip_safe=False, 235 | ) 236 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIteratorTto a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 
70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIteratorTto 74 | * dereference a sequence of incrementing integers. 75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename OffsetT = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) 
///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + (ValueType) n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += (ValueType) n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - (ValueType) n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return (difference_type) (val - other.val); 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + (ValueType) n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not 
equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | --------------------------------------------------------------------------------