├── python └── bagua_core │ ├── version.py │ ├── __init__.py │ └── _environment.py ├── bagua-core-internal ├── third_party │ └── cub-1.8.0 │ │ ├── tune │ │ ├── .gitignore │ │ └── Makefile │ │ ├── experimental │ │ ├── .gitignore │ │ ├── spmv_script.sh │ │ ├── histogram │ │ │ ├── histogram_cub.h │ │ │ ├── histogram_gmem_atomics.h │ │ │ └── histogram_smem_atomics.h │ │ └── Makefile │ │ ├── .settings │ │ ├── .gitignore │ │ ├── org.eclipse.cdt.ui.prefs │ │ └── org.eclipse.core.runtime.prefs │ │ ├── test │ │ ├── .gitignore │ │ ├── link_main.cpp │ │ ├── link_a.cu │ │ ├── link_b.cu │ │ ├── test_grid_barrier.cu │ │ └── mersenne.h │ │ ├── examples │ │ ├── block │ │ │ ├── .gitignore │ │ │ ├── reduce_by_key.cu │ │ │ └── Makefile │ │ └── device │ │ │ ├── .gitignore │ │ │ ├── example_device_reduce.cu │ │ │ └── example_device_scan.cu │ │ ├── .project │ │ ├── LICENSE.TXT │ │ ├── cub │ │ ├── util_namespace.cuh │ │ ├── block │ │ │ ├── specializations │ │ │ │ └── block_histogram_atomic.cuh │ │ │ └── block_raking_layout.cuh │ │ ├── cub.cuh │ │ ├── util_macro.cuh │ │ ├── grid │ │ │ ├── grid_mapping.cuh │ │ │ └── grid_barrier.cuh │ │ ├── host │ │ │ └── mutex.cuh │ │ ├── thread │ │ │ ├── thread_search.cuh │ │ │ └── thread_reduce.cuh │ │ ├── util_debug.cuh │ │ ├── util_arch.cuh │ │ └── iterator │ │ │ ├── discard_output_iterator.cuh │ │ │ └── counting_input_iterator.cuh │ │ └── README.md ├── src │ ├── cuda_utils.rs │ ├── comm_ops │ │ ├── mod.rs │ │ ├── python_ffi_op.rs │ │ ├── centralized_full_precision_synchronous.rs │ │ ├── centralized_low_precision_synchronous.rs │ │ ├── decentralized_full_precision_synchronous.rs │ │ ├── decentralized_full_precision_asynchronous.rs │ │ └── decentralized_low_precision_synchronous.rs │ ├── events.rs │ ├── resource_pool │ │ └── mod.rs │ └── kernels │ │ └── mod.rs ├── cpp │ └── include │ │ └── bagua_utils.h ├── Cargo.toml └── build.rs ├── Cargo.toml ├── .gitignore ├── .gitmodules ├── pyproject.toml ├── README.md ├── .github ├── workflows │ ├── 
check-package-install.yml │ ├── rustfmt.yml │ └── pypi-publish.yml └── dependabot.yml ├── MANIFEST.in ├── bagua-opentelemetry ├── Cargo.toml └── src │ ├── lib.rs │ └── exporter │ ├── agent.rs │ └── mod.rs ├── bagua-core-c ├── Cargo.toml └── src │ └── lib.rs ├── bagua-core-py └── Cargo.toml ├── LICENSE ├── CHANGELOG.md └── setup.py /python/bagua_core/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "dev" 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/tune/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /language.settings.xml 2 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /link_main.obj 3 | /dummy/ 4 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | workspace = { members = [ 2 | "bagua-core-internal", 3 | "bagua-core-py", 4 | "bagua-opentelemetry", 5 | ], exclude = [] } 6 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/.gitignore: 
-------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /Release 4 | /cuda55.sdf 5 | /cuda55.suo 6 | /cuda60.sdf 7 | /cuda60.suo 8 | -------------------------------------------------------------------------------- /python/bagua_core/__init__.py: -------------------------------------------------------------------------------- 1 | from . import _environment 2 | _environment._preload_libraries() 3 | from .version import __version__ 4 | from .bagua_core import * 5 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | formatter_profile=_B40C 3 | formatter_settings_version=1 4 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /ipch 4 | /Release 5 | /cuda55.sdf 6 | /cuda55.suo 7 | /cuda60.sdf 8 | /cuda60.suo 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | /data/ 4 | scratchpad.org 5 | build.sh 6 | push.sh 7 | __pycache__/ 8 | *.egg-info/ 9 | /dist/ 10 | /.eggs/ 11 | /build/ 12 | .data/ 13 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bagua-core-internal/third_party/Aluminum"] 2 | path = bagua-core-internal/third_party/Aluminum 3 | url = https://github.com/BaguaSys/Aluminum.git 4 | branch = bagua 5 | -------------------------------------------------------------------------------- 
/bagua-core-internal/third_party/cub-1.8.0/test/link_main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | extern void a(); 4 | extern void b(); 5 | 6 | int main() 7 | { 8 | printf("hello world\n"); 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /python/bagua_core/_environment.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import os 3 | 4 | 5 | def _preload_libraries(): 6 | cwd = os.path.dirname(os.path.abspath(__file__)) 7 | libnccl_path = os.path.join(cwd, ".data", "lib", "libnccl.so") 8 | ctypes.CDLL(libnccl_path) 9 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs: -------------------------------------------------------------------------------- 1 | content-types/enabled=true 2 | content-types/org.eclipse.cdt.core.cxxHeader/file-extensions=cuh 3 | content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu 4 | eclipse.preferences.version=1 5 | -------------------------------------------------------------------------------- /bagua-core-internal/src/cuda_utils.rs: -------------------------------------------------------------------------------- 1 | pub unsafe fn cuda_memcpy_device_to_host_sync(host_ptr: u64, device_ptr: u64, num_bytes: i32) { 2 | cpp::cpp!([host_ptr as "void*", device_ptr as "void*", num_bytes as "int"] 3 | { 4 | CUDACHECK(cudaMemcpy(host_ptr, device_ptr, num_bytes, cudaMemcpyDeviceToHost)); 5 | }); 6 | } 7 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 43.0.0", "wheel", "setuptools-rust", "colorama", "tqdm", "setuptools_scm[toml]>=6.0"] 3 | build-backend = 
'setuptools.build_meta' 4 | 5 | [tool.setuptools_scm] 6 | local_scheme = "no-local-version" 7 | write_to = "python/bagua_core/version.py" 8 | write_to_template = "__version__ = \"{version}\"" 9 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/link_a.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void a() 4 | { 5 | printf("a() called\n"); 6 | 7 | cub::DoubleBuffer d_keys; 8 | cub::DoubleBuffer d_values; 9 | size_t temp_storage_bytes = 0; 10 | cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); 11 | } 12 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/link_b.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void b() 4 | { 5 | printf("b() called\n"); 6 | 7 | cub::DoubleBuffer d_keys; 8 | cub::DoubleBuffer d_values; 9 | size_t temp_storage_bytes = 0; 10 | cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024); 11 | } 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | `bagua-core` has been merged into [Bagua](https://github.com/BaguaSys/bagua)! 2 | ===== 3 | 4 | [![PyPI version](https://badge.fury.io/py/bagua-core.svg)](https://badge.fury.io/py/bagua-core) [![GitHub license](https://img.shields.io/github/license/BaguaSys/bagua-core)](https://github.com/BaguaSys/bagua-core/blob/master/LICENSE) 5 | 6 | The core communication logic for [Bagua](https://github.com/BaguaSys/bagua). 
7 | 8 | * PyPI: https://pypi.org/project/bagua-core/ 9 | -------------------------------------------------------------------------------- /.github/workflows/check-package-install.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: check package install 10 | 11 | jobs: 12 | check: 13 | runs-on: ubuntu-latest 14 | container: ikzk/bagua-ci:latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | with: 18 | submodules: recursive 19 | - run: rustup default stable 20 | - run: pip install . 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/rustfmt.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: Rustfmt 10 | 11 | jobs: 12 | format: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions-rs/toolchain@v1 17 | with: 18 | toolchain: stable 19 | components: rustfmt 20 | override: true 21 | - uses: mbrobbel/rustfmt-check@master 22 | with: 23 | token: ${{ secrets.GITHUB_TOKEN }} 24 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Cargo.toml 2 | include bagua-core-internal/Cargo.toml 3 | include bagua-core-internal/build.rs 4 | recursive-include bagua-core-internal/src * 5 | recursive-include bagua-core-internal/kernels * 6 | recursive-include bagua-core-internal/cpp * 7 | recursive-include bagua-core-internal/third_party * 8 | include bagua-core-py/Cargo.toml 9 | include bagua-core-py/build.rs 10 | recursive-include bagua-core-py/src * 11 | include bagua-core-c/Cargo.toml 12 | include bagua-core-c/build.rs 13 | recursive-include bagua-core-c/src * 14 | recursive-include python * 15 | exclude bagua-core-internal/third_party/Aluminum/.git 16 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod centralized_full_precision_synchronous; 2 | pub mod centralized_low_precision_synchronous; 3 | pub mod decentralized_full_precision_asynchronous; 4 | pub mod decentralized_full_precision_synchronous; 5 | pub mod decentralized_low_precision_synchronous; 6 | pub mod python_ffi_op; 7 | 8 | use crate::datatypes::BaguaBucket; 9 | use crate::{BaguaCommOpChannels, BaguaCoreError}; 10 | use std::fmt::Debug; 11 | use std::sync::Arc; 12 | 13 | pub trait CommOpTrait: Debug { 14 | fn execute_background_communication( 15 | &self, 16 | bucket: Arc, 17 | comm_channels: &BaguaCommOpChannels, 18 | ); 19 | } 20 | -------------------------------------------------------------------------------- /bagua-opentelemetry/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-opentelemetry" 3 | version = "0.1.0" 4 | edition = "2018" 5 | publish = ["private"] 6 | 7 | [dependencies] 8 | tracing = "0.1" 9 | async-std 
= { version = "1.10", features = ["attributes", "tokio1"] } 10 | async-trait = { version = "0.1" } 11 | hyper = { version = "0.14", features = ["full"] } 12 | opentelemetry = { version = "0.15", default-features = false, features = [ 13 | "trace", 14 | "rt-async-std", 15 | ] } 16 | serde = { version = "1.0", features = ["derive"] } 17 | serde_json = "1.0" 18 | reqwest = { version = "0.11", features = ["json"] } 19 | tokio = { version = "1", features = ["full"] } 20 | futures = { version = "0.3" } 21 | 22 | tokio-stream = { version = "0.1" } 23 | -------------------------------------------------------------------------------- /bagua-core-c/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-c" 3 | version = "0.1.2" 4 | edition = "2018" 5 | 6 | [lib] 7 | name = "bagua_comm_core_c" 8 | crate-type = ["dylib"] 9 | path = "src/lib.rs" 10 | 11 | [dependencies] 12 | bagua-core-internal = { path = "../bagua-core-internal" } 13 | tracing = "0.1" 14 | tracing-subscriber = "0.2" 15 | anyhow = "1.0" 16 | color-eyre = "0.5" 17 | numpy = "0.14.1" 18 | parking_lot = { version = "0.11", features = ["deadlock_detection"] } 19 | openssl-sys = { version = "*", features = ["vendored"] } 20 | 21 | [dependencies.pyo3] 22 | version = "0.14.1" 23 | features = ["extension-module"] 24 | 25 | [build-dependencies] 26 | shadow-rs = "0.6" 27 | cpp_build = "0.5" 28 | cc = "1.0" 29 | cmd_lib = "1.0" 30 | which = "4.2" 31 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/python_ffi_op.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::datatypes::BaguaBucket; 3 | use crate::BaguaCommOpChannels; 4 | use pyo3::Python; 5 | use std::sync::Arc; 6 | 7 | #[derive(Debug)] 8 | pub struct PythonFFIOp { 9 | pub py_callable: pyo3::Py, 10 | } 11 | 12 | impl CommOpTrait for PythonFFIOp 
{ 13 | fn execute_background_communication( 14 | &self, 15 | bucket: Arc, 16 | _comm_op_channels: &BaguaCommOpChannels, 17 | ) { 18 | Python::with_gil(|python| { 19 | let result = self.py_callable.call1(python, (bucket.name.as_str(),)); 20 | if let Err(e) = result { 21 | tracing::error!("python ffi op error: {:?}", e); 22 | } 23 | }); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod exporter; 2 | 3 | use crate::exporter::agent::AgentAsyncClientHTTP; 4 | use crate::exporter::Exporter; 5 | use opentelemetry; 6 | use opentelemetry::{global, sdk, trace::Tracer, trace::TracerProvider}; 7 | 8 | pub fn init_tracer(autotune_server_addr: &str) -> impl Tracer { 9 | let exporter = Exporter { 10 | uploader: AgentAsyncClientHTTP::new(autotune_server_addr.to_string()), 11 | }; 12 | 13 | let builder = sdk::trace::TracerProvider::builder() 14 | .with_batch_exporter(exporter, opentelemetry::runtime::AsyncStd); 15 | 16 | let tracer_provider = builder.build(); 17 | let tracer = tracer_provider.get_tracer("bagua-opentelemetry", Some(env!("CARGO_PKG_VERSION"))); 18 | let _ = global::set_tracer_provider(tracer_provider); 19 | 20 | tracer 21 | } 22 | -------------------------------------------------------------------------------- /bagua-core-py/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-py" 3 | version = "0.1.2" 4 | authors = ["Xiangru Lian "] 5 | edition = "2018" 6 | publish = ["private"] 7 | 8 | [lib] 9 | name = "bagua_core_py" 10 | crate-type = ["cdylib"] 11 | path = "src/lib.rs" 12 | 13 | [dependencies] 14 | bagua-core-internal = { path = "../bagua-core-internal" } 15 | ndarray = "0.15.3" 16 | tracing = "0.1" 17 | tracing-subscriber = "0.2" 18 | anyhow = "1.0" 19 | color-eyre = "0.5" 20 | numpy = "0.14.1" 21 | parking_lot = 
{ version = "0.11", features = ["deadlock_detection"] } 22 | openssl-sys = { version = "*", features = ["vendored"] } 23 | num-traits = "0.2" 24 | num-derive = "0.3" 25 | 26 | [dependencies.pyo3] 27 | version = "0.14.5" 28 | features = ["extension-module"] 29 | 30 | [build-dependencies] 31 | shadow-rs = "0.7" 32 | cpp_build = "0.5" 33 | cc = "1.0" 34 | cmd_lib = "1.0" 35 | which = "4.2" 36 | -------------------------------------------------------------------------------- /bagua-core-internal/src/events.rs: -------------------------------------------------------------------------------- 1 | use parking_lot::Mutex; 2 | use std::sync::Arc; 3 | 4 | #[derive(Clone, Debug)] 5 | pub struct BaguaEventChannel { 6 | pub name: String, 7 | inner: Arc<(Mutex, parking_lot::Condvar)>, 8 | } 9 | 10 | impl BaguaEventChannel { 11 | pub fn new(name: &str) -> Self { 12 | Self { 13 | name: name.to_string(), 14 | inner: Arc::new((Mutex::new(false), parking_lot::Condvar::new())), 15 | } 16 | } 17 | 18 | pub fn finish(&self) { 19 | let &(ref lock, ref cvar) = &*self.inner; 20 | let mut finished = lock.lock(); 21 | *finished = true; 22 | cvar.notify_all(); 23 | } 24 | 25 | pub fn wait(&self) { 26 | let &(ref lock, ref cvar) = &*self.inner; 27 | let mut finished = lock.lock(); 28 | if !*finished { 29 | cvar.wait(&mut finished); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/spmv_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in 1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384 32768 65536 131072 262144 524288 1048576 2097152 4194304 8388608 16777216 4 | do 5 | echo `date`, `$1 --dense=$i $2 $3 $4 $5 $6 $7` 6 | done 7 | 8 | echo 9 | echo 10 | 11 | for i in `ls /home/dumerrill/graphs/spmv/*.mtx` 12 | do 13 | if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 
14 | then 15 | echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null` 16 | fi 17 | done 18 | 19 | echo 20 | echo 21 | 22 | for i in `ls /scratch/dumerrill/graphs/mtx/*.mtx` 23 | #for i in `ls /cygdrive/w/Dev/UFget/mtx/*.mtx` 24 | do 25 | if [[ ( "`head -n 50 $i | grep complex`" = "" ) && ( "`head -n 50 $i | grep array`" = "" ) ]] 26 | then 27 | echo `date`, `$1 --mtx=$i $2 $3 $4 $5 $6 $7 2>/dev/null` 28 | fi 29 | done 30 | 31 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | GIT_CUB 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 24 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 25 | org.eclipse.cdt.core.ccnature 26 | 27 | 28 | -------------------------------------------------------------------------------- /bagua-core-internal/cpp/include/bagua_utils.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __BAGUA_UTILS_HPP__ 3 | #define __BAGUA_UTILS_HPP__ 4 | 5 | #define CUDACHECK(cmd) do { cudaError_t e = cmd; if( e != cudaSuccess ) { printf("Failed: Cuda error %s:%d '%s'\n", __FILE__,__LINE__,cudaGetErrorString(e)); exit(EXIT_FAILURE); } } while(0) 6 | #define NCCLCHECK(cmd) do { ncclResult_t r = cmd; if (r!= ncclSuccess) { printf("Failed, NCCL error %s:%d '%s'\n", __FILE__,__LINE__,ncclGetErrorString(r)); exit(EXIT_FAILURE); } } while(0) 7 | 8 | #define ALIGN_SIZE(size, align) (((size) + (align) - 1) / (align) * (align)) 9 | #define DIVUP(x, y) (((x)+(y)-1)/(y)) 10 | 11 | #include 12 | 13 | 14 | ncclResult_t ncclAllToAll(void *sendbuf, void *recvbuf, 15 
| size_t count, 16 | ncclDataType_t datatype, 17 | ncclComm_t comm, 18 | int nranks, 19 | int rank, 20 | cudaStream_t stream); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Kuaishou AI Platform & DS3 Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bagua-core-internal/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "bagua-core-internal" 3 | version = "0.1.2" 4 | authors = ["Xiangru Lian "] 5 | edition = "2018" 6 | publish = ["private"] 7 | build = "build.rs" 8 | 9 | [dependencies] 10 | tracing = "0.1" 11 | tracing-subscriber = "0.2" 12 | thiserror = "1" 13 | base64 = "0.13" 14 | itertools = "0.10" 15 | shadow-rs = "0.7" 16 | parking_lot = { version = "0.11", features = ["deadlock_detection"] } 17 | hashbrown = "0.11" 18 | flume = "0.10" 19 | derivative = "2.2.0" 20 | oneshot = "0.1" 21 | cpp = "0.5" 22 | sized-object-pool = "0.2" 23 | dynamic-pool = "0.2" 24 | once_cell = "1.7" 25 | ndarray = "0.15.3" 26 | serde = { version = "1", features = ["derive"] } 27 | scheduled-thread-pool = "0.2" 28 | serde_json = "1.0" 29 | ureq = "2.2" 30 | num-traits = "0.2" 31 | num-derive = "0.3" 32 | display_utils = "0.4.0" 33 | opentelemetry = { version = "0.15", features = ["serialize", "metrics"] } 34 | bagua-opentelemetry = { path = "../bagua-opentelemetry" } 35 | 36 | [dependencies.pyo3] 37 | version = "0.14.5" 38 | features = ["auto-initialize"] 39 | 40 | [build-dependencies] 41 | shadow-rs = "0.7" 42 | cmake = "0.1" 43 | cpp_build = "0.5" 44 | cc = "1.0" 45 | cmd_lib = "1.0" 46 | which = "4.2" 47 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | branches: 7 | - master 8 | 9 | name: pypi publish 10 | 11 | jobs: 12 | publish: 13 | runs-on: ubuntu-latest 14 | container: ikzk/bagua-ci:latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | with: 18 | fetch-depth: 0 19 | submodules: recursive 20 | - run: env BAGUA_NO_INSTALL_DEPS=1 python -m build -s 21 | - 
name: Publish a Python distribution to PyPI 22 | if: github.ref == 'refs/heads/master' && !startsWith(github.ref, 'refs/tags') 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | user: __token__ 26 | password: ${{ secrets.PYPI_API_TOKEN }} 27 | - name: Publish package 28 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 29 | uses: pypa/gh-action-pypi-publish@release/v1 30 | with: 31 | user: __token__ 32 | password: ${{ secrets.PYPI_API_TOKEN }} 33 | - name: Archive package artifacts 34 | uses: actions/upload-artifact@v2 35 | with: 36 | name: bagua-pypi-package 37 | path: | 38 | dist/* 39 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/exporter/agent.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Serialize, Deserialize, Clone, Debug, Hash)] 4 | pub struct BaguaSpan { 5 | pub trace_id: u128, 6 | pub action: String, 7 | pub tensor_name: String, 8 | pub start_time: u128, 9 | pub end_time: u128, 10 | } 11 | 12 | #[derive(Serialize, Deserialize, Clone, Debug, Hash)] 13 | pub struct BaguaBatch { 14 | pub spans: Vec, 15 | } 16 | 17 | #[derive(Debug)] 18 | pub struct AgentAsyncClientHTTP { 19 | server_addr: String, 20 | client: reqwest::Client, 21 | } 22 | 23 | impl AgentAsyncClientHTTP { 24 | pub fn new(server_addr: String) -> AgentAsyncClientHTTP { 25 | Self { 26 | server_addr: server_addr, 27 | client: reqwest::Client::new(), 28 | } 29 | } 30 | 31 | pub async fn emit_batch( 32 | &mut self, 33 | batch: BaguaBatch, 34 | ) -> Result { 35 | let uri = format!( 36 | "http://{}/api/v1/report_tensor_execution_order", 37 | self.server_addr 38 | ); 39 | 40 | let resp = self.client.post(uri).json(&batch).send().await?; 41 | 42 | Ok(resp) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/LICENSE.TXT: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the NVIDIA CORPORATION nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/reduce_by_key.cu: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | 5 | 6 | template < 7 | int BLOCK_THREADS, ///< Number of CTA threads 8 | typename KeyT, ///< Key type 9 | typename ValueT> ///< Value type 10 | __global__ void Kernel() 11 | { 12 | // Tuple type for scanning (pairs accumulated segment-value with segment-index) 13 | typedef cub::KeyValuePair OffsetValuePairT; 14 | 15 | // Reduce-value-by-segment scan operator 16 | typedef cub::ReduceBySegmentOp ReduceBySegmentOpT; 17 | 18 | // Parameterized BlockDiscontinuity type for setting head flags 19 | typedef cub::BlockDiscontinuity< 20 | KeyT, 21 | BLOCK_THREADS> 22 | BlockDiscontinuityKeysT; 23 | 24 | // Parameterized BlockScan type 25 | typedef cub::BlockScan< 26 | OffsetValuePairT, 27 | BLOCK_THREADS, 28 | cub::BLOCK_SCAN_WARP_SCANS> 29 | BlockScanT; 30 | 31 | // Shared memory 32 | __shared__ union TempStorage 33 | { 34 | typename BlockScanT::TempStorage scan; // Scan storage 35 | typename BlockDiscontinuityKeysT::TempStorage discontinuity; // Discontinuity storage 36 | } temp_storage; 37 | 38 | 39 | // Read data (each thread gets 3 items each, every 9 items is a segment) 40 | KeyT my_keys[3] = {threadIdx.x / 3, threadIdx.x / 3, threadIdx.x / 3}; 41 | ValueT my_values[3] = {1, 1, 1}; 42 | 43 | // Set head segment head flags 44 | int my_flags[3]; 45 | BlockDiscontinuityKeysT(temp_storage.discontinuity).FlagHeads( 46 | my_flags, 47 | my_keys, 48 | cub::Inequality()); 49 | 50 | __syncthreads(); 51 | 52 | 53 | 54 | 55 | 56 | 57 | } 58 | -------------------------------------------------------------------------------- /bagua-opentelemetry/src/exporter/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod agent; 2 | 3 | use crate::exporter::agent::{AgentAsyncClientHTTP, 
BaguaBatch, BaguaSpan}; 4 | use async_trait::async_trait; 5 | use opentelemetry::{sdk::export::trace, Key}; 6 | use reqwest::StatusCode; 7 | use std::time::UNIX_EPOCH; 8 | 9 | #[derive(Debug)] 10 | pub struct Exporter { 11 | pub uploader: AgentAsyncClientHTTP, 12 | } 13 | 14 | #[async_trait] 15 | impl trace::SpanExporter for Exporter { 16 | async fn export(&mut self, batch: Vec) -> trace::ExportResult { 17 | let mut bagua_spans = Vec::new(); 18 | for span in batch { 19 | let bagua_span = BaguaSpan { 20 | trace_id: span.span_context.trace_id().to_u128(), 21 | action: span.name.into_owned(), 22 | tensor_name: span 23 | .attributes 24 | .get(&Key::new("tensor_name")) 25 | .unwrap() 26 | .as_str() 27 | .to_string(), 28 | start_time: span 29 | .start_time 30 | .duration_since(UNIX_EPOCH) 31 | .unwrap() 32 | .as_millis(), 33 | end_time: span 34 | .end_time 35 | .duration_since(UNIX_EPOCH) 36 | .unwrap() 37 | .as_millis(), 38 | }; 39 | 40 | bagua_spans.push(bagua_span); 41 | } 42 | 43 | let resp = self 44 | .uploader 45 | .emit_batch(BaguaBatch { spans: bagua_spans }) 46 | .await; 47 | match resp { 48 | Ok(resp) => { 49 | if resp.status() != StatusCode::OK { 50 | tracing::warn!("upload bagua span failed, resp={:?}", resp); 51 | } 52 | } 53 | Err(err) => { 54 | tracing::warn!("upload bagua span failed, err={:?}", err); 55 | } 56 | } 57 | 58 | Ok(()) 59 | } 60 | 61 | fn shutdown(&mut self) {} 62 | } 63 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## [0.4.1] - 2021-08-14 4 | 5 | ### Features 6 | 7 | - Add opentelemetry to report tensor ready order (#42) 8 | 9 | 10 | ## [0.4.0] - 2021-07-23 11 | 12 | ### Features 13 | 14 | - Better debug log including tensor info when executing ops 15 | - Make full precision decentralized op stateless (#36) 16 | 17 | 18 | ## [0.3.1] - 2021-07-01 19 | 20 | ### Bug Fixes 21 | 22 | - Always 
mark bagua padding tensor as ready 23 | 24 | 25 | ## [0.3.0] - 2021-07-01 26 | 27 | ### Bug Fixes 28 | 29 | - Fix decompress incorrect pointer and typo in error msg 30 | - Fix python gil deadlock during getting data ptr 31 | 32 | ### Features 33 | 34 | - Replace NCCL with Aluminum (#7) 35 | - Support creating BaguaTensor by passing torch tensor directly (#19) 36 | - Compatible mode for getting pytorch tensor info with Python interpreter 37 | 38 | 39 | ## [0.2.0] - 2021-06-17 40 | 41 | ### Features 42 | 43 | - Initial support for python op (#2) 44 | 45 | 46 | ## [0.1.3] - 2021-06-17 47 | 48 | ### Bug Fixes 49 | 50 | - Move import bagua_install_library to install library function 51 | - Merge bagua_install_library and setup.py, remove nccl<=2.6 support 52 | 53 | 54 | ## [0.1.2] - 2021-06-17 55 | 56 | ### Features 57 | 58 | - Add version.py placeholder to prevent file not found error 59 | 60 | 61 | ## [0.1.1] - 2021-06-10 62 | 63 | ### Bug Fixes 64 | 65 | - Only run publish once on git tag 66 | 67 | ### Features 68 | 69 | - Install nccl deps in bagua core and add generated __version__ variable 70 | 71 | 72 | ## [0.1.0] - 2021-06-10 73 | 74 | ### Bug Fixes 75 | 76 | - Fix ci pypi versioning 77 | - Remove __init__.py and python __version__, use cargo version 78 | 79 | ### Features 80 | 81 | - Initial commit of bagua core impl 82 | - Add python packaging related files 83 | - Only publish pypi for master commits 84 | - Add __version__ variable 85 | 86 | 87 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #ifndef CUB_NS_PREFIX 41 | #define CUB_NS_PREFIX 42 | #endif 43 | 44 | #ifndef CUB_NS_POSTFIX 45 | #define CUB_NS_POSTFIX 46 | #endif 47 | -------------------------------------------------------------------------------- /bagua-core-c/src/lib.rs: -------------------------------------------------------------------------------- 1 | use bagua_core_internal::communicators::BaguaSingleCommunicator; 2 | use std::ffi::CStr; 3 | use std::os::raw::c_char; 4 | 5 | pub struct BaguaSingleCommunicatorC { 6 | inner: BaguaSingleCommunicator, 7 | } 8 | 9 | pub extern "C" fn bagua_single_communicator_c_create( 10 | rank: usize, 11 | nranks: usize, 12 | device_id: usize, 13 | stream_ptr: u64, 14 | nccl_unique_id_str: *const c_char, 15 | ) -> *mut BaguaSingleCommunicatorC { 16 | let obj = BaguaSingleCommunicatorC { 17 | inner: bagua_core_internal::communicators::BaguaSingleCommunicator::new( 18 | rank, 19 | nranks, 20 | device_id, 21 | stream_ptr, 22 | unsafe { CStr::from_ptr(nccl_unique_id_str).to_str().unwrap() }, 23 | ), 24 | }; 25 | 26 | // into_raw turns the Box into a *mut, which the borrow checker 27 | // ignores, without calling its destructor. 28 | Box::into_raw(Box::new(obj)) 29 | } 30 | 31 | pub extern "C" fn bagua_single_communicator_c_destroy(ptr: &mut *mut BaguaSingleCommunicatorC) { 32 | // First, we **must** check to see if the pointer is null. 33 | if ptr.is_null() { 34 | // Do nothing. 35 | return; 36 | } 37 | 38 | // Now we know the pointer is non-null, we can continue. from_raw is the 39 | // inverse of into_raw: it turns the *mut Dramatic back into a 40 | // Box. You must only call from_raw once per pointer. 
41 | let obj: Box = unsafe { Box::from_raw(*ptr) }; 42 | 43 | // We don't *have* to do anything else; once obj goes out of scope, it will 44 | // be dropped. I'm going to drop it explicitly, however, for clarity. 45 | drop(obj); 46 | 47 | // I am, however, going to null out the `ptr` we were passed just so the 48 | // calling code is less likely to accidentally re-use the pointer. 49 | *ptr = ::std::ptr::null_mut(); 50 | } 51 | 52 | /// Error code 53 | /// 0: success 54 | /// -1: null pointer 55 | pub extern "C" fn bagua_single_communicator_c_nranks( 56 | ptr: &mut *mut BaguaSingleCommunicatorC, 57 | nranks: *mut usize, 58 | ) -> i32 { 59 | // First, we **must** check to see if the pointer is null. 60 | if ptr.is_null() { 61 | // Do nothing. 62 | return -1; 63 | } 64 | 65 | unsafe { 66 | *nranks = (*(*ptr)).inner.nranks(); 67 | } 68 | return 0; 69 | } 70 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/centralized_full_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::BaguaCommunicator; 3 | use crate::datatypes::{BaguaBucket, BaguaReductionOp, BaguaTensorRaw, RawBaguaTensor}; 4 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 5 | use crate::BaguaCommOpChannels; 6 | use std::sync::Arc; 7 | 8 | #[derive(Debug)] 9 | pub struct CentralizedFullPrecisionSynchronous { 10 | pub communicator: BaguaCommunicator, 11 | /// whether divide world_size after allreduce sum op 12 | pub average: bool, 13 | pub scattergather: bool, 14 | } 15 | 16 | impl CommOpTrait for CentralizedFullPrecisionSynchronous { 17 | fn execute_background_communication( 18 | &self, 19 | bucket: Arc, 20 | _comm_op_channels: &BaguaCommOpChannels, 21 | ) { 22 | let bucket = bucket.inner.lock(); 23 | let stream_ptr = self.communicator.stream_ptr(); 24 | let mut communication_tensor = 
bucket.get_communication_tensor(stream_ptr, false, false); 25 | self.communicator.execute_communication( 26 | &mut communication_tensor, 27 | self.average, 28 | true, 29 | true, 30 | &mut |c, t| { 31 | tracing::debug!("internode communication started"); 32 | if self.scattergather { 33 | tracing::debug!("start alltoall"); 34 | c.alltoall_inplace(&mut t.raw); 35 | tracing::debug!("start reduce_sum"); 36 | if self.average { 37 | t.raw.reduce_mean_inplace(c.nranks, c.rank, c.stream_ptr); 38 | } else { 39 | t.raw.reduce_sum_inplace(c.nranks, c.rank, c.stream_ptr); 40 | } 41 | tracing::debug!("start allgather"); 42 | c.allgather_inplace(&mut t.raw); 43 | tracing::debug!("internode communication done") 44 | } else { 45 | tracing::debug!("start allreduce"); 46 | if self.average { 47 | c.allreduce_inplace(&mut t.raw, BaguaReductionOp::AVG); 48 | } else { 49 | c.allreduce_inplace(&mut t.raw, BaguaReductionOp::SUM); 50 | } 51 | tracing::debug!("internode communication done"); 52 | } 53 | }, 54 | ); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /bagua-core-internal/src/resource_pool/mod.rs: -------------------------------------------------------------------------------- 1 | use dynamic_pool::DynamicPool; 2 | use sized_object_pool::{DynamicReset, SizedAllocatable, SizedPool}; 3 | 4 | #[derive(Debug)] 5 | pub struct CudaMemory { 6 | pub ptr: u64, 7 | pub num_bytes: usize, 8 | } 9 | 10 | impl CudaMemory { 11 | pub fn new(bytes: usize) -> Self { 12 | let ptr = unsafe { 13 | cpp::cpp!([bytes as "size_t"] -> u64 as "void *" 14 | { 15 | int *ptr = 0; 16 | CUDACHECK(cudaMalloc(&ptr, bytes)); 17 | return ptr; 18 | }) 19 | }; 20 | Self { 21 | ptr, 22 | num_bytes: bytes, 23 | } 24 | } 25 | } 26 | 27 | impl Drop for CudaMemory { 28 | fn drop(&mut self) { 29 | let ptr = self.ptr; 30 | unsafe { 31 | cpp::cpp!([ptr as "void *"] 32 | { 33 | CUDACHECK(cudaFree(ptr)); 34 | }) 35 | }; 36 | } 37 | } 38 | 39 | impl SizedAllocatable for 
CudaMemory { 40 | fn new(size: usize) -> Self { 41 | Self::new(size) 42 | } 43 | 44 | fn size(&self) -> usize { 45 | self.num_bytes 46 | } 47 | } 48 | 49 | impl DynamicReset for CudaMemory { 50 | fn reset(&mut self) {} 51 | } 52 | 53 | pub static CUDA_DEVICE_MEMORY_POOL: once_cell::sync::Lazy>> = 54 | once_cell::sync::Lazy::new(|| { 55 | let mut pools = Vec::new(); 56 | for _ in 0..64 { 57 | pools.push(SizedPool::::new(0, 40, 2048)) 58 | } 59 | pools 60 | }); 61 | 62 | #[derive(Debug)] 63 | pub struct CudaEvent { 64 | pub event: u64, 65 | } 66 | 67 | impl CudaEvent { 68 | pub fn new() -> Self { 69 | let event = unsafe { 70 | cpp::cpp!([] -> u64 as "cudaEvent_t" 71 | { 72 | cudaEvent_t event = 0; 73 | CUDACHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); 74 | return event; 75 | }) 76 | }; 77 | Self { event } 78 | } 79 | } 80 | 81 | impl Drop for CudaEvent { 82 | fn drop(&mut self) { 83 | let event = self.event; 84 | unsafe { 85 | cpp::cpp!([event as "cudaEvent_t"] 86 | { 87 | CUDACHECK(cudaEventDestroy(event)); 88 | }) 89 | }; 90 | } 91 | } 92 | 93 | impl DynamicReset for CudaEvent { 94 | fn reset(&mut self) {} 95 | } 96 | 97 | pub static CUDA_EVENT_POOL: once_cell::sync::Lazy> = 98 | once_cell::sync::Lazy::new(|| DynamicPool::new(0, 10, CudaEvent::new)); 99 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/centralized_low_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::BaguaCommunicator; 3 | use crate::datatypes::{BaguaBucket, BaguaTensorRaw, RawBaguaTensor, TensorCompressionMethod}; 4 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 5 | use crate::BaguaCommOpChannels; 6 | use std::sync::Arc; 7 | 8 | #[derive(Debug)] 9 | pub struct CentralizedLowPrecisionSynchronous { 10 | pub communicator: BaguaCommunicator, 11 | /// whether divide world_size after 
allreduce sum op 12 | pub average: bool, 13 | pub compression_method: TensorCompressionMethod, 14 | } 15 | 16 | impl CommOpTrait for CentralizedLowPrecisionSynchronous { 17 | fn execute_background_communication( 18 | &self, 19 | bucket: Arc, 20 | _comm_op_channels: &BaguaCommOpChannels, 21 | ) { 22 | let bucket = bucket.inner.lock(); 23 | let stream_ptr = self.communicator.stream_ptr(); 24 | let mut communication_tensor = bucket.get_communication_tensor(stream_ptr, false, false); 25 | self.communicator.execute_communication( 26 | &mut communication_tensor, 27 | self.average, 28 | true, 29 | true, 30 | &mut |c, t| { 31 | tracing::debug!("start compress"); 32 | let mut compressed_tensor = t 33 | .raw 34 | .compress(&self.compression_method, c.nranks, c.stream_ptr, -1) 35 | .expect("cannot compress tensor"); 36 | tracing::debug!("start alltoall"); 37 | c.alltoall_inplace(compressed_tensor.as_mut()); 38 | tracing::debug!("start decompress"); 39 | t.raw.decompress_from( 40 | &self.compression_method, 41 | c.nranks, 42 | compressed_tensor.as_ref(), 43 | c.stream_ptr, 44 | ); 45 | tracing::debug!("start reduce_sum"); 46 | if self.average { 47 | t.raw.reduce_mean_inplace(c.nranks, c.rank, c.stream_ptr); 48 | } else { 49 | t.raw.reduce_sum_inplace(c.nranks, c.rank, c.stream_ptr); 50 | } 51 | tracing::debug!("start compress"); 52 | let mut compressed_tensor = t 53 | .raw 54 | .compress( 55 | &self.compression_method, 56 | c.nranks, 57 | c.stream_ptr, 58 | c.rank as _, 59 | ) 60 | .expect("cannot compress tensor"); 61 | tracing::debug!("start allgather"); 62 | c.allgather_inplace(compressed_tensor.as_mut()); 63 | tracing::debug!("start decompress"); 64 | t.raw.decompress_from( 65 | &self.compression_method, 66 | c.nranks, 67 | compressed_tensor.as_ref(), 68 | c.stream_ptr, 69 | ); 70 | tracing::debug!("internode communication done"); 71 | }, 72 | ); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- 
/bagua-core-internal/third_party/cub-1.8.0/cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. 
All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | #include 
"iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_arch.cuh" 90 | #include "util_debug.cuh" 91 | #include "util_device.cuh" 92 | #include "util_macro.cuh" 93 | #include "util_ptx.cuh" 94 | #include "util_type.cuh" 95 | 96 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 
1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_full_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::CommOpTrait; 2 | use crate::communicators::{BaguaCommunicator, BaguaHierarchicalCommunicator, NCCLGroupGuard}; 3 | use crate::datatypes::{ 4 | BaguaBucket, BaguaReductionOp, BaguaTensor, BaguaTensorRaw, RawBaguaTensor, 5 | }; 6 | use crate::events::BaguaEventChannel; 7 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 8 | use crate::{BaguaCommOpChannels, BaguaScheduledCommOp}; 9 | use parking_lot::Mutex; 10 | use std::sync::Arc; 11 | 12 | #[derive(Clone, Debug)] 13 | pub enum PeerSelectionMode { 14 | All, 15 | ShiftOne, 16 | Ring, 17 | } 18 | 19 | #[derive(Debug)] 20 | pub struct DecentralizedFullPrecisionSynchronous { 21 | pub communicator: BaguaCommunicator, 22 | pub peer_selection_mode: PeerSelectionMode, 23 | pub step: Mutex, 24 | pub peer_weight: BaguaTensor, 25 | } 26 | 27 | impl CommOpTrait for DecentralizedFullPrecisionSynchronous { 28 | fn execute_background_communication( 29 | &self, 30 | bucket: Arc, 31 | comm_op_channels: &BaguaCommOpChannels, 32 | ) { 33 | let bucket_guard = bucket.inner.lock(); 34 | let stream_ptr = self.communicator.stream_ptr(); 35 | 36 | let mut communication_tensor = match &self.communicator { 37 | BaguaCommunicator::SingleCommunicator(_) => { 38 | bucket_guard.get_communication_tensor(stream_ptr, false, false) 39 | } 40 | BaguaCommunicator::HierarchicalCommunicator(x) => match x { 41 | BaguaHierarchicalCommunicator::Leader(_) => { 42 | bucket_guard.get_communication_tensor(stream_ptr, true, true) 43 | } 44 | BaguaHierarchicalCommunicator::Worker(_) => { 45 | bucket_guard.get_communication_tensor(stream_ptr, false, 
false) 46 | } 47 | }, 48 | }; 49 | 50 | let peer_mode = &self.peer_selection_mode; 51 | 52 | let mut peer_guard = self.peer_weight.inner.write(); 53 | let mut peer_tensor = peer_guard.raw.as_mut(); 54 | let step = { *self.step.lock() } as i64; 55 | 56 | self.communicator.execute_communication( 57 | &mut communication_tensor, 58 | true, 59 | true, 60 | false, 61 | &mut |c, t| { 62 | match peer_mode { 63 | PeerSelectionMode::All => { 64 | { 65 | peer_tensor.clone_from(&t.raw, c.stream_ptr); 66 | let _guard = NCCLGroupGuard::new(); 67 | c.allreduce_inplace(peer_tensor, BaguaReductionOp::AVG); 68 | } 69 | } 70 | PeerSelectionMode::ShiftOne => { 71 | assert_eq!( 72 | c.nranks % 2, 73 | 0, 74 | "you cannot use decentralized algorithm with average_all off when there are odd number of ranks, current n_ranks {}", 75 | c.nranks 76 | ); 77 | let rank = c.rank as i64; 78 | let nranks = c.nranks as i64; 79 | let peer_rank = if c.rank < c.nranks / 2 { 80 | ((step + rank) % ((nranks + 1) / 2)) + (nranks / 2) 81 | } else { 82 | (rank - (nranks / 2) - step).rem_euclid(nranks / 2) 83 | } as i32; 84 | tracing::debug!("rank {} peer_rank {}", c.rank, peer_rank); 85 | { 86 | let _guard = NCCLGroupGuard::new(); 87 | c.send(&t.raw, peer_rank); 88 | c.recv(peer_tensor, peer_rank); 89 | } 90 | peer_tensor.average_inplace(&t.raw, c.stream_ptr); 91 | }, 92 | PeerSelectionMode::Ring => { 93 | unimplemented!() 94 | }, 95 | } 96 | }, 97 | ); 98 | 99 | *self.step.lock() += 1; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /bagua-core-internal/build.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let nvcc_path = which::which("nvcc") 3 | .expect("Cannot find nvcc, please install CUDA Toolkit and make sure nvcc is in your PATH first. 
See https://developer.nvidia.com/cuda-downloads"); 4 | let cuda_home = nvcc_path 5 | .parent() 6 | .expect("cannot find nvcc parent directory") 7 | .parent() 8 | .expect("cannot find nvcc parent directory") 9 | .display(); 10 | let supported_sms = cmd_lib::run_fun!( 11 | bash -c "nvcc --help | sed -n -e '/gpu-architecture /,/gpu-code / p' | sed -n -e '/Allowed values/,/gpu-code / p' | grep -i sm_ | grep -Eo 'sm_[0-9]+' | sed -e s/sm_//g | sort -g -u | tr '\n' ' '" 12 | ).unwrap(); 13 | let supported_sms = supported_sms.strip_suffix(' ').unwrap().split(' '); 14 | let mut cuda_cc = cc::Build::new(); 15 | cuda_cc 16 | .cuda(true) 17 | .include("cpp/include") 18 | .include("third_party/cub-1.8.0") 19 | .include("../python/bagua_core/.data/include") 20 | .flag("-std=c++14") 21 | .flag("-cudart=shared"); 22 | 23 | if std::env::var("PROFILE").unwrap() == "release" { 24 | for sm in supported_sms { 25 | cuda_cc 26 | .flag("-gencode") 27 | .flag(format!("arch=compute_{},code=sm_{}", sm, sm).as_str()); 28 | } 29 | } 30 | cuda_cc 31 | .file("kernels/bagua_kernels.cu") 32 | .compile("libbagua_kernels.a"); 33 | 34 | let third_party_path = std::env::current_dir().unwrap(); 35 | let bagua_data_path = std::env::current_dir().unwrap(); 36 | let third_party_path = third_party_path.join("third_party"); 37 | let bagua_data_path = bagua_data_path.join("../python/bagua_core/.data"); 38 | let _al_builder = cmake::Config::new("third_party/Aluminum") 39 | .define("ALUMINUM_ENABLE_NCCL", "YES") 40 | .define("CUB_INCLUDE_PATH", third_party_path.join("cub-1.8.0")) 41 | .define("NCCL_LIBRARY", bagua_data_path.join("lib/libnccl.so")) 42 | .define("NCCL_INCLUDE_PATH", bagua_data_path.join("include")) 43 | .define("BUILD_SHARED_LIBS", "off") 44 | .out_dir(bagua_data_path.as_path().to_str().unwrap()) 45 | .always_configure(true) 46 | .build(); 47 | 48 | let mut cpp_builder = cpp_build::Config::new(); 49 | cpp_builder.include(format!("{}/include", cuda_home)); 50 | 
cpp_builder.include("cpp/include"); 51 | let mpi_include_dirs = cmd_lib::run_fun!(bash -c "mpicxx --showme:incdirs").unwrap(); 52 | let mpi_include_dirs: Vec<&str> = mpi_include_dirs.split(' ').collect(); 53 | for mpi_include_dir in mpi_include_dirs.iter() { 54 | cpp_builder.include(mpi_include_dir); 55 | } 56 | cpp_builder.include(third_party_path.join("cub-1.8.0")); 57 | cpp_builder.include(bagua_data_path.join("include")); 58 | cpp_builder.build("src/lib.rs"); 59 | 60 | let mpi_lib_dirs = cmd_lib::run_fun!(bash -c "mpicxx --showme:libdirs").unwrap(); 61 | let mpi_lib_dirs: Vec<&str> = mpi_lib_dirs.split(' ').collect(); 62 | for mpi_lib_dir in mpi_lib_dirs.iter() { 63 | println!("cargo:rustc-link-search={}", mpi_lib_dir); 64 | } 65 | println!( 66 | "cargo:rustc-link-search=native={}", 67 | format!("{}/lib64", cuda_home) 68 | ); 69 | println!( 70 | "cargo:rustc-link-search={}", 71 | bagua_data_path.join("lib").as_path().to_str().unwrap() 72 | ); 73 | println!( 74 | "cargo:rustc-link-search={}", 75 | bagua_data_path.join("lib64").as_path().to_str().unwrap() 76 | ); 77 | println!("cargo:rustc-link-lib=static=Al"); 78 | println!("cargo:rustc-link-lib=mpi"); 79 | println!("cargo:rustc-link-lib=nccl"); 80 | println!("cargo:rustc-link-lib=cudart"); 81 | println!("cargo:rustc-link-lib=nvrtc"); 82 | println!("cargo:rustc-link-lib=cuda"); 83 | println!("cargo:rerun-if-env-changed=CUDA_HOME"); 84 | println!("cargo:rerun-if-changed=src/"); 85 | println!("cargo:rerun-if-changed=kernels/"); 86 | println!("cargo:rerun-if-changed=build.rs"); 87 | 88 | // bindgen --allowlist-type '.*TensorImpl.*' --enable-cxx-namespaces --ignore-functions --ignore-methods --size_t-is-usize --default-enum-style=rust --opaque-type 'std.*' --opaque-type 'c10::optional.*' wrapper.h -- -x c++ -std=c++14 > src/torch_ffi.rs 89 | shadow_rs::new().unwrap(); 90 | } 91 | -------------------------------------------------------------------------------- /bagua-core-internal/src/kernels/mod.rs: 
--------------------------------------------------------------------------------
//! Raw FFI declarations for the CUDA kernels compiled from
//! `kernels/bagua_kernels.cu` and linked in as the static library
//! `bagua_kernels` (see the crate's build.rs).
//!
//! Conventions shared by every declaration below:
//! - data buffers are type-erased `c_void` pointers; the `_f32` / `_f16`
//!   marker in each symbol name indicates the element type the kernel
//!   operates on;
//! - `N` and `*_num_element` count elements, `*_size_bytes` count bytes;
//! - `stream` is an opaque handle forwarded to the kernel launch
//!   (presumably a `cudaStream_t` — TODO confirm against bagua_kernels.cu).
//!
//! NOTE(review): per-function summaries are inferred from symbol and
//! parameter names; confirm exact semantics against the .cu implementations.
use std::ffi::c_void;

#[link(name = "bagua_kernels", kind = "static")]
extern "C" {
    /// In-place `x[i] /= D_` over `N` f32 elements.
    pub fn divide_inplace_f32_host(x: *mut c_void, D_: f32, N: i32, stream: *const c_void);
    /// f16 variant of the in-place divide (the divisor `D_` stays f32).
    pub fn divide_inplace_f16_host(x: *mut c_void, D_: f32, N: i32, stream: *const c_void);
    /// In-place average of `x` with `y` over `N` f32 elements.
    pub fn average_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place average.
    pub fn average_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    // NOTE: "substract" [sic] — the misspelling is part of the exported C
    // symbol name and must match the kernel side; do not rename it here alone.
    /// In-place subtraction of `y` from `x` over `N` f32 elements.
    pub fn substract_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place subtraction.
    pub fn substract_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        stream: *const c_void,
    );
    /// In-place addition of `y` into `x` over `N` f32 elements.
    pub fn add_inplace_f32_host(x: *mut c_void, y: *const c_void, N: i32, stream: *const c_void);
    /// f16 variant of the in-place addition.
    pub fn add_inplace_f16_host(x: *mut c_void, y: *const c_void, N: i32, stream: *const c_void);
    /// In-place add of `y` scaled by `factor` into `x` (f32).
    pub fn addmul_inplace_f32_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        factor: f32,
        stream: *const c_void,
    );
    /// f16 variant of the scaled in-place add (`factor` stays f32).
    pub fn addmul_inplace_f16_host(
        x: *mut c_void,
        y: *const c_void,
        N: i32,
        factor: f32,
        stream: *const c_void,
    );
    /// In-place mean reduction over `num_chunks` chunks of `chunk_size` f32
    /// elements, targeting chunk index `target_chunk`.
    pub fn reduce_mean_f32_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place mean reduction.
    pub fn reduce_mean_f16_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// In-place sum reduction over `num_chunks` chunks of `chunk_size` f32
    /// elements, targeting chunk index `target_chunk`.
    pub fn reduce_sum_f32_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// f16 variant of the in-place sum reduction.
    pub fn reduce_sum_f16_inplace_host(
        input: *mut c_void,
        chunk_size: i32,
        num_chunks: i32,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// temp_buffer size is the same as decompressed tensor
    /// target_chunk = -1 means compressing all chunks
    pub fn compress_f32_to_uint8_host(
        input: *mut c_void,
        input_num_element: i32,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        output_size_bytes: usize,
        temp_buffer: *mut c_void,
        temp_buffer_size_bytes: usize,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// Inverse of `compress_f32_to_uint8_host`: expands uint8 data back to f32.
    pub fn decompress_uint8_to_f32_host(
        input: *mut c_void,
        input_size_bytes: usize,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        stream: *const c_void,
    );
    /// f16 analogue of `compress_f32_to_uint8_host` (same buffer contract:
    /// `temp_buffer` sized like the decompressed tensor, `target_chunk = -1`
    /// compresses all chunks).
    pub fn compress_f16_to_uint8_host(
        input: *mut c_void,
        input_num_element: i32,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        output_size_bytes: usize,
        temp_buffer: *mut c_void,
        temp_buffer_size_bytes: usize,
        target_chunk: i32,
        stream: *const c_void,
    );
    /// Inverse of `compress_f16_to_uint8_host`: expands uint8 data back to f16.
    pub fn decompress_uint8_to_f16_host(
        input: *mut c_void,
        input_size_bytes: usize,
        chunk_size: i32,
        num_chunks: i32,
        output: *mut c_void,
        stream: *const c_void,
    );
    /// Min/max pass over `input_num_element` f32 values; returns a size in
    /// bytes (presumably the scratch size needed — confirm against the .cu).
    pub fn array_min_max_size_f32_host(
        input: *mut c_void,
        input_num_element: i32,
        output: *mut c_void,
        stream: *const c_void,
    ) -> usize;
    /// f16 variant of `array_min_max_size_f32_host`.
    pub fn array_min_max_size_f16_host(
        input: *mut c_void,
        input_num_element: i32,
        output: *mut c_void,
        stream: *const c_void,
    ) -> usize;
    /// Asynchronous model-average update combining `tensor` with the reduced
    /// and local copies, scaled by `nranks`, over `N` elements.
    pub fn async_model_average_host(
        tensor: *mut c_void,
        reduced_tensor_copy: *const c_void,
        tensor_copy: *const c_void,
        nranks: f32,
        N: i32,
        stream: *const c_void,
    );
}
--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_cub.h:
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | using namespace cub; 31 | 32 | template < 33 | int NUM_CHANNELS, 34 | int ACTIVE_CHANNELS, 35 | int NUM_BINS, 36 | typename PixelType> 37 | double run_cub_histogram( 38 | PixelType *d_image, 39 | int width, 40 | int height, 41 | unsigned int *d_hist, 42 | bool is_warmup) 43 | { 44 | enum { 45 | is_float = Equals::VALUE, 46 | }; 47 | 48 | typedef typename If::Type SampleT; // Sample type 49 | typedef typename If::Type LevelT; // Level type (uint32 for uchar) 50 | 51 | // Setup data structures 52 | unsigned int* d_histogram[ACTIVE_CHANNELS]; 53 | int num_levels[ACTIVE_CHANNELS]; ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. 54 | LevelT lower_level[ACTIVE_CHANNELS]; ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. 55 | LevelT upper_level[ACTIVE_CHANNELS]; ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 56 | 57 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 58 | { 59 | d_histogram[CHANNEL] = d_hist + (CHANNEL * NUM_BINS); 60 | num_levels[CHANNEL] = NUM_BINS + 1; 61 | lower_level[CHANNEL] = 0; 62 | upper_level[CHANNEL] = (is_float) ? 
1 : 256; 63 | } 64 | 65 | // Allocate temporary storage 66 | size_t temp_storage_bytes = 0; 67 | void *d_temp_storage = NULL; 68 | 69 | SampleT* d_image_samples = (SampleT*) d_image; 70 | 71 | // Get amount of temporary storage needed 72 | DeviceHistogram::MultiHistogramEven( 73 | d_temp_storage, 74 | temp_storage_bytes, 75 | d_image_samples, 76 | d_histogram, 77 | num_levels, 78 | lower_level, 79 | upper_level, 80 | width * height, 81 | (cudaStream_t) 0, 82 | is_warmup); 83 | 84 | cudaMalloc(&d_temp_storage, temp_storage_bytes); 85 | 86 | GpuTimer gpu_timer; 87 | gpu_timer.Start(); 88 | 89 | // Compute histogram 90 | DeviceHistogram::MultiHistogramEven( 91 | d_temp_storage, 92 | temp_storage_bytes, 93 | d_image_samples, 94 | d_histogram, 95 | num_levels, 96 | lower_level, 97 | upper_level, 98 | width * height, 99 | (cudaStream_t) 0, 100 | is_warmup); 101 | 102 | gpu_timer.Stop(); 103 | float elapsed_millis = gpu_timer.ElapsedMillis(); 104 | 105 | cudaFree(d_temp_storage); 106 | 107 | return elapsed_millis; 108 | } 109 | 110 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 
 */

#pragma once

#include "../util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule
 * @{
 */


/******************************************************************************
 * Mapping policies
 *****************************************************************************/


/**
 * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 */
enum GridMappingStrategy
{
    /**
     * \brief A "raking" access pattern in which each thread block is
     * assigned a consecutive sequence of input tiles
     *
     * \par Overview
     * The input is evenly partitioned into \p p segments, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each segment is comprised of
     * consecutive tiles, where a tile is a small, constant-sized unit of input
     * to be processed to completion before the thread block terminates or
     * obtains more work.  The kernel invokes \p p thread blocks, each
     * of which iteratively consumes a segment of n/p elements
     * in tile-size increments.
     */
    GRID_MAPPING_RAKE,

    /**
     * \brief A "strip mining" access pattern in which the input tiles assigned
     * to each thread block are separated by a stride equal to the extent of
     * the grid.
     *
     * \par Overview
     * The input is evenly partitioned into \p p sets, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each set is comprised of
     * data tiles separated by stride \p tiles, where a tile is a small,
     * constant-sized unit of input to be processed to completion before the
     * thread block terminates or obtains more work.  The kernel invokes \p p
     * thread blocks, each of which iteratively consumes a segment of
     * n/p elements in tile-size increments.
     */
    GRID_MAPPING_STRIP_MINE,

    /**
     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
     *
     * \par Overview
     * The input is treated as a queue to be dynamically consumed by a grid of
     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
     * unit of input to be processed to completion before the thread block
     * terminates or obtains more work.  The grid size \p p is constant,
     * loosely corresponding to the number of thread blocks that may actively
     * reside on the target device.
     */
    GRID_MAPPING_DYNAMIC,
};


/** @} */       // end group GridModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/host/mutex.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define NOMINMAX 45 | #include 46 | #undef WIN32_LEAN_AND_MEAN 47 | #undef NOMINMAX 48 | 49 | /** 50 | * Compiler read/write barrier 51 | */ 52 | #pragma intrinsic(_ReadWriteBarrier) 53 | 54 | #endif 55 | #endif 56 | 57 | #include "../util_namespace.cuh" 58 | 59 | 60 | /// Optional outer namespace(s) 61 | CUB_NS_PREFIX 62 | 63 | /// CUB namespace 64 | namespace cub { 65 | 66 | 67 | /** 68 | * Simple portable mutex 69 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 70 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 71 | */ 72 | struct Mutex 73 | { 74 | #if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) 75 | 76 | std::mutex mtx; 77 | 78 | void Lock() 79 | { 80 | mtx.lock(); 81 | } 82 | 83 | void Unlock() 84 | { 85 | mtx.unlock(); 86 | } 87 | 88 | void TryLock() 89 | { 90 | mtx.try_lock(); 91 | } 92 | 93 | #else //__cplusplus > 199711L 94 | 95 | #if defined(_MSC_VER) 96 | 97 | // Microsoft VC++ 98 | typedef long Spinlock; 99 | 100 | #else 101 | 102 | // GNU g++ 103 | typedef int Spinlock; 104 | 105 | /** 106 | * Compiler read/write barrier 107 | */ 108 | __forceinline__ void _ReadWriteBarrier() 109 | { 110 | __sync_synchronize(); 111 | } 112 | 113 | /** 114 | * Atomic exchange 115 | */ 116 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 117 | { 118 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 119 | _ReadWriteBarrier(); 120 | return __sync_lock_test_and_set(Target, Value); 121 
| } 122 | 123 | /** 124 | * Pause instruction to prevent excess processor bus usage 125 | */ 126 | __forceinline__ void YieldProcessor() 127 | { 128 | } 129 | 130 | #endif // defined(_MSC_VER) 131 | 132 | /// Lock member 133 | volatile Spinlock lock; 134 | 135 | /** 136 | * Constructor 137 | */ 138 | Mutex() : lock(0) {} 139 | 140 | /** 141 | * Return when the specified spinlock has been acquired 142 | */ 143 | __forceinline__ void Lock() 144 | { 145 | while (1) 146 | { 147 | if (!_InterlockedExchange(&lock, 1)) return; 148 | while (lock) YieldProcessor(); 149 | } 150 | } 151 | 152 | 153 | /** 154 | * Release the specified spinlock 155 | */ 156 | __forceinline__ void Unlock() 157 | { 158 | _ReadWriteBarrier(); 159 | lock = 0; 160 | } 161 | 162 | #endif // __cplusplus > 199711L 163 | 164 | }; 165 | 166 | 167 | 168 | 169 | } // CUB namespace 170 | CUB_NS_POSTFIX // Optional outer namespace(s) 171 | 172 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 
12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# *
#******************************************************************************/

#-------------------------------------------------------------------------------
#
# Makefile usage
#
# make <target> [sm=<XXX,...>] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] [quicktest=<0|1>]
#
#-------------------------------------------------------------------------------

include ../common.mk

#-------------------------------------------------------------------------------
# Commandline Options
#-------------------------------------------------------------------------------

# [mkl=<0|1>] compile against Intel MKL
ifeq ($(mkl), 1)
	DEFINES += -DCUB_MKL

	ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
		LIBS += mkl_intel_lp64.lib mkl_intel_thread.lib mkl_core.lib libiomp5md.lib
		NVCCFLAGS += -Xcompiler /openmp
	else
		LIBS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5 -lpthread -lm
		NVCCFLAGS += -Xcompiler -fopenmp

	endif

endif


#-------------------------------------------------------------------------------
# Compiler and compilation platform
#-------------------------------------------------------------------------------

# Includes
INC += -I$(CUB_DIR) -I$(CUB_DIR)test

# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

#-------------------------------------------------------------------------------
# Dependency Lists
#-------------------------------------------------------------------------------

# NOTE(review): exp_rwildcard recurses via "rwildcard" (not itself) —
# presumably rwildcard is defined in ../common.mk; confirm before relying on
# exp_rwildcard being used anywhere.
exp_rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))

EXP_DEPS = 	$(call rwildcard, ./,*.cuh) \
			$(call rwildcard, ./,*.h)

DEPS = 		$(CUB_DEPS) \
			$(EXP_DEPS) \
			$(CUB_DIR)test/Makefile \
			$(CUB_DIR)test/test_util.h \
			$(CUB_DIR)test/mersenne.h \



#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------

default:


#-------------------------------------------------------------------------------
# make clean
#-------------------------------------------------------------------------------

clean :
	rm -f bin/*$(CPU_ARCH_SUFFIX)*
	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o



#-------------------------------------------------------------------------------
# make histogram_compare
#-------------------------------------------------------------------------------

histogram_compare: bin/histogram_compare_$(BIN_SUFFIX)

bin/histogram_compare_$(BIN_SUFFIX) : histogram_compare.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/histogram_compare_$(BIN_SUFFIX) histogram_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3



#-------------------------------------------------------------------------------
# make spmv_compare
#-------------------------------------------------------------------------------

spmv_compare: bin/spmv_compare_$(BIN_SUFFIX)

bin/spmv_compare_$(BIN_SUFFIX) : spmv_compare.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/spmv_compare_$(BIN_SUFFIX) spmv_compare.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse $(MKL_LIBS) -O3


--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/thread/thread_search.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011,
Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if 
(input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | (void)filename; 74 | (void)line; 75 | #ifdef CUB_STDERR 76 | if (error) 77 | { 78 | #if (CUB_PTX_ARCH == 0) 79 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 80 | fflush(stderr); 81 | #elif (CUB_PTX_ARCH >= 200) 82 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 83 | #endif 84 | } 85 | #endif 86 | return error; 87 | } 88 | 89 | 90 | /** 91 | * \brief Debug macro 92 | */ 93 | #ifndef CubDebug 94 | #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) 95 | #endif 96 | 97 | 98 | /** 99 | * \brief Debug macro with exit 100 | */ 101 | #ifndef CubDebugExit 102 | #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } 103 | #endif 104 | 105 | 106 | /** 107 | * \brief Log macro for printf statements. 108 | */ 109 | #if !defined(_CubLog) 110 | #if !(defined(__clang__) && defined(__CUDA__)) 111 | #if (CUB_PTX_ARCH == 0) 112 | #define _CubLog(format, ...) 
printf(format,__VA_ARGS__); 113 | #elif (CUB_PTX_ARCH >= 200) 114 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 115 | #endif 116 | #else 117 | // XXX shameless hack for clang around variadic printf... 118 | // Compiles w/o supplying -std=c++11 but shows warning, 119 | // so we silence them :) 120 | #pragma clang diagnostic ignored "-Wc++11-extensions" 121 | #pragma clang diagnostic ignored "-Wunnamed-type-template-args" 122 | template <class... Args> 123 | inline __host__ __device__ void va_printf(char const* format, Args const&... args) 124 | { 125 | #ifdef __CUDA_ARCH__ 126 | printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); 127 | #else 128 | printf(format, args...); 129 | #endif 130 | } 131 | #ifndef __CUDA_ARCH__ 132 | #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); 133 | #else 134 | #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); 135 | #endif 136 | #endif 137 | #endif 138 | 139 | 140 | 141 | 142 | /** @} */ // end group UtilMgmt 143 | 144 | } // CUB namespace 145 | CUB_NS_POSTFIX // Optional outer namespace(s) 146 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/block/Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | # * 27 | #******************************************************************************/ 28 | 29 | #------------------------------------------------------------------------------- 30 | # 31 | # Makefile usage 32 | # 33 | # make [sm=] [cdp=<0|1>] [force32=<0|1>] [abi=<0|1>] [open64=<0|1>] [verbose=<0|1>] [keep=<0|1>] 34 | # 35 | #------------------------------------------------------------------------------- 36 | 37 | include ../../common.mk 38 | 39 | 40 | #------------------------------------------------------------------------------- 41 | # Includes 42 | #------------------------------------------------------------------------------- 43 | 44 | INC += -I$(CUB_DIR) -I$(CUB_DIR)test 45 | 46 | 47 | 48 | #------------------------------------------------------------------------------- 49 | # Dependency Lists 50 | #------------------------------------------------------------------------------- 51 | 52 | rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) 53 | 54 | DEPS = $(CUB_DEPS) \ 55 | $(CUB_DIR)test/Makefile \ 56 | $(CUB_DIR)test/test_util.h \ 57 | $(CUB_DIR)test/mersenne.h \ 58 | 59 | ALL = example_block_radix_sort \ 60 | example_block_reduce \ 61 | example_block_scan 62 | 63 | 64 | 65 | #------------------------------------------------------------------------------- 66 | # make default 67 | #------------------------------------------------------------------------------- 68 | 69 | default: 70 | 71 | 72 | #------------------------------------------------------------------------------- 73 | # make clean 74 | #------------------------------------------------------------------------------- 75 | 76 | clean : 77 | rm -f bin/*$(CPU_ARCH_SUFFIX)* 78 | rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o 79 | 80 | 81 | #------------------------------------------------------------------------------- 82 | # make all 83 | #------------------------------------------------------------------------------- 84 | 85 | 
all : $(ALL) 86 | 87 | #------------------------------------------------------------------------------- 88 | # make run 89 | #------------------------------------------------------------------------------- 90 | 91 | run : 92 | for i in $(ALL); do ./bin/$${i}_$(BIN_SUFFIX) --device=$(device) || exit 1; done 93 | 94 | 95 | 96 | 97 | #------------------------------------------------------------------------------- 98 | # make example_block_reduce 99 | #------------------------------------------------------------------------------- 100 | 101 | example_block_reduce: bin/example_block_reduce_$(BIN_SUFFIX) 102 | 103 | bin/example_block_reduce_$(BIN_SUFFIX) : example_block_reduce.cu $(DEPS) 104 | mkdir -p bin 105 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_reduce_$(BIN_SUFFIX) example_block_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 106 | 107 | 108 | #------------------------------------------------------------------------------- 109 | # make example_block_scan 110 | #------------------------------------------------------------------------------- 111 | 112 | example_block_scan: bin/example_block_scan_$(BIN_SUFFIX) 113 | 114 | bin/example_block_scan_$(BIN_SUFFIX) : example_block_scan.cu $(DEPS) 115 | mkdir -p bin 116 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_scan_$(BIN_SUFFIX) example_block_scan.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 117 | 118 | 119 | #------------------------------------------------------------------------------- 120 | # make example_block_radix_sort 121 | #------------------------------------------------------------------------------- 122 | 123 | example_block_radix_sort: bin/example_block_radix_sort_$(BIN_SUFFIX) 124 | 125 | bin/example_block_radix_sort_$(BIN_SUFFIX) : example_block_radix_sort.cu $(DEPS) 126 | mkdir -p bin 127 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/example_block_radix_sort_$(BIN_SUFFIX) example_block_radix_sort.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 128 | 129 | 
-------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/test_grid_barrier.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Test evaluation for software global barrier throughput 31 | ******************************************************************************/ 32 | 33 | // Ensure printing of CUDA runtime errors to console 34 | #define CUB_STDERR 35 | 36 | #include 37 | 38 | #include 39 | 40 | #include "test_util.h" 41 | 42 | using namespace cub; 43 | 44 | 45 | //--------------------------------------------------------------------- 46 | // Test kernels 47 | //--------------------------------------------------------------------- 48 | 49 | /** 50 | * Kernel that iterates through the specified number of software global barriers 51 | */ 52 | __global__ void Kernel( 53 | GridBarrier global_barrier, 54 | int iterations) 55 | { 56 | for (int i = 0; i < iterations; i++) 57 | { 58 | global_barrier.Sync(); 59 | } 60 | } 61 | 62 | 63 | //--------------------------------------------------------------------- 64 | // Main 65 | //--------------------------------------------------------------------- 66 | 67 | /** 68 | * Main 69 | */ 70 | int main(int argc, char** argv) 71 | { 72 | cudaError_t retval = cudaSuccess; 73 | 74 | // Defaults 75 | int iterations = 10000; 76 | int block_size = 128; 77 | int grid_size = -1; 78 | 79 | // Initialize command line 80 | CommandLineArgs args(argc, 
argv); 81 | 82 | // Get args 83 | args.GetCmdLineArgument("i", iterations); 84 | args.GetCmdLineArgument("grid-size", grid_size); 85 | args.GetCmdLineArgument("block-size", block_size); 86 | 87 | // Print usage 88 | if (args.CheckCmdLineFlag("help")) 89 | { 90 | printf("%s " 91 | "[--device=]" 92 | "[--i=]" 93 | "[--grid-size]" 94 | "[--block-size]" 95 | "\n", argv[0]); 96 | exit(0); 97 | } 98 | 99 | // Initialize device 100 | CubDebugExit(args.DeviceInit()); 101 | 102 | // Get device ordinal 103 | int device_ordinal; 104 | CubDebugExit(cudaGetDevice(&device_ordinal)); 105 | 106 | // Get device SM version 107 | int sm_version; 108 | CubDebugExit(SmVersion(sm_version, device_ordinal)); 109 | 110 | // Get SM properties 111 | int sm_count, max_block_threads, max_sm_occupancy; 112 | CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal)); 113 | CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal)); 114 | CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel, 32)); 115 | 116 | // Compute grid size and occupancy 117 | int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy); 118 | 119 | if (grid_size == -1) 120 | { 121 | grid_size = occupancy * sm_count; 122 | } 123 | else 124 | { 125 | occupancy = grid_size / sm_count; 126 | } 127 | 128 | printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n", 129 | grid_size, block_size, occupancy); 130 | fflush(stdout); 131 | 132 | // Init global barrier 133 | GridBarrierLifetime global_barrier; 134 | global_barrier.Setup(grid_size); 135 | 136 | // Time kernel 137 | GpuTimer gpu_timer; 138 | gpu_timer.Start(); 139 | Kernel<<>>(global_barrier, iterations); 140 | gpu_timer.Stop(); 141 | 142 | retval = CubDebug(cudaThreadSynchronize()); 143 | 144 | // Output timing results 145 | float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations); 146 | printf("%d iterations, %f total elapsed 
millis, %f avg elapsed millis\n", 147 | iterations, 148 | gpu_timer.ElapsedMillis(), 149 | avg_elapsed); 150 | 151 | return retval; 152 | } 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/test/mersenne.h: -------------------------------------------------------------------------------- 1 | /* 2 | A C-program for MT19937, with initialization improved 2002/1/26. 3 | Coded by Takuji Nishimura and Makoto Matsumoto. 4 | 5 | Before using, initialize the state by using init_genrand(seed) 6 | or init_by_array(init_key, key_length). 7 | 8 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions 13 | are met: 14 | 15 | 1. Redistributions of source code must retain the above copyright 16 | notice, this list of conditions and the following disclaimer. 17 | 18 | 2. Redistributions in binary form must reproduce the above copyright 19 | notice, this list of conditions and the following disclaimer in the 20 | documentation and/or other materials provided with the distribution. 21 | 22 | 3. The names of its contributors may not be used to endorse or promote 23 | products derived from this software without specific prior written 24 | permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 27 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 28 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 29 | A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 30 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 31 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 32 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 33 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 34 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 35 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | 38 | 39 | Any feedback is very welcome. 40 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html 41 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) 42 | */ 43 | 44 | #include <stdio.h> 45 | 46 | namespace mersenne { 47 | 48 | /* Period parameters */ 49 | const unsigned int N = 624; 50 | const unsigned int M = 397; 51 | const unsigned int MATRIX_A = 0x9908b0df; /* constant vector a */ 52 | const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */ 53 | const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */ 54 | 55 | static unsigned int mt[N]; /* the array for the state vector */ 56 | static int mti = N + 1; /* mti==N+1 means mt[N] is not initialized */ 57 | 58 | /* initializes mt[N] with a seed */ 59 | void init_genrand(unsigned int s) 60 | { 61 | mt[0] = s & 0xffffffff; 62 | for (mti = 1; mti < N; mti++) 63 | { 64 | mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti); 65 | 66 | /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ 67 | /* In the previous versions, MSBs of the seed affect */ 68 | /* only MSBs of the array mt[].
*/ 69 | /* 2002/01/09 modified by Makoto Matsumoto */ 70 | 71 | mt[mti] &= 0xffffffff; 72 | /* for >32 bit machines */ 73 | } 74 | } 75 | 76 | /* initialize by an array with array-length */ 77 | /* init_key is the array for initializing keys */ 78 | /* key_length is its length */ 79 | /* slight change for C++, 2004/2/26 */ 80 | void init_by_array(unsigned int init_key[], int key_length) 81 | { 82 | int i, j, k; 83 | init_genrand(19650218); 84 | i = 1; 85 | j = 0; 86 | k = (N > key_length ? N : key_length); 87 | for (; k; k--) 88 | { 89 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525)) 90 | + init_key[j] + j; /* non linear */ 91 | mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 92 | i++; 93 | j++; 94 | if (i >= N) 95 | { 96 | mt[0] = mt[N - 1]; 97 | i = 1; 98 | } 99 | if (j >= key_length) j = 0; 100 | } 101 | for (k = N - 1; k; k--) 102 | { 103 | mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */ 104 | mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */ 105 | i++; 106 | if (i >= N) 107 | { 108 | mt[0] = mt[N - 1]; 109 | i = 1; 110 | } 111 | } 112 | 113 | mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */ 114 | } 115 | 116 | /* generates a random number on [0,0xffffffff]-interval */ 117 | unsigned int genrand_int32(void) 118 | { 119 | unsigned int y; 120 | static unsigned int mag01[2] = { 0x0, MATRIX_A }; 121 | 122 | /* mag01[x] = x * MATRIX_A for x=0,1 */ 123 | 124 | if (mti >= N) 125 | { /* generate N words at one time */ 126 | int kk; 127 | 128 | if (mti == N + 1) /* if init_genrand() has not been called, */ 129 | init_genrand(5489); /* a default initial seed is used */ 130 | 131 | for (kk = 0; kk < N - M; kk++) 132 | { 133 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 134 | mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1]; 135 | } 136 | for (; kk < N - 1; kk++) 137 | { 138 | y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK); 139 | mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y
& 0x1]; 140 | } 141 | y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK); 142 | mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1]; 143 | 144 | mti = 0; 145 | } 146 | 147 | y = mt[mti++]; 148 | 149 | /* Tempering */ 150 | y ^= (y >> 11); 151 | y ^= (y << 7) & 0x9d2c5680; 152 | y ^= (y << 15) & 0xefc60000; 153 | y ^= (y >> 18); 154 | 155 | return y; 156 | } 157 | 158 | 159 | 160 | } // namespace mersenne 161 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/README.md: -------------------------------------------------------------------------------- 1 |
2 |

About CUB

3 | 4 | Current release: v1.8.0 (02/16/2018) 5 | 6 | We recommend the [CUB Project Website](http://nvlabs.github.com/cub) for further information and examples. 7 | 8 | CUB provides state-of-the-art, reusable software components for every layer 9 | of the CUDA programming model: 10 | - [Device-wide primitives] (https://nvlabs.github.com/cub/group___device_module.html) 11 | - Sort, prefix scan, reduction, histogram, etc. 12 | - Compatible with CUDA dynamic parallelism 13 | - [Block-wide "collective" primitives] (https://nvlabs.github.com/cub/group___block_module.html) 14 | - I/O, sort, prefix scan, reduction, histogram, etc. 15 | - Compatible with arbitrary thread block sizes and types 16 | - [Warp-wide "collective" primitives] (https://nvlabs.github.com/cub/group___warp_module.html) 17 | - Warp-wide prefix scan, reduction, etc. 18 | - Safe and architecture-specific 19 | - [Thread and resource utilities](https://nvlabs.github.com/cub/group___thread_module.html) 20 | - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. 21 | 22 | ![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png) 23 | 24 |

25 |

A Simple Example

26 | 27 | ```C++ 28 | #include 29 | 30 | // Block-sorting CUDA kernel 31 | __global__ void BlockSortKernel(int *d_in, int *d_out) 32 | { 33 | using namespace cub; 34 | 35 | // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 36 | // owning 16 integer items each 37 | typedef BlockRadixSort BlockRadixSort; 38 | typedef BlockLoad BlockLoad; 39 | typedef BlockStore BlockStore; 40 | 41 | // Allocate shared memory 42 | __shared__ union { 43 | typename BlockRadixSort::TempStorage sort; 44 | typename BlockLoad::TempStorage load; 45 | typename BlockStore::TempStorage store; 46 | } temp_storage; 47 | 48 | int block_offset = blockIdx.x * (128 * 16); // OffsetT for this block's ment 49 | 50 | // Obtain a segment of 2048 consecutive keys that are blocked across threads 51 | int thread_keys[16]; 52 | BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys); 53 | __syncthreads(); 54 | 55 | // Collectively sort the keys 56 | BlockRadixSort(temp_storage.sort).Sort(thread_keys); 57 | __syncthreads(); 58 | 59 | // Store the sorted segment 60 | BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys); 61 | } 62 | ``` 63 | 64 | Each thread block uses cub::BlockRadixSort to collectively sort 65 | its own input segment. The class is specialized by the 66 | data type being sorted, by the number of threads per block, by the number of 67 | keys per thread, and implicitly by the targeted compilation architecture. 68 | 69 | The cub::BlockLoad and cub::BlockStore classes are similarly specialized. 70 | Furthermore, to provide coalesced accesses to device memory, these primitives are 71 | configured to access memory using a striped access pattern (where consecutive threads 72 | simultaneously access consecutive items) and then transpose the keys into 73 | a [blocked arrangement](index.html#sec4sec3) of elements across threads. 74 | 75 | Once specialized, these classes expose opaque \p TempStorage member types. 
76 | The thread block uses these storage types to statically allocate the union of 77 | shared memory needed by the thread block. (Alternatively these storage types 78 | could be aliased to global memory allocations). 79 | 80 |

81 |

Stable Releases

82 | 83 | CUB releases are labeled using version identifiers having three fields: 84 | *epoch.feature.update*. The *epoch* field corresponds to support for 85 | a major change in the CUDA programming model. The *feature* field 86 | corresponds to a stable set of features, functionality, and interface. The 87 | *update* field corresponds to a bug-fix or performance update for that 88 | feature set. At the moment, we do not publicly provide non-stable releases 89 | such as development snapshots, beta releases or rolling releases. (Feel free 90 | to contact us if you would like such things.) See the 91 | [CUB Project Website](http://nvlabs.github.com/cub) for more information. 92 | 93 |

94 |

Contributors

95 | 96 | CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com). The primary contributor is [Duane Merrill](http://github.com/dumerrill). 97 | 98 |

99 |

Open Source License

100 | 101 | CUB is available under the "New BSD" open-source license: 102 | 103 | ``` 104 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 105 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 106 | 107 | Redistribution and use in source and binary forms, with or without 108 | modification, are permitted provided that the following conditions are met: 109 | * Redistributions of source code must retain the above copyright 110 | notice, this list of conditions and the following disclaimer. 111 | * Redistributions in binary form must reproduce the above copyright 112 | notice, this list of conditions and the following disclaimer in the 113 | documentation and/or other materials provided with the distribution. 114 | * Neither the name of the NVIDIA CORPORATION nor the 115 | names of its contributors may be used to endorse or promote products 116 | derived from this software without specific prior written permission. 117 | 118 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 119 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 120 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 121 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 122 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 123 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 124 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 125 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 126 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 127 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
128 | ``` 129 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_full_precision_asynchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::decentralized_full_precision_synchronous::PeerSelectionMode; 2 | use crate::comm_ops::CommOpTrait; 3 | use crate::communicators::BaguaCommunicator; 4 | use crate::datatypes::{BaguaBucket, BaguaReductionOp, BaguaTensorRaw, RawBaguaTensor}; 5 | use crate::events::BaguaEventChannel; 6 | use crate::resource_pool::{CUDA_DEVICE_MEMORY_POOL, CUDA_EVENT_POOL}; 7 | use crate::{BaguaCommOpChannels, BaguaCoreError}; 8 | use std::sync::Arc; 9 | use std::time::Duration; 10 | 11 | #[derive(Debug)] 12 | pub struct DecentralizedFullPrecisionAsynchronous { 13 | pub communicator: BaguaCommunicator, 14 | pub peer_selection_mode: PeerSelectionMode, 15 | pub torch_stream: u64, 16 | } 17 | 18 | impl CommOpTrait for DecentralizedFullPrecisionAsynchronous { 19 | fn execute_background_communication( 20 | &self, 21 | bucket: Arc, 22 | comm_op_channels: &BaguaCommOpChannels, 23 | ) { 24 | let bucket_guard = bucket.inner.lock(); 25 | let comm_stream = self.communicator.stream_ptr(); 26 | 27 | let mut communication_tensor = match &self.communicator { 28 | BaguaCommunicator::SingleCommunicator(_) => { 29 | bucket_guard.get_communication_tensor(comm_stream, false, false) 30 | } 31 | BaguaCommunicator::HierarchicalCommunicator(x) => { 32 | panic!("asynchronous op only accepts non-hierarchical communicator"); 33 | } 34 | }; 35 | 36 | let peer_mode = &self.peer_selection_mode; 37 | 38 | let torch_stream = self.torch_stream; 39 | 40 | self.communicator.execute_communication( 41 | &mut communication_tensor, 42 | false, 43 | false, 44 | false, 45 | &mut |c, t| { 46 | let start_time = std::time::Instant::now(); 47 | tracing::debug!("async model average start"); 48 | 49 | let temp_buf = 
CUDA_DEVICE_MEMORY_POOL[t.raw.device_id()] 50 | .try_pull(t.raw.num_elements_allocated() * t.raw.dtype().bytes()) 51 | .expect("cannot allocate cuda memory"); 52 | 53 | let mut temp_tensor = BaguaTensorRaw { 54 | ptr: temp_buf.ptr, 55 | num_elem_allocated: t.raw.num_elements_allocated(), 56 | dtype: t.raw.dtype().clone(), 57 | num_elem: t.raw.num_elements(), 58 | device_id: t.raw.device_id(), 59 | pool_allocations: vec![Arc::new(temp_buf)], 60 | }; 61 | 62 | let reduced_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id()] 63 | .try_pull(t.raw.num_elements_allocated() * t.raw.dtype().bytes()) 64 | .expect("cannot allocate cuda memory"); 65 | 66 | let mut reduced_tensor = BaguaTensorRaw { 67 | ptr: reduced_buf.ptr, 68 | num_elem_allocated: t.raw.num_elements_allocated(), 69 | dtype: t.raw.dtype().clone(), 70 | num_elem: t.raw.num_elements(), 71 | device_id: t.raw.device_id(), 72 | pool_allocations: vec![Arc::new(reduced_buf)], 73 | }; 74 | 75 | // use default stream to copy weights 76 | temp_tensor.clone_from(&t.raw, torch_stream as u64); 77 | 78 | let src_ready_event = CUDA_EVENT_POOL.take().event; 79 | 80 | unsafe { 81 | cpp::cpp!([ 82 | src_ready_event as "cudaEvent_t", 83 | comm_stream as "cudaStream_t", 84 | torch_stream as "cudaStream_t"] 85 | { 86 | CUDACHECK(cudaEventRecord(src_ready_event, torch_stream)); 87 | CUDACHECK(cudaStreamWaitEvent(comm_stream, src_ready_event , 0)); 88 | }); 89 | } 90 | 91 | if c.check_abort() { 92 | return; 93 | } 94 | 95 | match peer_mode { 96 | PeerSelectionMode::All => { 97 | c.allreduce(&temp_tensor, &mut reduced_tensor, BaguaReductionOp::SUM); 98 | } 99 | PeerSelectionMode::Ring => { 100 | unimplemented!() 101 | } 102 | PeerSelectionMode::ShiftOne => { 103 | unimplemented!() 104 | } 105 | }; 106 | 107 | let comm_ready_event = CUDA_EVENT_POOL.take().event; 108 | 109 | unsafe { 110 | cpp::cpp!([ 111 | comm_ready_event as "cudaEvent_t", 112 | comm_stream as "cudaStream_t"] 113 | { 114 | CUDACHECK(cudaEventRecord(comm_ready_event, 
comm_stream)); 115 | CUDACHECK(cudaEventSynchronize(comm_ready_event)); 116 | }); 117 | } 118 | 119 | if c.check_abort() { 120 | return; 121 | } 122 | 123 | // do we need to wait default stream? 124 | unsafe { 125 | cpp::cpp!([ 126 | src_ready_event as "cudaEvent_t", 127 | comm_stream as "cudaStream_t", 128 | torch_stream as "cudaStream_t"] 129 | { 130 | CUDACHECK(cudaEventRecord(src_ready_event, torch_stream)); 131 | CUDACHECK(cudaStreamWaitEvent(comm_stream, src_ready_event , 0)); 132 | }); 133 | } 134 | 135 | t.raw.async_model_average( 136 | &reduced_tensor, 137 | &temp_tensor, 138 | c.nranks as f32, 139 | comm_stream, 140 | ); 141 | 142 | unsafe { 143 | cpp::cpp!([comm_stream as "cudaStream_t"] 144 | { 145 | CUDACHECK(cudaStreamSynchronize(comm_stream)); 146 | }); 147 | } 148 | 149 | tracing::debug!( 150 | "async model average update cost: {:?}", 151 | start_time.elapsed() 152 | ); 153 | }, 154 | ); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/example_device_reduce.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple example of DeviceReduce::Sum(). 31 | * 32 | * Sums an array of int keys. 33 | * 34 | * To compile using the command line: 35 | * nvcc -arch=sm_XX example_device_reduce.cu -I../.. 
-lcudart -O3 36 | * 37 | ******************************************************************************/ 38 | 39 | // Ensure printing of CUDA runtime errors to console 40 | #define CUB_STDERR 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | 47 | #include "../../test/test_util.h" 48 | 49 | using namespace cub; 50 | 51 | 52 | //--------------------------------------------------------------------- 53 | // Globals, constants and typedefs 54 | //--------------------------------------------------------------------- 55 | 56 | bool g_verbose = false; // Whether to display input/output to console 57 | CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory 58 | 59 | 60 | //--------------------------------------------------------------------- 61 | // Test generation 62 | //--------------------------------------------------------------------- 63 | 64 | /** 65 | * Initialize problem 66 | */ 67 | void Initialize( 68 | int *h_in, 69 | int num_items) 70 | { 71 | for (int i = 0; i < num_items; ++i) 72 | h_in[i] = i; 73 | 74 | if (g_verbose) 75 | { 76 | printf("Input:\n"); 77 | DisplayResults(h_in, num_items); 78 | printf("\n\n"); 79 | } 80 | } 81 | 82 | 83 | /** 84 | * Compute solution 85 | */ 86 | void Solve( 87 | int *h_in, 88 | int &h_reference, 89 | int num_items) 90 | { 91 | for (int i = 0; i < num_items; ++i) 92 | { 93 | if (i == 0) 94 | h_reference = h_in[0]; 95 | else 96 | h_reference += h_in[i]; 97 | } 98 | } 99 | 100 | 101 | //--------------------------------------------------------------------- 102 | // Main 103 | //--------------------------------------------------------------------- 104 | 105 | /** 106 | * Main 107 | */ 108 | int main(int argc, char** argv) 109 | { 110 | int num_items = 150; 111 | 112 | // Initialize command line 113 | CommandLineArgs args(argc, argv); 114 | g_verbose = args.CheckCmdLineFlag("v"); 115 | args.GetCmdLineArgument("n", num_items); 116 | 117 | // Print usage 118 | if (args.CheckCmdLineFlag("help")) 119 | { 
120 | printf("%s " 121 | "[--n= " 122 | "[--device=] " 123 | "[--v] " 124 | "\n", argv[0]); 125 | exit(0); 126 | } 127 | 128 | // Initialize device 129 | CubDebugExit(args.DeviceInit()); 130 | 131 | printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n", 132 | num_items, (int) sizeof(int)); 133 | fflush(stdout); 134 | 135 | // Allocate host arrays 136 | int* h_in = new int[num_items]; 137 | int h_reference; 138 | 139 | // Initialize problem and solution 140 | Initialize(h_in, num_items); 141 | Solve(h_in, h_reference, num_items); 142 | 143 | // Allocate problem device arrays 144 | int *d_in = NULL; 145 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); 146 | 147 | // Initialize device input 148 | CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); 149 | 150 | // Allocate device output array 151 | int *d_out = NULL; 152 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); 153 | 154 | // Request and allocate temporary storage 155 | void *d_temp_storage = NULL; 156 | size_t temp_storage_bytes = 0; 157 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 158 | CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); 159 | 160 | // Run 161 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 162 | 163 | // Check for correctness (and display results, if specified) 164 | int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); 165 | printf("\t%s", compare ? 
"FAIL" : "PASS"); 166 | AssertEquals(0, compare); 167 | 168 | // Cleanup 169 | if (h_in) delete[] h_in; 170 | if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); 171 | if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); 172 | if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); 173 | 174 | printf("\n\n"); 175 | 176 | return 0; 177 | } 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) 46 | namespace internal { 47 | 48 | /** 49 | * Sequential reduction over statically-sized array types 50 | */ 51 | template < 52 | int LENGTH, 53 | typename T, 54 | typename ReductionOp> 55 | __device__ __forceinline__ T ThreadReduce( 56 | T* input, ///< [in] Input array 57 | ReductionOp reduction_op, ///< [in] Binary reduction operator 58 | T prefix, ///< [in] Prefix to seed reduction with 59 | Int2Type /*length*/) 60 | { 61 | T retval = prefix; 62 | 63 | #pragma unroll 64 | for (int i = 0; i < LENGTH; ++i) 65 | retval = reduction_op(retval, input[i]); 66 | 67 | return retval; 68 | } 69 | 70 | 71 | /** 72 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 73 | * 74 | * \tparam LENGTH LengthT of input array 75 | * \tparam T [inferred] The data type to be reduced. 
76 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 77 | */ 78 | template < 79 | int LENGTH, 80 | typename T, 81 | typename ReductionOp> 82 | __device__ __forceinline__ T ThreadReduce( 83 | T* input, ///< [in] Input array 84 | ReductionOp reduction_op, ///< [in] Binary reduction operator 85 | T prefix) ///< [in] Prefix to seed reduction with 86 | { 87 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 88 | } 89 | 90 | 91 | /** 92 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 93 | * 94 | * \tparam LENGTH LengthT of input array 95 | * \tparam T [inferred] The data type to be reduced. 96 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 97 | */ 98 | template < 99 | int LENGTH, 100 | typename T, 101 | typename ReductionOp> 102 | __device__ __forceinline__ T ThreadReduce( 103 | T* input, ///< [in] Input array 104 | ReductionOp reduction_op) ///< [in] Binary reduction operator 105 | { 106 | T prefix = input[0]; 107 | return ThreadReduce(input + 1, reduction_op, prefix); 108 | } 109 | 110 | 111 | /** 112 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 113 | * 114 | * \tparam LENGTH [inferred] LengthT of \p input array 115 | * \tparam T [inferred] The data type to be reduced. 
116 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 117 | */ 118 | template < 119 | int LENGTH, 120 | typename T, 121 | typename ReductionOp> 122 | __device__ __forceinline__ T ThreadReduce( 123 | T (&input)[LENGTH], ///< [in] Input array 124 | ReductionOp reduction_op, ///< [in] Binary reduction operator 125 | T prefix) ///< [in] Prefix to seed reduction with 126 | { 127 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 128 | } 129 | 130 | 131 | /** 132 | * \brief Serial reduction with the specified operator 133 | * 134 | * \tparam LENGTH [inferred] LengthT of \p input array 135 | * \tparam T [inferred] The data type to be reduced. 136 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 137 | */ 138 | template < 139 | int LENGTH, 140 | typename T, 141 | typename ReductionOp> 142 | __device__ __forceinline__ T ThreadReduce( 143 | T (&input)[LENGTH], ///< [in] Input array 144 | ReductionOp reduction_op) ///< [in] Binary reduction operator 145 | { 146 | return ThreadReduce((T*) input, reduction_op); 147 | } 148 | 149 | 150 | } // internal namespace 151 | } // CUB namespace 152 | CUB_NS_POSTFIX // Optional outer namespace(s) 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = 
(SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load 96 | USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), 97 | 98 | /// Total number of elements in the raking grid 99 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), 100 | 101 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 102 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 103 | }; 104 | 105 | 106 | /** 107 | * \brief Shared memory storage type 108 | */ 109 | struct __align__(16) _TempStorage 110 | { 111 | T buff[BlockRakingLayout::GRID_ELEMENTS]; 112 | }; 113 | 114 | /// Alias wrapper allowing storage to be unioned 115 | struct TempStorage : Uninitialized<_TempStorage> {}; 116 | 117 | 118 | /** 119 | * \brief Returns the location for the calling thread to place data into the grid 120 | */ 121 | static __device__ __forceinline__ T* PlacementPtr( 122 | TempStorage &temp_storage, 123 | unsigned int linear_tid) 124 | { 125 | // Offset for partial 126 | unsigned int offset = linear_tid; 127 | 128 | // Add in one padding element for every segment 129 | if (USE_SEGMENT_PADDING > 0) 130 | { 131 | offset += offset / SEGMENT_LENGTH; 132 | } 133 | 134 | // Incorporating a block of padding partials every shared memory segment 135 | return temp_storage.Alias().buff + offset; 136 | } 137 | 138 | 139 | /** 140 | * \brief Returns the location for the calling thread to begin sequential 
raking 141 | */ 142 | static __device__ __forceinline__ T* RakingPtr( 143 | TempStorage &temp_storage, 144 | unsigned int linear_tid) 145 | { 146 | return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); 147 | } 148 | }; 149 | 150 | } // CUB namespace 151 | CUB_NS_POSTFIX // Optional outer namespace(s) 152 | 153 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | CTA_SYNC(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | CTA_SYNC(); 94 | 95 | // Wait for everyone else to report 
in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | CTA_SYNC(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | CTA_SYNC(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/examples/device/example_device_scan.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, 
Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple example of DeviceScan::ExclusiveSum(). 31 | * 32 | * Computes an exclusive sum of int keys. 
33 | * 34 | * To compile using the command line: 35 | * nvcc -arch=sm_XX example_device_scan.cu -I../.. -lcudart -O3 36 | * 37 | ******************************************************************************/ 38 | 39 | // Ensure printing of CUDA runtime errors to console 40 | #define CUB_STDERR 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | 47 | #include "../../test/test_util.h" 48 | 49 | using namespace cub; 50 | 51 | 52 | //--------------------------------------------------------------------- 53 | // Globals, constants and typedefs 54 | //--------------------------------------------------------------------- 55 | 56 | bool g_verbose = false; // Whether to display input/output to console 57 | CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory 58 | 59 | 60 | //--------------------------------------------------------------------- 61 | // Test generation 62 | //--------------------------------------------------------------------- 63 | 64 | 65 | /** 66 | * Initialize problem 67 | */ 68 | void Initialize( 69 | int *h_in, 70 | int num_items) 71 | { 72 | for (int i = 0; i < num_items; ++i) 73 | h_in[i] = i; 74 | 75 | if (g_verbose) 76 | { 77 | printf("Input:\n"); 78 | DisplayResults(h_in, num_items); 79 | printf("\n\n"); 80 | } 81 | } 82 | 83 | /** 84 | * Solve exclusive-scan problem 85 | */ 86 | int Solve( 87 | int *h_in, 88 | int *h_reference, 89 | int num_items) 90 | { 91 | int inclusive = 0; 92 | int aggregate = 0; 93 | 94 | for (int i = 0; i < num_items; ++i) 95 | { 96 | h_reference[i] = inclusive; 97 | inclusive += h_in[i]; 98 | aggregate += h_in[i]; 99 | } 100 | 101 | return aggregate; 102 | } 103 | 104 | 105 | 106 | //--------------------------------------------------------------------- 107 | // Main 108 | //--------------------------------------------------------------------- 109 | 110 | /** 111 | * Main 112 | */ 113 | int main(int argc, char** argv) 114 | { 115 | int num_items = 150; 116 | 117 | // Initialize command line 
118 | CommandLineArgs args(argc, argv); 119 | g_verbose = args.CheckCmdLineFlag("v"); 120 | args.GetCmdLineArgument("n", num_items); 121 | 122 | // Print usage 123 | if (args.CheckCmdLineFlag("help")) 124 | { 125 | printf("%s " 126 | "[--n= " 127 | "[--device=] " 128 | "[--v] " 129 | "\n", argv[0]); 130 | exit(0); 131 | } 132 | 133 | // Initialize device 134 | CubDebugExit(args.DeviceInit()); 135 | 136 | printf("cub::DeviceScan::ExclusiveSum %d items (%d-byte elements)\n", 137 | num_items, (int) sizeof(int)); 138 | fflush(stdout); 139 | 140 | // Allocate host arrays 141 | int* h_in = new int[num_items]; 142 | int* h_reference = new int[num_items]; 143 | 144 | // Initialize problem and solution 145 | Initialize(h_in, num_items); 146 | Solve(h_in, h_reference, num_items); 147 | 148 | // Allocate problem device arrays 149 | int *d_in = NULL; 150 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); 151 | 152 | // Initialize device input 153 | CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); 154 | 155 | // Allocate device output array 156 | int *d_out = NULL; 157 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * num_items)); 158 | 159 | // Allocate temporary storage 160 | void *d_temp_storage = NULL; 161 | size_t temp_storage_bytes = 0; 162 | CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 163 | CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); 164 | 165 | // Run 166 | CubDebugExit(DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 167 | 168 | // Check for correctness (and display results, if specified) 169 | int compare = CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose); 170 | printf("\t%s", compare ? 
"FAIL" : "PASS"); 171 | AssertEquals(0, compare); 172 | 173 | // Cleanup 174 | if (h_in) delete[] h_in; 175 | if (h_reference) delete[] h_reference; 176 | if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); 177 | if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); 178 | if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); 179 | 180 | printf("\n\n"); 181 | 182 | return 0; 183 | } 184 | 185 | 186 | 187 | -------------------------------------------------------------------------------- /bagua-core-internal/src/comm_ops/decentralized_low_precision_synchronous.rs: -------------------------------------------------------------------------------- 1 | use crate::comm_ops::decentralized_full_precision_synchronous::PeerSelectionMode; 2 | use crate::comm_ops::CommOpTrait; 3 | use crate::communicators::{BaguaCommunicator, BaguaHierarchicalCommunicator, NCCLGroupGuard}; 4 | use crate::datatypes::{ 5 | BaguaBucket, BaguaTensor, BaguaTensorRaw, RawBaguaTensor, TensorCompressionMethod, 6 | }; 7 | use crate::events::BaguaEventChannel; 8 | use crate::resource_pool::CUDA_DEVICE_MEMORY_POOL; 9 | use crate::{BaguaCommOpChannels, BaguaScheduledCommOp}; 10 | use parking_lot::Mutex; 11 | use std::sync::Arc; 12 | 13 | #[derive(Debug)] 14 | pub struct DecentralizedLowPrecisionSynchronous { 15 | pub communicator: BaguaCommunicator, 16 | pub peer_selection_mode: PeerSelectionMode, 17 | pub compression_method: TensorCompressionMethod, 18 | pub weight: BaguaTensor, 19 | pub left_peer_weight: BaguaTensor, 20 | pub right_peer_weight: BaguaTensor, 21 | } 22 | 23 | impl CommOpTrait for DecentralizedLowPrecisionSynchronous { 24 | fn execute_background_communication( 25 | &self, 26 | bucket: Arc, 27 | _comm_op_channels: &BaguaCommOpChannels, 28 | ) { 29 | let bucket_guard = bucket.inner.lock(); 30 | let stream_ptr = self.communicator.stream_ptr(); 31 | 32 | let mut communication_tensor = 33 | bucket_guard.get_communication_tensor(stream_ptr, false, false); 34 | 35 | let 
peer_mode = &self.peer_selection_mode; 36 | 37 | self.communicator.execute_communication( 38 | &mut communication_tensor, 39 | true, 40 | true, 41 | true, 42 | &mut |c, t| { 43 | tracing::debug!("start compress diff"); 44 | 45 | t.raw.addmul_inplace( 46 | self.left_peer_weight.inner.read().raw.as_ref(), 47 | 1.0 / 3.0, 48 | c.stream_ptr, 49 | ); 50 | t.raw.addmul_inplace( 51 | self.right_peer_weight.inner.read().raw.as_ref(), 52 | 1.0 / 3.0, 53 | c.stream_ptr, 54 | ); 55 | 56 | { 57 | let weight_guard = self.weight.inner.read(); 58 | t.raw 59 | .addmul_inplace(weight_guard.raw.as_ref(), -5.0 / 3.0, c.stream_ptr); 60 | } 61 | let compressed_tensor = t 62 | .raw 63 | .compress(&self.compression_method, 1, c.stream_ptr, -1) 64 | .expect("cannot compress tensor"); 65 | 66 | tracing::debug!("start communicate with peers"); 67 | let lrecv_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id] 68 | .try_pull( 69 | compressed_tensor.num_elements_allocated() 70 | * compressed_tensor.dtype().bytes(), 71 | ) 72 | .expect("cannot allocate cuda memory"); 73 | let mut lrecv_tensor = BaguaTensorRaw { 74 | ptr: lrecv_buf.ptr, 75 | num_elem_allocated: compressed_tensor.num_elements_allocated(), 76 | dtype: compressed_tensor.dtype().clone(), 77 | num_elem: compressed_tensor.num_elements(), 78 | device_id: compressed_tensor.device_id(), 79 | pool_allocations: vec![Arc::new(lrecv_buf)], 80 | }; 81 | 82 | let rrecv_buf = CUDA_DEVICE_MEMORY_POOL[t.raw.device_id] 83 | .try_pull( 84 | compressed_tensor.num_elements_allocated() 85 | * compressed_tensor.dtype().bytes(), 86 | ) 87 | .expect("cannot allocate cuda memory"); 88 | let mut rrecv_tensor = BaguaTensorRaw { 89 | ptr: rrecv_buf.ptr, 90 | num_elem_allocated: compressed_tensor.num_elements_allocated(), 91 | dtype: compressed_tensor.dtype().clone(), 92 | num_elem: compressed_tensor.num_elements(), 93 | device_id: compressed_tensor.device_id(), 94 | pool_allocations: vec![Arc::new(rrecv_buf)], 95 | }; 96 | 97 | match peer_mode { 98 | 
PeerSelectionMode::Ring => { 99 | let left_peer_rank = ((c.rank + c.nranks - 1) % c.nranks) as i32; 100 | let right_peer_rank = ((c.rank + 1) % c.nranks) as i32; 101 | 102 | { 103 | let _guard = NCCLGroupGuard::new(); 104 | 105 | tracing::debug!( 106 | "rank: {} left peer: {} right peer: {}", 107 | c.rank, 108 | left_peer_rank, 109 | right_peer_rank 110 | ); 111 | c.send(compressed_tensor.as_ref(), left_peer_rank); 112 | c.send(compressed_tensor.as_ref(), right_peer_rank); 113 | c.recv(&mut lrecv_tensor, left_peer_rank); 114 | c.recv(&mut rrecv_tensor, right_peer_rank); 115 | } 116 | } 117 | PeerSelectionMode::All => { 118 | unimplemented!() 119 | } 120 | PeerSelectionMode::ShiftOne => { 121 | unimplemented!() 122 | } 123 | }; 124 | 125 | tracing::debug!("start decompress diff and update weights"); 126 | t.raw 127 | .decompress_from(&self.compression_method, 1, &lrecv_tensor, c.stream_ptr); 128 | { 129 | let mut weight_guard = self.left_peer_weight.inner.write(); 130 | weight_guard.raw.add_inplace(&t.raw, c.stream_ptr); 131 | } 132 | 133 | t.raw 134 | .decompress_from(&self.compression_method, 1, &rrecv_tensor, c.stream_ptr); 135 | { 136 | let mut weight_guard = self.right_peer_weight.inner.write(); 137 | weight_guard.raw.add_inplace(&t.raw, c.stream_ptr); 138 | } 139 | 140 | t.raw.decompress_from( 141 | &self.compression_method, 142 | 1, 143 | compressed_tensor.as_ref(), 144 | c.stream_ptr, 145 | ); 146 | 147 | { 148 | let mut weight_guard = self.weight.inner.write(); 149 | t.raw.add_inplace(weight_guard.raw.as_ref(), c.stream_ptr); 150 | weight_guard.raw.clone_from(&t.raw, c.stream_ptr); 151 | } 152 | }, 153 | ); 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_gmem_atomics.h: -------------------------------------------------------------------------------- 1 | 
/******************************************************************************
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

// Two-pass multi-channel histogram using global-memory atomics:
// pass 1 accumulates per-block partial histograms, pass 2 reduces them.

#include <cub/cub.cuh>

namespace histogram_gmem_atomics
{
    // Decode float4 pixel into bins.
    // NOTE(review): assumes samples are normalized to [0, 1) — a sample of
    // exactly 1.0 would index one past the last bin; confirm against callers.
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        float* samples = reinterpret_cast<float*>(&pixel);

        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
            bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS));
    }

    // Decode uchar4 pixel into bins
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        unsigned char* samples = reinterpret_cast<unsigned char*>(&pixel);

        #pragma unroll
        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
            bins[CHANNEL] = (unsigned int) (samples[CHANNEL]);
    }

    // Decode uchar1 pixel into bins
    template <int NUM_BINS, int ACTIVE_CHANNELS>
    __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS])
    {
        bins[0] = (unsigned int) pixel.x;
    }

    // First-pass histogram kernel (binning into privatized counters).
    // Each block owns a NUM_PARTS-stride slice of `out` and accumulates a
    // partial histogram there with global-memory atomics.
    template <
        int         NUM_PARTS,
        int         ACTIVE_CHANNELS,
        int         NUM_BINS,
        typename    PixelType>
    __global__ void histogram_gmem_atomics(
        const PixelType *in,
        int width,
        int height,
        unsigned int *out)
    {
        // global position and size
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;
        int nx = blockDim.x * gridDim.x;
        int ny = blockDim.y * gridDim.y;

        // threads in workgroup
        int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1
        int nt = blockDim.x * blockDim.y; // total threads in workgroup

        // group index in 0..ngroups-1
        int g = blockIdx.x + blockIdx.y * gridDim.x;

        // zero this block's slice of the partial-histogram array
        unsigned int *gmem = out + g * NUM_PARTS;
        for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS; i += nt)
            gmem[i] = 0;
        __syncthreads();

        // process pixels (updates our group's partial histogram in gmem)
        for (int col = x; col < width; col += nx)
        {
            for (int row = y; row < height; row += ny)
            {
                PixelType pixel = in[row * width + col];

                unsigned int bins[ACTIVE_CHANNELS];
                DecodePixel<NUM_BINS>(pixel, bins);

                #pragma unroll
                for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
                    atomicAdd(&gmem[(NUM_BINS * CHANNEL) + bins[CHANNEL]], 1);
            }
        }
    }

    // Second-pass histogram kernel (accumulation): one thread per final bin
    // sums that bin across all n per-block partial histograms.
    template <
        int         NUM_PARTS,
        int         ACTIVE_CHANNELS,
        int         NUM_BINS>
    __global__ void histogram_gmem_accum(
        const unsigned int *in,
        int n,
        unsigned int *out)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        // FIX: was `i > ACTIVE_CHANNELS * NUM_BINS`, which let the thread with
        // i == ACTIVE_CHANNELS * NUM_BINS read and write one element past the
        // end of the histogram (out-of-bounds on `out`).
        if (i >= ACTIVE_CHANNELS * NUM_BINS)
            return;     // out of range

        unsigned int total = 0;
        for (int j = 0; j < n; j++)
            total += in[i + NUM_PARTS * j];

        out[i] = total;
    }

}   // namespace histogram_gmem_atomics


// Host driver: runs the two-pass global-memory-atomics histogram over a
// width x height image and returns the elapsed GPU time in milliseconds.
// `d_hist` must hold ACTIVE_CHANNELS * NUM_BINS counters.
template <
    int         ACTIVE_CHANNELS,
    int         NUM_BINS,
    typename    PixelType>
double run_gmem_atomics(
    PixelType *d_image,
    int width,
    int height,
    unsigned int *d_hist,
    bool warmup)
{
    enum
    {
        NUM_PARTS = 1024
    };

    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, 0);

    dim3 block(32, 4);
    dim3 grid(16, 16);
    int total_blocks = grid.x * grid.y;

    // allocate partial histogram (one NUM_PARTS slice per first-pass block)
    unsigned int *d_part_hist;
    cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int));

    dim3 block2(128);
    // FIX: was (3 * NUM_BINS + block.x - 1) / block.x — a hard-coded
    // 3-channel count and the *first-pass* block size. Size the accumulation
    // launch from ACTIVE_CHANNELS and block2 so exactly one thread per output
    // bin is guaranteed for any channel count.
    dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block2.x - 1) / block2.x);

    GpuTimer gpu_timer;
    gpu_timer.Start();

    histogram_gmem_atomics::histogram_gmem_atomics<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid, block>>>(
        d_image,
        width,
        height,
        d_part_hist);

    histogram_gmem_atomics::histogram_gmem_accum<NUM_PARTS, ACTIVE_CHANNELS, NUM_BINS><<<grid2, block2>>>(
        d_part_hist,
        total_blocks,
        d_hist);

    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    cudaFree(d_part_hist);

    return elapsed_millis;
}

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/tune/Makefile:
--------------------------------------------------------------------------------
#/******************************************************************************
# * Copyright (c) 2011, Duane Merrill. All rights reserved.
# * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions are met:
# *     * Redistributions of source code must retain the above copyright
# *       notice, this list of conditions and the following disclaimer.
# *     * Redistributions in binary form must reproduce the above copyright
# *       notice, this list of conditions and the following disclaimer in the
# *       documentation and/or other materials provided with the distribution.
# *     * Neither the name of the NVIDIA CORPORATION nor the
# *       names of its contributors may be used to endorse or promote products
# *       derived from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# *
#******************************************************************************/

#-------------------------------------------------------------------------------
# Build script for project
#-------------------------------------------------------------------------------

NVCC = "$(shell which nvcc)"
NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))

# detect OS
OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])

#-------------------------------------------------------------------------------
# Libs
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
# Includes
#-------------------------------------------------------------------------------

INC = -I. -I.. -I../test

#-------------------------------------------------------------------------------
# Libs
#-------------------------------------------------------------------------------

LIBS += -lcudart

#-------------------------------------------------------------------------------
# Defines
#-------------------------------------------------------------------------------

DEFINES =

#-------------------------------------------------------------------------------
# SM Arch
#-------------------------------------------------------------------------------

# Target SM architecture; override on the command line with `make sm=350`.
ifdef sm
    SM_ARCH = $(sm)
else
    SM_ARCH = 200
endif

# Only one arch per tuning binary
ifeq (350, $(findstring 350, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_35
    SM_ARCH = 350
endif
ifeq (300, $(findstring 300, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_30
    SM_ARCH = 300
endif
ifeq (200, $(findstring 200, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_20
    SM_ARCH = 200
endif
ifeq (130, $(findstring 130, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_13
    SM_ARCH = 130
endif
ifeq (110, $(findstring 110, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_11
    SM_ARCH = 110
endif
ifeq (100, $(findstring 100, $(SM_ARCH)))
    SM_TARGETS = -arch=sm_10
    SM_ARCH = 100
endif


#-------------------------------------------------------------------------------
# Compiler Flags
#-------------------------------------------------------------------------------

NVCCFLAGS = -Xptxas -v -Xcudafe -\#

# Help the compiler/linker work with huge numbers of kernels on Windows
ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
    NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
endif

# 32/64-bit (32-bit device pointers by default)
ifeq ($(force32), 1)
    CPU_ARCH = -m32
    CPU_ARCH_SUFFIX = i386
else
    CPU_ARCH = -m64
    CPU_ARCH_SUFFIX = x86_64
endif

# CUDA ABI enable/disable (enabled by default)
ifneq ($(abi), 0)
    ABI_SUFFIX = abi
else
    NVCCFLAGS += -Xptxas -abi=no
    ABI_SUFFIX = noabi
endif

# NVVM/Open64 middle-end compiler (nvvm by default)
ifeq ($(open64), 1)
    NVCCFLAGS += -open64
    PTX_SUFFIX = open64
else
    PTX_SUFFIX = nvvm
endif

# Verbose toolchain output from nvcc
ifeq ($(verbose), 1)
    NVCCFLAGS += -v
endif

# Keep intermediate compilation artifacts
ifeq ($(keep), 1)
    NVCCFLAGS += -keep
endif

# Data type size to compile a schmoo binary for
ifdef tunesize
    TUNE_SIZE = $(tunesize)
else
    TUNE_SIZE = 4
endif


# Per-configuration binary suffix:
# <tune-size>B_sm<arch>_<middle-end>_<nvcc-version>_<abi>_<cpu-arch>
SUFFIX = $(TUNE_SIZE)B_sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CPU_ARCH_SUFFIX)

#-------------------------------------------------------------------------------
# Dependency Lists
#-------------------------------------------------------------------------------

# Recursive wildcard: $(call rwildcard,dir/,pattern) lists matches in all subdirs
rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))

DEPS =	./Makefile \
		../test/test_util.h \
		$(call rwildcard,../cub/,*.cuh)


#-------------------------------------------------------------------------------
# make default
#-------------------------------------------------------------------------------

# NOTE: default target intentionally builds nothing; invoke a named target.
default:


#-------------------------------------------------------------------------------
# make clean
#-------------------------------------------------------------------------------

clean :
	rm -f bin/*$(CPU_ARCH_SUFFIX)*
	rm -f *.i* *.cubin *.cu.c *.cudafe* *.fatbin.c *.ptx *.hash *.cu.cpp *.o



#-------------------------------------------------------------------------------
# make tune_device_reduce
#-------------------------------------------------------------------------------

tune_device_reduce: bin/tune_device_reduce_$(SUFFIX)

bin/tune_device_reduce_$(SUFFIX) : tune_device_reduce.cu $(DEPS)
	mkdir -p bin
	$(NVCC) $(DEFINES) $(SM_TARGETS) -o bin/tune_device_reduce_$(SUFFIX) tune_device_reduce.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -O3 -DTUNE_ARCH=$(SM_ARCH) -DTUNE_SIZE=$(TUNE_SIZE)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/cub/util_arch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Static architectural properties by SM version.
 */

#pragma once

#include "util_namespace.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

// Cooperative groups are available from CUDA 9 onward; allow the user to
// pre-define CUB_USE_COOPERATIVE_GROUPS to force the choice.
#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
    #define CUB_USE_COOPERATIVE_GROUPS
#endif

/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
#ifndef CUB_PTX_ARCH
    #ifndef __CUDA_ARCH__
        #define CUB_PTX_ARCH 0
    #else
        #define CUB_PTX_ARCH __CUDA_ARCH__
    #endif
#endif


/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
#ifndef CUB_RUNTIME_FUNCTION
    // Device-side runtime calls require sm_35+ with relocatable device code.
    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
        #define CUB_RUNTIME_ENABLED
        #define CUB_RUNTIME_FUNCTION __host__ __device__
    #else
        #define CUB_RUNTIME_FUNCTION __host__
    #endif
#endif


/// Number of threads per warp
#ifndef CUB_LOG_WARP_THREADS
    #define CUB_LOG_WARP_THREADS(arch)                      \
        (5)
    #define CUB_WARP_THREADS(arch)                          \
        (1 << CUB_LOG_WARP_THREADS(arch))

    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
#endif


/// Number of smem banks
#ifndef CUB_LOG_SMEM_BANKS
    #define CUB_LOG_SMEM_BANKS(arch)                        \
        ((arch >= 200) ?                                    \
            (5) :                                           \
            (4))
    #define CUB_SMEM_BANKS(arch)                            \
        (1 << CUB_LOG_SMEM_BANKS(arch))

    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
#endif


/// Oversubscription factor
#ifndef CUB_SUBSCRIPTION_FACTOR
    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
        ((arch >= 300) ?                                    \
            (5) :                                           \
            ((arch >= 200) ?                                \
                (3) :                                       \
                (10)))
    #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
#endif


/// Prefer padding overhead vs X-way conflicts greater than this threshold
#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
        ((arch >= 300) ?                                    \
            (1) :                                           \
            (4))
    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
#endif


/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data.  Minimum of two warps.
#ifndef CUB_SCALED_BLOCK_THREADS
    #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)             \
        (CUB_MIN(                                                                       \
            NOMINAL_4B_BLOCK_THREADS,                                                   \
            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                       \
                2,                                                                      \
                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
#endif

/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data.  Minimum 1 item per thread
#ifndef CUB_SCALED_ITEMS_PER_THREAD
    #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                     \
        CUB_MAX(                                                                                                                \
            1,                                                                                                                  \
            (sizeof(T) < 4) ?                                                                                                   \
                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 :  \
                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))
#endif

/// Define both nominal threads-per-block and items-per-thread
// NOTE: scaling here is pinned to the sm_20 configuration (PTX_ARCH 200).
#ifndef CUB_SCALED_GRANULARITIES
    #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)      \
        CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                 \
        CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
#endif



#endif  // Do not document

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)

--------------------------------------------------------------------------------
/bagua-core-internal/third_party/cub-1.8.0/experimental/histogram/histogram_smem_atomics.h:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | namespace histogram_smem_atomics 31 | { 32 | // Decode float4 pixel into bins 33 | template 34 | __device__ __forceinline__ void DecodePixel(float4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 35 | { 36 | float* samples = reinterpret_cast(&pixel); 37 | 38 | #pragma unroll 39 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 40 | bins[CHANNEL] = (unsigned int) (samples[CHANNEL] * float(NUM_BINS)); 41 | } 42 | 43 | // Decode uchar4 pixel into bins 44 | template 45 | __device__ __forceinline__ void DecodePixel(uchar4 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 46 | { 47 | unsigned char* samples = reinterpret_cast(&pixel); 48 | 49 | #pragma unroll 50 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 51 | bins[CHANNEL] = (unsigned int) (samples[CHANNEL]); 52 | } 53 | 54 | // Decode uchar1 pixel into bins 55 | template 56 | __device__ __forceinline__ void DecodePixel(uchar1 pixel, unsigned int (&bins)[ACTIVE_CHANNELS]) 57 | { 58 | bins[0] = (unsigned int) pixel.x; 59 | } 60 | 61 | // First-pass histogram kernel (binning into privatized counters) 62 | template < 63 | int NUM_PARTS, 64 | int ACTIVE_CHANNELS, 65 | int NUM_BINS, 66 | typename PixelType> 67 | __global__ void histogram_smem_atomics( 68 | const PixelType *in, 69 | int width, 70 | int height, 71 | unsigned int *out) 72 | { 73 | // global position and size 74 | int x = blockIdx.x * blockDim.x + threadIdx.x; 75 | int y = blockIdx.y * blockDim.y + threadIdx.y; 76 | int nx = blockDim.x * gridDim.x; 77 | int ny = blockDim.y * gridDim.y; 78 | 79 | // threads in workgroup 80 | int t = threadIdx.x + threadIdx.y * blockDim.x; // thread index in workgroup, linear in 0..nt-1 81 | int nt = blockDim.x * blockDim.y; // total threads in workgroup 82 | 83 | // group index in 0..ngroups-1 84 | int g = blockIdx.x + blockIdx.y * gridDim.x; 85 | 86 | // initialize smem 87 | __shared__ 
unsigned int smem[ACTIVE_CHANNELS * NUM_BINS + 3]; 88 | for (int i = t; i < ACTIVE_CHANNELS * NUM_BINS + 3; i += nt) 89 | smem[i] = 0; 90 | __syncthreads(); 91 | 92 | // process pixels 93 | // updates our group's partial histogram in smem 94 | for (int col = x; col < width; col += nx) 95 | { 96 | for (int row = y; row < height; row += ny) 97 | { 98 | PixelType pixel = in[row * width + col]; 99 | 100 | unsigned int bins[ACTIVE_CHANNELS]; 101 | DecodePixel(pixel, bins); 102 | 103 | #pragma unroll 104 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 105 | atomicAdd(&smem[(NUM_BINS * CHANNEL) + bins[CHANNEL] + CHANNEL], 1); 106 | } 107 | } 108 | 109 | __syncthreads(); 110 | 111 | // move to our workgroup's slice of output 112 | out += g * NUM_PARTS; 113 | 114 | // store local output to global 115 | for (int i = t; i < NUM_BINS; i += nt) 116 | { 117 | #pragma unroll 118 | for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) 119 | out[i + NUM_BINS * CHANNEL] = smem[i + NUM_BINS * CHANNEL + CHANNEL]; 120 | } 121 | } 122 | 123 | // Second pass histogram kernel (accumulation) 124 | template < 125 | int NUM_PARTS, 126 | int ACTIVE_CHANNELS, 127 | int NUM_BINS> 128 | __global__ void histogram_smem_accum( 129 | const unsigned int *in, 130 | int n, 131 | unsigned int *out) 132 | { 133 | int i = blockIdx.x * blockDim.x + threadIdx.x; 134 | if (i > ACTIVE_CHANNELS * NUM_BINS) return; // out of range 135 | unsigned int total = 0; 136 | for (int j = 0; j < n; j++) 137 | total += in[i + NUM_PARTS * j]; 138 | out[i] = total; 139 | } 140 | 141 | } // namespace histogram_smem_atomics 142 | 143 | 144 | template < 145 | int ACTIVE_CHANNELS, 146 | int NUM_BINS, 147 | typename PixelType> 148 | double run_smem_atomics( 149 | PixelType *d_image, 150 | int width, 151 | int height, 152 | unsigned int *d_hist, 153 | bool warmup) 154 | { 155 | enum 156 | { 157 | NUM_PARTS = 1024 158 | }; 159 | 160 | cudaDeviceProp props; 161 | cudaGetDeviceProperties(&props, 0); 162 | 163 | 
dim3 block(32, 4); 164 | dim3 grid(16, 16); 165 | int total_blocks = grid.x * grid.y; 166 | 167 | // allocate partial histogram 168 | unsigned int *d_part_hist; 169 | cudaMalloc(&d_part_hist, total_blocks * NUM_PARTS * sizeof(unsigned int)); 170 | 171 | dim3 block2(128); 172 | dim3 grid2((ACTIVE_CHANNELS * NUM_BINS + block.x - 1) / block.x); 173 | 174 | GpuTimer gpu_timer; 175 | gpu_timer.Start(); 176 | 177 | histogram_smem_atomics::histogram_smem_atomics<<>>( 178 | d_image, 179 | width, 180 | height, 181 | d_part_hist); 182 | 183 | histogram_smem_atomics::histogram_smem_accum<<>>( 184 | d_part_hist, 185 | total_blocks, 186 | d_hist); 187 | 188 | gpu_timer.Stop(); 189 | float elapsed_millis = gpu_timer.ElapsedMillis(); 190 | 191 | cudaFree(d_part_hist); 192 | 193 | return elapsed_millis; 194 | } 195 | 196 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/iterator/discard_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../util_namespace.cuh" 40 | #include "../util_macro.cuh" 41 | 42 | #if (THRUST_VERSION >= 100700) 43 | // This iterator is compatible with Thrust API 1.7 and newer 44 | #include 45 | #include 46 | #endif // THRUST_VERSION 47 | 48 | 49 | /// Optional outer namespace(s) 50 | CUB_NS_PREFIX 51 | 52 | /// CUB namespace 53 | namespace cub { 54 | 55 | 56 | /** 57 | * \addtogroup UtilIterator 58 | * @{ 59 | */ 60 | 61 | 62 | /** 63 | * \brief A discard iterator 64 | */ 65 | template 66 | class DiscardOutputIterator 67 | { 68 | public: 69 | 70 | // Required iterator traits 71 | typedef DiscardOutputIterator self_type; ///< My own type 72 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 73 | typedef void value_type; ///< The type of the element the iterator can point to 74 | typedef void pointer; ///< The type of a pointer to an element the iterator can point to 75 | typedef void reference; ///< The type of a reference to an element the iterator can point to 76 | 77 | #if (THRUST_VERSION >= 100700) 78 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 79 | typedef typename thrust::detail::iterator_facade_category< 80 | thrust::any_system_tag, 81 | thrust::random_access_traversal_tag, 82 | value_type, 83 | reference 84 | >::type iterator_category; ///< The iterator category 85 | #else 86 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 87 | #endif // THRUST_VERSION 88 | 89 | private: 90 | 91 | OffsetT offset; 92 | 93 | #if defined(_WIN32) || !defined(_WIN64) 94 | // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 95 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 
1))]; 96 | #endif 97 | 98 | public: 99 | 100 | /// Constructor 101 | __host__ __device__ __forceinline__ DiscardOutputIterator( 102 | OffsetT offset = 0) ///< Base offset 103 | : 104 | offset(offset) 105 | {} 106 | 107 | /// Postfix increment 108 | __host__ __device__ __forceinline__ self_type operator++(int) 109 | { 110 | self_type retval = *this; 111 | offset++; 112 | return retval; 113 | } 114 | 115 | /// Prefix increment 116 | __host__ __device__ __forceinline__ self_type operator++() 117 | { 118 | offset++; 119 | return *this; 120 | } 121 | 122 | /// Indirection 123 | __host__ __device__ __forceinline__ self_type& operator*() 124 | { 125 | // return self reference, which can be assigned to anything 126 | return *this; 127 | } 128 | 129 | /// Addition 130 | template 131 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 132 | { 133 | self_type retval(offset + n); 134 | return retval; 135 | } 136 | 137 | /// Addition assignment 138 | template 139 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 140 | { 141 | offset += n; 142 | return *this; 143 | } 144 | 145 | /// Subtraction 146 | template 147 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 148 | { 149 | self_type retval(offset - n); 150 | return retval; 151 | } 152 | 153 | /// Subtraction assignment 154 | template 155 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 156 | { 157 | offset -= n; 158 | return *this; 159 | } 160 | 161 | /// Distance 162 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 163 | { 164 | return offset - other.offset; 165 | } 166 | 167 | /// Array subscript 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator[](Distance n) 170 | { 171 | // return self reference, which can be assigned to anything 172 | return *this; 173 | } 174 | 175 | /// Structure dereference 176 | __host__ __device__ __forceinline__ pointer operator->() 177 | { 
178 | return; 179 | } 180 | 181 | /// Assignment to self (no-op) 182 | __host__ __device__ __forceinline__ void operator=(self_type const& other) 183 | { 184 | offset = other.offset; 185 | } 186 | 187 | /// Assignment to anything else (no-op) 188 | template 189 | __host__ __device__ __forceinline__ void operator=(T const&) 190 | {} 191 | 192 | /// Cast to void* operator 193 | __host__ __device__ __forceinline__ operator void*() const { return NULL; } 194 | 195 | /// Equal to 196 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 197 | { 198 | return (offset == rhs.offset); 199 | } 200 | 201 | /// Not equal to 202 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 203 | { 204 | return (offset != rhs.offset); 205 | } 206 | 207 | /// ostream operator 208 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 209 | { 210 | os << "[" << itr.offset << "]"; 211 | return os; 212 | } 213 | 214 | }; 215 | 216 | 217 | /** @} */ // end group UtilIterator 218 | 219 | } // CUB namespace 220 | CUB_NS_POSTFIX // Optional outer namespace(s) 221 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from distutils.errors import ( 3 | DistutilsPlatformError, 4 | ) 5 | from setuptools import setup, find_packages 6 | from setuptools_rust import Binding, RustExtension 7 | import sys 8 | import platform 9 | import shutil 10 | import sys 11 | import tempfile 12 | import urllib.request 13 | from tqdm import tqdm 14 | 15 | 16 | _nccl_records = [] 17 | library_records = {} 18 | 19 | 20 | class DownloadProgressBar(tqdm): 21 | def update_to(self, b=1, bsize=1, tsize=None): 22 | if tsize is not None: 23 | self.total = tsize 24 | self.update(b * bsize - self.n) 25 | 26 | 27 | def download_url(url, output_path): 28 | with DownloadProgressBar(unit='B', unit_scale=True, 29 | miniters=1, 
desc=url.split('/')[-1]) as t: 30 | urllib.request.urlretrieve( 31 | url, filename=output_path, reporthook=t.update_to) 32 | 33 | 34 | def _make_nccl_url(public_version, filename): 35 | # https://developer.download.nvidia.com/compute/redist/nccl/v2.8/nccl_2.8.4-1+cuda11.2_x86_64.txz 36 | return ( 37 | "https://developer.download.nvidia.com/compute/redist/nccl/" 38 | + "v{}/{}".format(public_version, filename) 39 | ) 40 | 41 | 42 | def _make_nccl_record(cuda_version, full_version, public_version, filename_linux): 43 | return { 44 | "cuda": cuda_version, 45 | "nccl": full_version, 46 | "assets": { 47 | "Linux": { 48 | "url": _make_nccl_url(public_version, filename_linux), 49 | "filename": "libnccl.so.{}".format(full_version), 50 | }, 51 | }, 52 | } 53 | 54 | 55 | _nccl_records.append( 56 | _make_nccl_record("11.4", "2.10.3", "2.10", 57 | "nccl_2.10.3-1+cuda11.4_x86_64.txz") 58 | ) 59 | _nccl_records.append( 60 | _make_nccl_record("11.3", "2.10.3", "2.10", 61 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 62 | ) 63 | _nccl_records.append( 64 | _make_nccl_record("11.2", "2.10.3", "2.10", 65 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 66 | ) 67 | _nccl_records.append( 68 | _make_nccl_record("11.1", "2.10.3", "2.10", 69 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 70 | ) 71 | _nccl_records.append( 72 | _make_nccl_record("11.0", "2.10.3", "2.10", 73 | "nccl_2.10.3-1+cuda11.0_x86_64.txz") 74 | ) 75 | _nccl_records.append( 76 | _make_nccl_record("10.2", "2.10.3", "2.10", 77 | "nccl_2.10.3-1+cuda10.2_x86_64.txz") 78 | ) 79 | _nccl_records.append( 80 | _make_nccl_record("10.1", "2.10.3", "2.10", 81 | "nccl_2.10.3-1+cuda10.2_x86_64.txz") 82 | ) 83 | library_records["nccl"] = _nccl_records 84 | 85 | 86 | def install_baguanet(url, destination): 87 | with tempfile.TemporaryDirectory() as tmpdir: 88 | filename = os.path.join(tmpdir, os.path.basename(url)) 89 | print("Downloading {}...".format(url)) 90 | download_url(url, filename) 91 | outdir = os.path.join(tmpdir, "extract") 92 | 
shutil.unpack_archive(filename, outdir) 93 | lib_dir = os.path.join(outdir, 'build') 94 | for filename in os.listdir(lib_dir): 95 | shutil.move(os.path.join(lib_dir, filename), destination) 96 | 97 | 98 | def install_lib(cuda, prefix, library): 99 | record = None 100 | lib_records = library_records 101 | for record in lib_records[library]: 102 | if record["cuda"] == cuda: 103 | break 104 | else: 105 | raise RuntimeError( 106 | """ 107 | The CUDA version({}) specified is not supported. 108 | Should be one of {}.""".format( 109 | cuda, str([x["cuda"] for x in lib_records[library]]) 110 | ) 111 | ) 112 | if prefix is None: 113 | prefix = os.path.expanduser("~/.bagua_core/cuda_lib") 114 | destination = calculate_destination(prefix, cuda, library, record[library]) 115 | 116 | if os.path.exists(destination): 117 | print("The destination directory {} already exists.".format(destination)) 118 | shutil.rmtree(destination) 119 | 120 | target_platform = platform.system() 121 | asset = record["assets"].get(target_platform, None) 122 | if asset is None: 123 | raise RuntimeError( 124 | """ 125 | The current platform ({}) is not supported.""".format( 126 | target_platform 127 | ) 128 | ) 129 | 130 | print( 131 | "Installing {} {} for CUDA {} to: {}".format( 132 | library, record[library], record["cuda"], destination 133 | ) 134 | ) 135 | 136 | url = asset["url"] 137 | print("Downloading {}...".format(url)) 138 | with tempfile.TemporaryDirectory() as tmpdir: 139 | filename = os.path.join(tmpdir, os.path.basename(url)) 140 | download_url(url, filename) 141 | print("Extracting...") 142 | outdir = os.path.join(tmpdir, "extract") 143 | shutil.unpack_archive(filename, outdir) 144 | print("Installing...") 145 | if library == "nccl": 146 | subdir = os.listdir(outdir) 147 | assert len(subdir) == 1 148 | shutil.move(os.path.join(outdir, subdir[0]), destination) 149 | 150 | # Install bagua-net 151 | dst_dir = os.path.join(destination, 'bagua-net') 152 | os.mkdir(dst_dir) 153 | 
install_baguanet( 154 | "https://github.com/BaguaSys/bagua-net/releases/download/v0.1.1/bagua-net_refs.tags.v0.1.1_x86_64.tar.gz", 155 | dst_dir) 156 | else: 157 | assert False 158 | print("Cleaning up...") 159 | print("Done!") 160 | 161 | 162 | def calculate_destination(prefix, cuda, lib, lib_ver): 163 | """Calculates the installation directory.""" 164 | return os.path.join(prefix, ".data") 165 | 166 | 167 | def check_torch_version(): 168 | try: 169 | import torch 170 | except ImportError: 171 | print("import torch failed, is it installed?") 172 | 173 | version = torch.__version__ 174 | if version is None: 175 | raise DistutilsPlatformError( 176 | "Unable to determine PyTorch version from the version string '%s'" 177 | % torch.__version__ 178 | ) 179 | return version 180 | 181 | 182 | def install_dependency_library(): 183 | nvcc_version = ( 184 | os.popen( 185 | "nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'" 186 | ) 187 | .read() 188 | .strip() 189 | ) 190 | print("nvcc_version: ", nvcc_version) 191 | install_lib(nvcc_version, os.path.join(cwd, "python/bagua_core"), "nccl") 192 | 193 | 194 | if __name__ == "__main__": 195 | import colorama 196 | colorama.init(autoreset=True) 197 | cwd = os.path.dirname(os.path.abspath(__file__)) 198 | 199 | if int(os.getenv("BAGUA_NO_INSTALL_DEPS", 0)) == 0 and \ 200 | len(sys.argv) > 1 and sys.argv[1] in ["install", "bdist_wheel"]: 201 | print( 202 | colorama.Fore.BLACK 203 | + colorama.Back.CYAN 204 | + "Bagua is automatically installing some system dependencies like NCCL, to disable set env variable BAGUA_NO_INSTALL_DEPS=1", 205 | ) 206 | install_dependency_library() 207 | 208 | setup( 209 | name="bagua-core", 210 | use_scm_version={"local_scheme": "no-local-version"}, 211 | setup_requires=["setuptools_scm"], 212 | url="https://github.com/BaguaSys/bagua-core", 213 | python_requires=">=3.6", 214 | description="Core communication lib for Bagua.", 215 | package_dir={"": "python/"}, 216 | 
packages=find_packages("python/"), 217 | package_data={"": [".data/lib/libnccl.so", 218 | ".data/bagua-net/libbagua_net.so", 219 | ".data/bagua-net/libnccl-net.so"]}, 220 | rust_extensions=[ 221 | RustExtension( 222 | "bagua_core.bagua_core", 223 | path="bagua-core-py/Cargo.toml", 224 | binding=Binding.PyO3, 225 | native=True, 226 | ) 227 | ], 228 | author="Kuaishou AI Platform & DS3 Lab", 229 | author_email="admin@mail.xrlian.com", 230 | install_requires=[ 231 | "setuptools_rust", 232 | "colorama", 233 | ], 234 | zip_safe=False, 235 | ) 236 | -------------------------------------------------------------------------------- /bagua-core-internal/third_party/cub-1.8.0/cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIteratorTto a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 
70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIteratorTto 74 | * dereference a sequence of incrementing integers. 75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename OffsetT = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) 
///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + (ValueType) n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += (ValueType) n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - (ValueType) n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return (difference_type) (val - other.val); 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + (ValueType) n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not 
equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | --------------------------------------------------------------------------------