├── .gitignore ├── Cargo.toml ├── README.md ├── benches └── shared_tensor.rs ├── crates ├── parenchyma-blas │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── src │ │ ├── extension_package │ │ │ ├── axpby.rs │ │ │ ├── level1.rs │ │ │ ├── level2.rs │ │ │ ├── level3.rs │ │ │ ├── mod.rs │ │ │ └── transpose.rs │ │ ├── frameworks │ │ │ ├── mod.rs │ │ │ ├── native │ │ │ │ └── mod.rs │ │ │ └── open_cl │ │ │ │ ├── implementation │ │ │ │ ├── level1.rs │ │ │ │ └── mod.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── package.rs │ │ │ │ └── source │ │ │ │ ├── common.cl │ │ │ │ ├── level1 │ │ │ │ ├── level1.cl │ │ │ │ ├── xasum.cl │ │ │ │ ├── xaxpy.cl │ │ │ │ ├── xcopy.cl │ │ │ │ ├── xdot.cl │ │ │ │ ├── xnrm2.cl │ │ │ │ ├── xscal.cl │ │ │ │ └── xswap.cl │ │ │ │ └── level3 │ │ │ │ ├── level3.cl │ │ │ │ ├── xgemm_direct_part1.cl │ │ │ │ ├── xgemm_direct_part2.cl │ │ │ │ └── xgemm_direct_part3.cl │ │ └── lib.rs │ └── tests │ │ └── blas_specs.rs ├── parenchyma-deep │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ ├── src │ │ ├── extension_package │ │ │ ├── backward.rs │ │ │ ├── configuration.rs │ │ │ ├── convolution.rs │ │ │ ├── forward.rs │ │ │ └── mod.rs │ │ ├── frameworks │ │ │ ├── mod.rs │ │ │ ├── native │ │ │ │ └── mod.rs │ │ │ └── open_cl │ │ │ │ ├── _build.rs │ │ │ │ ├── _mod.rs │ │ │ │ ├── mod.rs │ │ │ │ ├── package.rs │ │ │ │ └── source │ │ │ │ ├── activation.cl │ │ │ │ ├── activationBackward.cl │ │ │ │ ├── convolution.cl │ │ │ │ └── softmax.cl │ │ └── lib.rs │ └── tests │ │ └── deep_specs.rs ├── parenchyma-ml │ ├── .gitignore │ ├── Cargo.toml │ └── src │ │ ├── extension_package.rs │ │ ├── frameworks │ │ ├── mod.rs │ │ ├── native.rs │ │ └── open_cl.rs │ │ └── lib.rs └── parenchyma-tr │ ├── .gitignore │ ├── Cargo.toml │ ├── README.md │ └── src │ ├── lib.rs │ ├── transformer.rs │ └── transformers │ ├── audio.rs │ ├── image.rs │ ├── mod.rs │ └── word.rs ├── license ├── LICENSE-APACHE ├── LICENSE-MIT └── README.md ├── src ├── backend.rs ├── changelog.rs ├── compute_device.rs ├── context.rs ├── error.rs ├── extension_package.rs ├── framework.rs ├── frameworks │ ├── mod.rs │ ├── native │ │ ├── context.rs │ │ ├── device.rs │ │ ├── framework.rs │ │ ├── memory.rs │ │ └── mod.rs │ └── open_cl │ │ ├── context.rs │ │ ├── device.rs │ │ ├── error.rs │ │ ├── framework.rs │ │ ├── memory.rs │ │ └── mod.rs ├── hardware.rs ├── lib.rs ├── memory.rs └── tensor │ ├── into_tensor.rs │ ├── mod.rs │ ├── tensor_map.rs │ ├── tensor_memories.rs │ ├── tensor_shape.rs │ ├── tensor_type.rs │ └── utility.rs └── tests ├── backend_specs.rs ├── framework_native_specs.rs └── shared_memory_specs.rs /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "parenchyma" 3 | version = "0.0.4" 4 | authors = ["Jony "] 5 | keywords = ["backend", "computation", "opencl", "cuda", "hpc"] 6 | categories = ["science"] 7 | description = "A high-performance computing (HPC) framework" 8 | documentation = "https://docs.rs/parenchyma" 9 | repository = "https://github.com/lychee-eng/parenchyma" 10 | license = "MIT/Apache-2.0" 11 | 12 | [dependencies] 13 | # enum_primitive = "0.1.1" 14 | # futures = "0.1.11" 15 | # libloading = "0.3.2" 16 | log = "0.4" 17 | ndarray = "0.10.0" 18 | num = "0.2" 19 | ocl = "0.16.0" 20 | 21 | [dev-dependencies] 22 | # compiletest_rs = "0.2.5" 23 | lazy_static = 
"1.0.0" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # parenchyma 2 | 3 | [![Join the chat](https://badges.gitter.im/lychee-eng/parenchyma.svg)](https://gitter.im/lychee-eng/parenchyma) 4 | ![Project Status](https://img.shields.io/badge/status-pre--alpha-green.svg) 5 | [![](http://meritbadge.herokuapp.com/parenchyma)](https://crates.io/crates/parenchyma) 6 | [![License](https://img.shields.io/crates/l/parenchyma.svg)](#license) 7 | [![parenchyma](https://docs.rs/parenchyma/badge.svg)](https://docs.rs/parenchyma) 8 | 9 | Parenchyma started off as a hard fork of [Collenchyma][collenchyma-repo] (hence the name), an 10 | extensible HPC framework developed by the [Autumn team] as well as an amazing group 11 | of [contributors][collenchyma-contributors]. Aside from the name and overall design, the two 12 | libraries are quite dissimilar to each other (e.g., auto-sync (thanks 13 | to [@alexandermorozov](/../../issues/2)), async transfers, the fallback mechanism, etc.). Therefore, before migrating 14 | over, one should go through the documentation carefully as to not make the mistake of misusing 15 | the framework. Not doing so may result in unintended behavior for which Parenchyma 16 | developers/contributors are not responsible. 17 | 18 | Many of the original comments used for documentation purposes remain in the code base along with 19 | a few necessary additions/modifications. 20 | 21 | > Disclaimer: Parenchyma is currently undergoing extensive refactoring and improvement. Therefore, 22 | > it is likely that many of the features available in the original Collenchyma project may not yet 23 | > be available in the Parenchyma project. It is also likely that certain features may never be 24 | > available in the Parenchyma project, as the different approaches that are currently being 25 | > considered may prove to be better than the original approach. 26 | 27 | ### Tensor creation 28 | 29 | The easiest way to create a tensor is to use the `array` macro: 30 | 31 | ```rust 32 | #[macro_use(array)] 33 | extern crate parenchyma; 34 | 35 | use parenchyma::prelude::*; 36 | 37 | let t: SharedTensor = array![ 38 | [ 39 | [1,2,3], 40 | [4,5,6] 41 | ], 42 | [ 43 | [11,22,33], 44 | [44,55,66] 45 | ], 46 | [ 47 | [111,222,333], 48 | [444,555,666] 49 | ], 50 | [ 51 | [1111,2222,3333], 52 | [4444,5555,6666] 53 | ] 54 | ].into(); 55 | 56 | println!("{:?}", t); 57 | 58 | // shape=[4, 2, 3], strides=[6, 3, 1], layout=C (0x1), type=i32 59 | // 60 | // [[[1, 2, 3], 61 | // [4, 5, 6]], 62 | // [[11, 22, 33], 63 | // [44, 55, 66]], 64 | // [[111, 222, 333], 65 | // [444, 555, 666]], 66 | // [[1111, 2222, 3333], 67 | // [4444, 5555, 6666]]] 68 | ``` 69 | 70 | ### Synchronizing Data 71 | 72 | Synchronizing data across multiple compute devices and backends is straightforward. 
73 | 
74 | ```rust
75 | #[macro_use(array)]
76 | extern crate parenchyma;
77 | 
78 | use parenchyma::prelude::*;
79 | 
80 | let ref cuda: Backend = Backend::new::<Cuda>()?;
81 | 
82 | let t: SharedTensor = array![[1.5, 2.3, 3.7], [4.8, 5.2, 6.9]].into();
83 | 
84 | t.synchronize(cuda)?;
85 | ```
86 | 
87 | ## License
88 | 
89 | Dual licensed under
90 | * Apache License, Version 2.0 ([LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0)
91 | * MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT)
92 | 
93 | [Autumn team]: https://github.com/autumnai
94 | [collenchyma-repo]: https://github.com/autumnai/collenchyma
95 | [collenchyma-contributors]: https://github.com/autumnai/collenchyma/graphs/contributors
96 | [LICENSE-APACHE]: ../../../license/blob/master/LICENSE-APACHE
97 | [LICENSE-MIT]: ../../../license/blob/master/LICENSE-MIT -------------------------------------------------------------------------------- /benches/shared_tensor.rs: --------------------------------------------------------------------------------
1 | #![feature(test)]
2 | 
3 | extern crate parenchyma;
4 | extern crate test;
5 | 
6 | use parenchyma::{Backend, Native, OpenCL, SharedTensor};
7 | use test::Bencher;
8 | 
9 | fn native_backend() -> Backend {
10 | Backend::new::<Native>().unwrap()
11 | }
12 | 
13 | fn opencl_backend() -> Backend {
14 | Backend::new::<OpenCL>().unwrap()
15 | }
16 | 
17 | fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) {
18 | 
19 | let mem = &mut SharedTensor::<f32>::new(s);
20 | 
21 | // initialize and warm-up
22 | let _ = mem.write(&backend2).unwrap();
23 | let _ = mem.read_write(&backend1).unwrap();
24 | let _ = mem.read_write(&backend2).unwrap();
25 | 
26 | b.bytes = s as u64 * 2; // we do two transfers per iteration
27 | 
28 | b.iter(|| {
29 | let _ = mem.read_write(&backend1).unwrap();
30 | let _ = mem.read_write(&backend2).unwrap();
31 | });
32 | }
33 | 
34 | fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) {
35 | 
36 | let mem = &mut SharedTensor::<f32>::new(size);
37 | 
38 | // initialize and warm-up
39 | let _ = mem.write(&src).unwrap();
40 | let _ = mem.read(&dst).unwrap();
41 | 
42 | b.bytes = size as u64;
43 | 
44 | b.iter(|| {
45 | let _ = mem.write(&src).unwrap();
46 | let _ = mem.read(&dst).unwrap();
47 | });
48 | }
49 | 
50 | // #[inline(never)]
51 | // fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
52 | // b.iter(||
53 | // for _ in 0..256 {
54 | // let _ = device.allocate_memory(size).unwrap(); });
55 | // }
56 | 
57 | // // #[bench]
58 | // // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
59 | // // let opencl_backend = opencl_backend();
60 | // // let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
61 | 
62 | // // bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
63 | // // }
64 | 
65 | // // #[bench]
66 | // // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
67 | // // let opencl_backend = opencl_backend();
68 | // // let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
69 | 
70 | // // bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
71 | // // }
72 | 
73 | // #[bench]
74 | // fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
75 | // let opencl_backend = opencl_backend();
76 | // let ref d = opencl_backend.devices()[0];
77 | 
78 | // bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
79 | // }
80 | 
81 | #[bench]
82 | fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) {
83 | sync_back_and_forth(b, 
opencl_backend(), native_backend(), 1024);
84 | }
85 | 
86 | #[bench]
87 | fn bench_sync_1kb_native_to_opencl(b: &mut Bencher) {
88 | unidirectional_sync(b, native_backend(), opencl_backend(), 1024);
89 | }
90 | 
91 | #[bench]
92 | fn bench_sync_1kb_opencl_to_native(b: &mut Bencher) {
93 | unidirectional_sync(b, opencl_backend(), native_backend(), 1024);
94 | }
95 | 
96 | #[bench]
97 | fn bench_sync_1mb_native_opencl_back_and_forth(b: &mut Bencher) {
98 | sync_back_and_forth(b, opencl_backend(), native_backend(), 1_048_576);
99 | }
100 | 
101 | #[bench]
102 | fn bench_sync_1mb_native_to_opencl(b: &mut Bencher) {
103 | unidirectional_sync(b, native_backend(), opencl_backend(), 1_048_576);
104 | }
105 | 
106 | #[bench]
107 | fn bench_sync_1mb_opencl_to_native(b: &mut Bencher) {
108 | unidirectional_sync(b, opencl_backend(), native_backend(), 1_048_576);
109 | }
110 | 
111 | #[bench]
112 | fn bench_sync_128mb_native_opencl_back_and_forth(b: &mut Bencher) {
113 | sync_back_and_forth(b, opencl_backend(), native_backend(), 128 * 1_048_576);
114 | }
115 | 
116 | #[bench]
117 | fn bench_sync_128mb_native_to_opencl(b: &mut Bencher) {
118 | unidirectional_sync(b, native_backend(), opencl_backend(), 128 * 1_048_576);
119 | }
120 | 
121 | #[bench]
122 | fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) {
123 | unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576);
124 | }
125 | 
126 | // // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
127 | 
128 | // // let native_backend = native_backend();
129 | // // let ref native_cpu = native_backend.devices()[0];
130 | 
131 | // // let mut x = SharedTensor::<f32>::from(vec![128]);
132 | // // x.write_only(native_cpu).unwrap();
133 | // // x.write_only(device).unwrap();
134 | // // x.read(native_cpu).unwrap();
135 | 
136 | // // b.iter(|| {
137 | // // let _ = x.read(native_cpu).unwrap();
138 | // // })
139 | // // }
140 | 
141 | // // #[bench]
142 | // // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
143 | // // let opencl_backend = opencl_backend();
144 | // // let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
145 | 
146 | // // bench_shared_tensor_access_time_first_(b, opencl_cpu);
147 | // // }
148 | 
149 | // // #[bench]
150 | // // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
151 | // // let opencl_backend = opencl_backend();
152 | // // let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
153 | 
154 | // // bench_shared_tensor_access_time_first_(b, opencl_gpu);
155 | // // } -------------------------------------------------------------------------------- /crates/parenchyma-blas/.gitignore: --------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | -------------------------------------------------------------------------------- /crates/parenchyma-blas/Cargo.toml: --------------------------------------------------------------------------------
1 | [package]
2 | name = "parenchyma-blas"
3 | version = "0.0.1"
4 | authors = ["Jony "]
5 | license = "MIT/Apache-2.0"
6 | 
7 | [dependencies]
8 | ocl = "0.16.0"
9 | rblas = "0.0.13"
10 | 
11 | [dependencies.parenchyma]
12 | path = "../../"
13 | version = "0.0.4"
14 | 
15 | [dev-dependencies]
16 | lazy_static = "1.1.0" -------------------------------------------------------------------------------- /crates/parenchyma-blas/README.md: 
--------------------------------------------------------------------------------
1 | # parenchyma-blas
2 | 
3 | This package provides full BLAS (Basic Linear Algebra Subprograms) support for Parenchyma, so you
4 | can use BLAS on servers, desktops or mobiles, GPUs, FPGAs or CPUs, without worrying about OpenCL or
5 | CUDA support on the machine.
6 | 
7 | ## Provided Operations
8 | 
9 | This package provides the following operations to Parenchyma backends:
10 | 
11 | | Operation | CUDA (cuBLAS) | OpenCL | Native (rblas) |
12 | |--- |--- |--- |--- |
13 | | Level 1 | (collenchyma) | ✓ | ✓ |
14 | | Level 2 | - | - | - |
15 | | Level 3 | (collenchyma) | (some) | (some) | -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/axpby.rs: --------------------------------------------------------------------------------
1 | use parenchyma::error::Result;
2 | use parenchyma::tensor::SharedTensor;
3 | 
4 | /// Extends `Vector` with the axpby operation.
5 | pub trait Axpby: super::Vector {
6 | /// Performs the operation y := a*x + b*y.
7 | ///
8 | /// Consists of a scal(b, y) followed by an axpy(a, x, y).
9 | fn axpby(&self, a: &SharedTensor, x: &SharedTensor, b: &SharedTensor, y: &mut SharedTensor) -> Result {
10 | self.scal(b, y)?;
11 | self.axpy(a, x, y)?;
12 | Ok(())
13 | }
14 | }
15 | 
16 | impl<A> Axpby for A where A: super::Vector {
17 | // ..
18 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/level1.rs: --------------------------------------------------------------------------------
1 | use parenchyma::error::Result;
2 | use parenchyma::tensor::SharedTensor;
3 | 
4 | /// `Vector` consists of level 1 BLAS routines - vector operations on strided arrays.
5 | pub trait Vector {
6 | /// Provides the asum operation.
7 | ///
8 | /// Computes the sum of the absolute values of the elements of `x`, and then saves the `result`.
9 | fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
10 | unimplemented!("asum")
11 | }
12 | /// Provides the axpy operation.
13 | ///
14 | /// Computes a vector `x` times a constant `a` plus a vector `y` (i.e., `a * x + y`), and then
15 | /// saves the result to `y`.
16 | fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result {
17 | unimplemented!("axpy")
18 | }
19 | /// Provides the copy operation.
20 | ///
21 | /// Copies `from.len()` elements of vector `from` into vector `to`.
22 | fn copy(&self, from: &SharedTensor, to: &mut SharedTensor) -> Result {
23 | unimplemented!("copy")
24 | }
25 | /// Provides the dot operation.
26 | ///
27 | /// Computes the dot product over `x` and `y`, and then saves the `result`.
28 | fn dot(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result {
29 | unimplemented!("dot")
30 | }
31 | /// Provides the nrm2 operation.
32 | ///
33 | /// Computes the L2 norm (i.e., the Euclidean length of vector `x`), and then saves the `result`.
34 | fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
35 | unimplemented!("nrm2")
36 | }
37 | /// Provides the scal operation.
38 | ///
39 | /// Scales a vector `x` by a constant `a` (i.e., `a * x`).
40 | fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result {
41 | unimplemented!("scal")
42 | }
43 | /// Provides the swap operation.
44 | ///
45 | /// Swaps the elements of vector `x` and vector `y`.
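///
/// A minimal sketch of the intended call pattern (hypothetical setup: assumes a
/// `backend` whose extension package supplies these vector routines, and two
/// equally-sized tensors `x` and `y`):
///
/// ```ignore
/// backend.swap(&mut x, &mut y)?; // afterwards, `x` and `y` hold each other's data
/// ```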
46 | fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result {
47 | unimplemented!("swap")
48 | }
49 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/level2.rs: --------------------------------------------------------------------------------
1 | /// `MatrixVector` consists of level 2 BLAS routines - a generalized matrix-vector multiplication
2 | /// and more.
3 | pub trait MatrixVector { } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/level3.rs: --------------------------------------------------------------------------------
1 | use parenchyma::error::Result;
2 | use parenchyma::tensor::SharedTensor;
3 | 
4 | use super::Transposition;
5 | 
6 | // pub struct View<'a>(&'a Array);
7 | // pub struct ViewMut<'a>(&'a mut Array);
8 | // impl<'a> Matrix for View<'a> {
9 | // fn rows(&self) -> i32 {
10 | // self.0.rows()
11 | // }
12 | // fn cols(&self) -> i32 {
13 | // self.0.cols()
14 | // }
15 | // fn as_ptr(&self) -> *const f32 {
16 | // unimplemented!()
17 | // }
18 | // fn as_mut_ptr(&self) -> *mut f32 {
19 | // unimplemented!()
20 | // }
21 | // }
22 | 
23 | pub struct GenericMatrix<'a> {
24 | /// The factor of matrix A (scalar).
25 | pub scalar: &'a SharedTensor,
26 | /// Buffer object storing matrix A.
27 | pub matrix: &'a SharedTensor,
28 | /// How matrix A is to be transposed.
29 | pub transposition: Transposition,
30 | }
31 | 
32 | /// The trait `Matrix` consists of level 3 BLAS routines - matrix-matrix operations, including a
33 | /// general matrix multiplication.
34 | pub trait Matrix {
35 | /// Computes a matrix-matrix product with general matrices.
36 | ///
37 | /// # Arguments
38 | ///
39 | /// * `alpha` - The factor of matrix A (scalar).
40 | /// * `amatrix_transposition` - How matrix A is to be transposed.
41 | /// * `amatrix` - The buffer object storing matrix A.
/// * `bmatrix_transposition` - How matrix B is to be transposed.
/// * `bmatrix` - The buffer object storing matrix B.
/// * `beta` - The factor of matrix C (scalar).
/// * `cmatrix` - The buffer object storing matrix C (overwritten with the result).
42 | fn gemm(
43 | self: &Self,
44 | alpha: &SharedTensor,
45 | amatrix_transposition: Transposition,
46 | amatrix: &SharedTensor,
47 | bmatrix_transposition: Transposition,
48 | bmatrix: &SharedTensor,
49 | beta: &SharedTensor,
50 | cmatrix: &mut SharedTensor) -> Result {
51 | unimplemented!()
52 | }
53 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/mod.rs: --------------------------------------------------------------------------------
1 | pub use self::axpby::Axpby;
2 | pub use self::level1::Vector;
3 | pub use self::level2::MatrixVector;
4 | pub use self::level3::{GenericMatrix, Matrix};
5 | pub use self::transpose::Transposition;
6 | 
7 | mod axpby;
8 | mod level1;
9 | mod level2;
10 | mod level3;
11 | mod transpose;
12 | 
13 | use parenchyma::extension_package::ExtensionPackage;
14 | 
15 | /// The BLAS package.
16 | pub enum Package {
17 | Native,
18 | OpenCL(::frameworks::open_cl::OpenCLPackage),
19 | }
20 | 
21 | impl Package {
22 | pub fn open_cl(&self) -> &::frameworks::open_cl::OpenCLPackage {
23 | if let &Package::OpenCL(ref package) = self {
24 | package
25 | } else {
26 | panic!("an Open CL package was expected, but another package was found.")
27 | }
28 | }
29 | }
30 | 
31 | /// Provides level 1, 2, and 3 BLAS operations.
32 | ///
33 | /// **note**: should be replaced with an actual trait alias ([RFC#1733]).
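/// (With trait aliases accepted, this could be written as the hypothetical alias
/// `trait Extension = Axpby + Vector + MatrixVector + Matrix;`.)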
34 | ///
35 | /// [RFC#1733]: https://github.com/rust-lang/rfcs/pull/1733
36 | pub trait Extension: Axpby + Vector + MatrixVector + Matrix { }
37 | 
38 | impl ExtensionPackage for Package {
39 | type Extension = Extension;
40 | 
41 | fn package_name(&self) -> &'static str {
42 | return "parenchyma/blas";
43 | }
44 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/extension_package/transpose.rs: --------------------------------------------------------------------------------
1 | use rblas;
2 | 
3 | /// Possible transpose operations that can be applied in Level 2 and Level 3 BLAS operations.
4 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
5 | pub enum Transposition {
6 | /// Take the conjugate transpose of the matrix.
7 | ConjugateTranspose,
8 | /// Take the matrix as it is.
9 | NoTranspose,
10 | /// Take the transpose of the matrix.
11 | Transpose,
12 | }
13 | 
14 | impl Into<rblas::attribute::Transpose> for Transposition {
15 | /// Converts a `Transposition` to an rblas `Transpose`.
16 | fn into(self) -> rblas::attribute::Transpose {
17 | match self {
18 | Transposition::ConjugateTranspose => rblas::attribute::Transpose::ConjTrans,
19 | Transposition::NoTranspose => rblas::attribute::Transpose::NoTrans,
20 | Transposition::Transpose => rblas::attribute::Transpose::Trans,
21 | }
22 | }
23 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/mod.rs: --------------------------------------------------------------------------------
1 | pub mod native;
2 | pub mod open_cl; -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/native/mod.rs: --------------------------------------------------------------------------------
1 | use parenchyma::error::{Error, ErrorKind, Result};
2 | use parenchyma::extension_package::Dependency;
3 | use parenchyma::frameworks::NativeContext as Context;
4 | use parenchyma::tensor::SharedTensor;
5 | 
6 | use rblas;
7 | use rblas::math::mat::Mat;
8 | use rblas::matrix::Matrix as IMatrix;
9 | 
10 | use super::super::{Extension, Package, Transposition};
11 | use super::super::extension_package::{Matrix, MatrixVector, Vector};
12 | 
13 | impl
<P> Extension for Context<P> where P: Dependency { } 14 | 15 | impl<P> Vector for Context<P>
where P: Dependency { 16 | fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 17 | result.as_mut_slice_unsynched()?[0] = rblas::Asum::asum(x.as_slice()?); 18 | Ok(()) 19 | } 20 | 21 | fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result { 22 | Ok(rblas::Axpy::axpy( 23 | a.as_slice()?.get(0) 24 | .ok_or_else(|| Error::new(ErrorKind::Other, "Index out of bounds"))?, 25 | x.as_slice()?, 26 | y.as_mut_slice()? 27 | )) 28 | } 29 | 30 | fn copy(&self, from: &SharedTensor, to: &mut SharedTensor) -> Result { 31 | Ok(rblas::Copy::copy( 32 | from.as_slice()?, to.as_mut_slice_unsynched()?)) 33 | } 34 | 35 | fn dot(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result { 36 | result.as_mut_slice_unsynched()?[0] = rblas::Dot::dot(x.as_slice()?, y.as_slice()?); 37 | Ok(()) 38 | } 39 | 40 | fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 41 | result.as_mut_slice_unsynched()?[0] = rblas::Nrm2::nrm2(x.as_slice()?); 42 | Ok(()) 43 | } 44 | 45 | fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result { 46 | Ok(rblas::Scal::scal( 47 | a.as_slice()?.get(0) 48 | .ok_or_else(|| Error::new(ErrorKind::Other, "Index out of bounds"))?, 49 | x.as_mut_slice()? 50 | )) 51 | } 52 | 53 | fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result { 54 | Ok(rblas::Swap::swap(x.as_mut_slice()?, y.as_mut_slice()?)) 55 | } 56 | } 57 | 58 | impl
<P> Matrix for Context<P>
where P: Dependency { 59 | fn gemm( 60 | self: &Self, 61 | alpha: &SharedTensor, 62 | amatrix_transposition: Transposition, 63 | amatrix: &SharedTensor, 64 | bmatrix_transposition: Transposition, 65 | bmatrix: &SharedTensor, 66 | beta: &SharedTensor, 67 | cmatrix: &mut SharedTensor) -> Result { 68 | 69 | let a_0 = amatrix.shape().dimensions()[0] as i32; 70 | let a_1 = amatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32; 71 | 72 | let b_0 = bmatrix.shape().dimensions()[0] as i32; 73 | let b_1 = bmatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32; 74 | 75 | let c_0 = cmatrix.shape().dimensions()[0] as i32; 76 | let c_1 = cmatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32; 77 | 78 | let input = as_matrix(amatrix.as_slice()?, a_0 as usize, a_1 as usize); 79 | let weights = as_matrix(bmatrix.as_slice()?, b_0 as usize, b_1 as usize); 80 | let mut output = as_matrix(cmatrix.as_slice()?, c_0 as usize, c_1 as usize); 81 | 82 | rblas::Gemm::gemm( 83 | &alpha.as_slice()?[0], 84 | amatrix_transposition.into(), 85 | &input, 86 | 87 | bmatrix_transposition.into(), 88 | &weights, 89 | &beta.as_slice()?[0], 90 | 91 | &mut output 92 | ); 93 | 94 | read_from_matrix(&output, cmatrix.as_mut_slice()?); 95 | 96 | Ok(()) 97 | } 98 | } 99 | 100 | fn as_matrix(slice: &[f32], nrows: usize, ncols: usize) -> Mat { 101 | let mut mat: Mat = Mat::new(nrows, ncols); 102 | 103 | for i in 0..nrows { 104 | for j in 0..ncols { 105 | let index = ncols * i + j; 106 | unsafe { 107 | *mat.as_mut_ptr().offset(index as isize) = slice[index].clone(); 108 | } 109 | } 110 | } 111 | 112 | mat 113 | } 114 | 115 | fn read_from_matrix(mat: &Mat, slice: &mut [f32]) { 116 | let n = mat.rows(); 117 | let m = mat.cols(); 118 | for i in 0..n { 119 | for j in 0..m { 120 | let index = m * i + j; 121 | slice[index] = mat[i][j].clone(); 122 | } 123 | } 124 | } 125 | 126 | impl
<P> MatrixVector for Context<P>
where P: Dependency { } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/implementation/level1.rs: --------------------------------------------------------------------------------
1 | use parenchyma;
2 | use parenchyma::open_cl::OpenCLContext;
3 | use parenchyma::{Context, SharedTensor};
4 | use parenchyma::error::Result;
5 | 
6 | use extension::Vector;
7 | 
8 | impl Vector for OpenCLContext {
9 | 
10 | fn asum(
11 | &self,
12 | x: &SharedTensor,
13 | result: &mut SharedTensor) -> Result {
14 | 
15 | unimplemented!()
16 | }
17 | 
18 | fn axpy(
19 | &self,
20 | a: &SharedTensor,
21 | x: &SharedTensor,
22 | y: &mut SharedTensor) -> Result {
23 | 
24 | let kernel: ::ocl::Kernel = unimplemented!();
25 | 
26 | let n = x.shape().capacity;
27 | 
28 | let alpha = parenchyma::tensor(self, a)?;
29 | let x = parenchyma::tensor(self, x)?;
30 | let y = parenchyma::tensor_mut(self, y)?;
31 | 
32 | let offset = 0;
33 | let inc = 1;
34 | 
35 | kernel
36 | .arg_scl(n)
37 | .arg_buf(alpha)
38 | .arg_buf(x).arg_scl(offset).arg_scl(inc)
39 | .arg_buf(y).arg_scl(offset).arg_scl(inc)
40 | // //.gwo(..)
41 | // .gws([WGS, 1, 1])
42 | // .lws([WGS, 1, 1])
43 | // // todo The queue must be associated with a device associated with the kernel's program.
44 | .queue(self.active_direct().queue().clone())
45 | .enq()?;
46 | 
47 | 
48 | Ok(())
49 | }
50 | 
51 | fn copy(
52 | &self,
53 | from: &SharedTensor,
54 | to: &mut SharedTensor) -> Result {
55 | 
56 | unimplemented!()
57 | }
58 | 
59 | fn dot(
60 | &self,
61 | x: &SharedTensor,
62 | y: &SharedTensor,
63 | result: &mut SharedTensor) -> Result {
64 | 
65 | unimplemented!()
66 | }
67 | 
68 | fn nrm2(
69 | &self,
70 | x: &SharedTensor,
71 | result: &mut SharedTensor) -> Result {
72 | 
73 | unimplemented!()
74 | }
75 | 
76 | fn scal(
77 | &self,
78 | a: &SharedTensor,
79 | x: &mut SharedTensor) -> Result {
80 | 
81 | unimplemented!()
82 | }
83 | 
84 | fn swap(
85 | &self,
86 | x: &mut SharedTensor,
87 | y: &mut SharedTensor) -> Result {
88 | 
89 | unimplemented!()
90 | }
91 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/implementation/mod.rs: --------------------------------------------------------------------------------
1 | mod level1; -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/package.rs: --------------------------------------------------------------------------------
1 | use ocl;
2 | use std::ffi::CString;
3 | use parenchyma::error::Result;
4 | use parenchyma::frameworks::OpenCLContext;
5 | 
6 | // const WGS: usize = 64;
7 | // const WGS1: usize = 64;
8 | // const WGS2: usize = 64;
9 | 
10 | // /// Caches instances of `Kernel`
11 | // #[derive(Debug)]
12 | // pub struct OpenCLPackage {
13 | // pub(in super) program: ocl::Program,
14 | // asum: [ocl::Kernel; 2],
15 | // pub(in super) axpy: ocl::Kernel,
16 | // copy: ocl::Kernel,
17 | // dot: [ocl::Kernel; 2],
18 | // nrm2: [ocl::Kernel; 2],
19 | // scal: ocl::Kernel,
20 | // swap: ocl::Kernel,
21 | 
22 | // gemm_direct: Gemm,
23 | // }
24 | 
25 | // #[derive(Debug)]
26 | // pub struct Gemm {
27 | // tt: ocl::Kernel,
28 | // tn: ocl::Kernel,
29 | // nt: ocl::Kernel,
30 | // nn: ocl::Kernel,
31 | // }
32 | 
33 | /// Caches instances of `Kernel`
34 | #[derive(Debug)]
35 | pub struct OpenCLPackage {
36 | pub(in frameworks::open_cl) program: ocl::Program,
37 | }
38 | 
39 | impl OpenCLPackage {
40 | 
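/// Compiles the bundled CLBlast kernel sources into a single `ocl::Program` for the
/// given context. Named kernels (e.g. "Xaxpy") can then be created from the returned
/// package's program, as sketched in the commented-out code below.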
pub fn compile(cx: &mut OpenCLContext<()>) -> Result<Self> {
41 | let program = cx.program(vec![
42 | CString::new(include_str!("source/common.cl")).unwrap(),
43 | 
44 | CString::new(include_str!("source/level1/level1.cl")).unwrap(),
45 | CString::new(include_str!("source/level1/xasum.cl")).unwrap(),
46 | CString::new(include_str!("source/level1/xaxpy.cl")).unwrap(),
47 | CString::new(include_str!("source/level1/xcopy.cl")).unwrap(),
48 | CString::new(include_str!("source/level1/xdot.cl")).unwrap(),
49 | CString::new(include_str!("source/level1/xnrm2.cl")).unwrap(),
50 | CString::new(include_str!("source/level1/xscal.cl")).unwrap(),
51 | CString::new(include_str!("source/level1/xswap.cl")).unwrap(),
52 | 
53 | CString::new(include_str!("source/level3/level3.cl")).unwrap(),
54 | CString::new(include_str!("source/level3/xgemm_direct_part1.cl")).unwrap(),
55 | CString::new(include_str!("source/level3/xgemm_direct_part2.cl")).unwrap(),
56 | CString::new(include_str!("source/level3/xgemm_direct_part3.cl")).unwrap(),
57 | ])?;
58 | 
59 | // Ok(OpenCLPackage {
60 | // asum: [ocl::Kernel::new("Xasum", &program)?, ocl::Kernel::new("XasumEpilogue", &program)?],
61 | // axpy: ocl::Kernel::new("Xaxpy", &program)?,
62 | // copy: ocl::Kernel::new("Xcopy", &program)?,
63 | // dot: [ocl::Kernel::new("Xdot", &program)?, ocl::Kernel::new("XdotEpilogue", &program)?],
64 | // nrm2: [ocl::Kernel::new("Xnrm2", &program)?, ocl::Kernel::new("Xnrm2Epilogue", &program)?],
65 | // scal: ocl::Kernel::new("Xscal", &program)?,
66 | // swap: ocl::Kernel::new("Xswap", &program)?,
67 | 
68 | // gemm_direct: Gemm {
69 | // tt: ocl::Kernel::new("XgemmDirectTT", &program)?,
70 | // tn: ocl::Kernel::new("XgemmDirectTN", &program)?,
71 | // nt: ocl::Kernel::new("XgemmDirectNT", &program)?,
72 | // nn: ocl::Kernel::new("XgemmDirectNN", &program)?,
73 | // },
74 | 
75 | // program,
76 | // })
77 | 
78 | Ok(OpenCLPackage { program })
79 | }
80 | } -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/common.cl: --------------------------------------------------------------------------------
1 | 
2 | // =================================================================================================
3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 | // width of 100 characters per line.
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
11 | //
12 | // =================================================================================================
13 | 
14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
15 | // literal). Comment-out this line for syntax-highlighting when developing.
16 | 
17 | // =================================================================================================
18 | 
19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
20 | // this file is used outside of the CLBlast library.
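// (An assumption about typical use: the host normally injects these through OpenCL
// program build options, e.g. "-D PRECISION=64", in which case the #ifndef fallbacks
// below are skipped.)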
21 | #ifndef PRECISION
22 | #define PRECISION 32 // Data-types: half, single or double precision, complex or regular
23 | #endif
24 | 
25 | // =================================================================================================
26 | 
27 | // Enable support for half-precision
28 | #if PRECISION == 16
29 | #pragma OPENCL EXTENSION cl_khr_fp16: enable
30 | #endif
31 | 
32 | // Enable support for double-precision
33 | #if PRECISION == 64 || PRECISION == 6464
34 | #if __OPENCL_VERSION__ <= CL_VERSION_1_1
35 | #pragma OPENCL EXTENSION cl_khr_fp64: enable
36 | #endif
37 | #endif
38 | 
39 | // Half-precision
40 | #if PRECISION == 16
41 | typedef half real;
42 | typedef half2 real2;
43 | typedef half4 real4;
44 | typedef half8 real8;
45 | typedef half16 real16;
46 | #define ZERO 0
47 | #define ONE 1
48 | #define SMALLEST -1.0e14
49 | 
50 | // Single-precision
51 | #elif PRECISION == 32
52 | typedef float real;
53 | typedef float2 real2;
54 | typedef float4 real4;
55 | typedef float8 real8;
56 | typedef float16 real16;
57 | #define ZERO 0.0f
58 | #define ONE 1.0f
59 | #define SMALLEST -1.0e37f
60 | 
61 | // Double-precision
62 | #elif PRECISION == 64
63 | typedef double real;
64 | typedef double2 real2;
65 | typedef double4 real4;
66 | typedef double8 real8;
67 | typedef double16 real16;
68 | #define ZERO 0.0
69 | #define ONE 1.0
70 | #define SMALLEST -1.0e37
71 | 
72 | // Complex single-precision
73 | #elif PRECISION == 3232
74 | typedef struct cfloat {float x; float y;} real;
75 | typedef struct cfloat2 {real x; real y;} real2;
76 | typedef struct cfloat4 {real x; real y; real z; real w;} real4;
77 | typedef struct cfloat8 {real s0; real s1; real s2; real s3;
78 | real s4; real s5; real s6; real s7;} real8;
79 | typedef struct cfloat16 {real s0; real s1; real s2; real s3;
80 | real s4; real s5; real s6; real s7;
81 | real s8; real s9; real sA; real sB;
82 | real sC; real sD; real sE; real sF;} real16;
83 | #define ZERO 0.0f
84 | #define ONE 1.0f
85 | #define SMALLEST -1.0e37f
86 | 
87 | // Complex double-precision
88 | #elif PRECISION == 6464
89 | typedef struct cdouble {double x; double y;} real;
90 | typedef struct cdouble2 {real x; real y;} real2;
91 | typedef struct cdouble4 {real x; real y; real z; real w;} real4;
92 | typedef struct cdouble8 {real s0; real s1; real s2; real s3;
93 | real s4; real s5; real s6; real s7;} real8;
94 | typedef struct cdouble16 {real s0; real s1; real s2; real s3;
95 | real s4; real s5; real s6; real s7;
96 | real s8; real s9; real sA; real sB;
97 | real sC; real sD; real sE; real sF;} real16;
98 | #define ZERO 0.0
99 | #define ONE 1.0
100 | #define SMALLEST -1.0e37
101 | #endif
102 | 
103 | // Single-element version of a complex number
104 | #if PRECISION == 3232
105 | typedef float singlereal;
106 | #elif PRECISION == 6464
107 | typedef double singlereal;
108 | #else
109 | typedef real singlereal;
110 | #endif
111 | 
112 | // Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
113 | // conversion, but half-precision is not supported as kernel argument so it is converted from float.
114 | #if PRECISION == 16
115 | typedef float real_arg;
116 | #define GetRealArg(x) (half)x
117 | #else
118 | typedef real real_arg;
119 | #define GetRealArg(x) x
120 | #endif
121 | 
122 | // =================================================================================================
123 | 
124 | // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction by default.
For specific 125 | // devices, this is enabled (see src/routine.cc). 126 | #ifndef USE_CL_MAD 127 | #define USE_CL_MAD 0 128 | #endif 129 | 130 | // Sets a variable to zero 131 | #if PRECISION == 3232 || PRECISION == 6464 132 | #define SetToZero(a) a.x = ZERO; a.y = ZERO 133 | #else 134 | #define SetToZero(a) a = ZERO 135 | #endif 136 | 137 | // Sets a variable to zero (only the imaginary part) 138 | #if PRECISION == 3232 || PRECISION == 6464 139 | #define ImagToZero(a) a.y = ZERO 140 | #else 141 | #define ImagToZero(a) 142 | #endif 143 | 144 | // Sets a variable to one 145 | #if PRECISION == 3232 || PRECISION == 6464 146 | #define SetToOne(a) a.x = ONE; a.y = ZERO 147 | #else 148 | #define SetToOne(a) a = ONE 149 | #endif 150 | 151 | // Determines whether a variable is zero 152 | #if PRECISION == 3232 || PRECISION == 6464 153 | #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO)) 154 | #else 155 | #define IsZero(a) (a == ZERO) 156 | #endif 157 | 158 | // The absolute value (component-wise) 159 | #if PRECISION == 3232 || PRECISION == 6464 160 | #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y) 161 | #else 162 | #define AbsoluteValue(value) value = fabs(value) 163 | #endif 164 | 165 | // Adds two complex variables 166 | #if PRECISION == 3232 || PRECISION == 6464 167 | #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y 168 | #else 169 | #define Add(c, a, b) c = a + b 170 | #endif 171 | 172 | // Multiply two complex variables (used in the defines below) 173 | #if PRECISION == 3232 || PRECISION == 6464 174 | #define MulReal(a, b) a.x*b.x - a.y*b.y 175 | #define MulImag(a, b) a.x*b.y + a.y*b.x 176 | #endif 177 | 178 | // The scalar multiply function 179 | #if PRECISION == 3232 || PRECISION == 6464 180 | #define Multiply(c, a, b) c.x = MulReal(a,b); c.y = MulImag(a,b) 181 | #else 182 | #define Multiply(c, a, b) c = a * b 183 | #endif 184 | 185 | // The scalar multiply-add function 186 | #if PRECISION == 3232 || PRECISION == 6464 187 | #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b) 188 | #else 189 | #if USE_CL_MAD == 1 190 | #define MultiplyAdd(c, a, b) c = mad(a, b, c) 191 | #else 192 | #define MultiplyAdd(c, a, b) c += a * b 193 | #endif 194 | #endif 195 | 196 | // The scalar AXPBY function 197 | #if PRECISION == 3232 || PRECISION == 6464 198 | #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d) 199 | #else 200 | #define AXPBY(e, a, b, c, d) e = a*b + c*d 201 | #endif 202 | 203 | // The complex conjugate operation for complex transforms 204 | #if PRECISION == 3232 || PRECISION == 6464 205 | #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y 206 | #else 207 | #define COMPLEX_CONJUGATE(value) 208 | #endif 209 | 210 | // ================================================================================================= 211 | 212 | // Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is 213 | // enabled (see src/routine.cc). 214 | #ifndef USE_STAGGERED_INDICES 215 | #define USE_STAGGERED_INDICES 0 216 | #endif 217 | 218 | // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). 
Formulas are taken from:
219 | // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
220 | // More details: https://github.com/CNugteren/CLBlast/issues/53
221 | #if USE_STAGGERED_INDICES == 1
222 | inline size_t GetGroupIDFlat() {
223 | return get_group_id(0) + get_num_groups(0) * get_group_id(1);
224 | }
225 | inline size_t GetGroupID1() {
226 | return (GetGroupIDFlat()) % get_num_groups(1);
227 | }
228 | inline size_t GetGroupID0() {
229 | return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
230 | }
231 | #else
232 | inline size_t GetGroupID1() { return get_group_id(1); }
233 | inline size_t GetGroupID0() { return get_group_id(0); }
234 | #endif
235 | 
236 | // =================================================================================================
237 | 
238 | // End of the C++11 raw string literal
239 | 
240 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/level1.cl: --------------------------------------------------------------------------------
1 | 
2 | // =================================================================================================
3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 | // width of 100 characters per line.
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the common functions and parameters specific for level 1 BLAS kernels.
11 | //
12 | // =================================================================================================
13 | 
14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
15 | // literal). Comment-out this line for syntax-highlighting when developing.
16 | 
17 | // =================================================================================================
18 | 
19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
20 | // this kernel file is used outside of the CLBlast library.
21 | #ifndef WGS 22 | #define WGS 64 // The local work-group size 23 | #endif 24 | #ifndef WPT 25 | #define WPT 1 // The amount of work-per-thread 26 | #endif 27 | #ifndef VW 28 | #define VW 1 // Vector width of vectors X and Y 29 | #endif 30 | 31 | // ================================================================================================= 32 | 33 | // Data-widths 34 | #if VW == 1 35 | typedef real realV; 36 | #elif VW == 2 37 | typedef real2 realV; 38 | #elif VW == 4 39 | typedef real4 realV; 40 | #elif VW == 8 41 | typedef real8 realV; 42 | #elif VW == 16 43 | typedef real16 realV; 44 | #endif 45 | 46 | // ================================================================================================= 47 | 48 | // The vectorized multiply function 49 | inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) { 50 | #if VW == 1 51 | Multiply(cvec, aval, bvec); 52 | #elif VW == 2 53 | Multiply(cvec.x, aval, bvec.x); 54 | Multiply(cvec.y, aval, bvec.y); 55 | #elif VW == 4 56 | Multiply(cvec.x, aval, bvec.x); 57 | Multiply(cvec.y, aval, bvec.y); 58 | Multiply(cvec.z, aval, bvec.z); 59 | Multiply(cvec.w, aval, bvec.w); 60 | #elif VW == 8 61 | Multiply(cvec.s0, aval, bvec.s0); 62 | Multiply(cvec.s1, aval, bvec.s1); 63 | Multiply(cvec.s2, aval, bvec.s2); 64 | Multiply(cvec.s3, aval, bvec.s3); 65 | Multiply(cvec.s4, aval, bvec.s4); 66 | Multiply(cvec.s5, aval, bvec.s5); 67 | Multiply(cvec.s6, aval, bvec.s6); 68 | Multiply(cvec.s7, aval, bvec.s7); 69 | #elif VW == 16 70 | Multiply(cvec.s0, aval, bvec.s0); 71 | Multiply(cvec.s1, aval, bvec.s1); 72 | Multiply(cvec.s2, aval, bvec.s2); 73 | Multiply(cvec.s3, aval, bvec.s3); 74 | Multiply(cvec.s4, aval, bvec.s4); 75 | Multiply(cvec.s5, aval, bvec.s5); 76 | Multiply(cvec.s6, aval, bvec.s6); 77 | Multiply(cvec.s7, aval, bvec.s7); 78 | Multiply(cvec.s8, aval, bvec.s8); 79 | Multiply(cvec.s9, aval, bvec.s9); 80 | Multiply(cvec.sA, aval, bvec.sA); 81 | Multiply(cvec.sB, aval, bvec.sB); 82 | Multiply(cvec.sC, aval, bvec.sC); 83 | Multiply(cvec.sD, aval, bvec.sD); 84 | Multiply(cvec.sE, aval, bvec.sE); 85 | Multiply(cvec.sF, aval, bvec.sF); 86 | #endif 87 | return cvec; 88 | } 89 | 90 | // The vectorized multiply-add function 91 | inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) { 92 | #if VW == 1 93 | MultiplyAdd(cvec, aval, bvec); 94 | #elif VW == 2 95 | MultiplyAdd(cvec.x, aval, bvec.x); 96 | MultiplyAdd(cvec.y, aval, bvec.y); 97 | #elif VW == 4 98 | MultiplyAdd(cvec.x, aval, bvec.x); 99 | MultiplyAdd(cvec.y, aval, bvec.y); 100 | MultiplyAdd(cvec.z, aval, bvec.z); 101 | MultiplyAdd(cvec.w, aval, bvec.w); 102 | #elif VW == 8 103 | MultiplyAdd(cvec.s0, aval, bvec.s0); 104 | MultiplyAdd(cvec.s1, aval, bvec.s1); 105 | MultiplyAdd(cvec.s2, aval, bvec.s2); 106 | MultiplyAdd(cvec.s3, aval, bvec.s3); 107 | MultiplyAdd(cvec.s4, aval, bvec.s4); 108 | MultiplyAdd(cvec.s5, aval, bvec.s5); 109 | MultiplyAdd(cvec.s6, aval, bvec.s6); 110 | MultiplyAdd(cvec.s7, aval, bvec.s7); 111 | #elif VW == 16 112 | MultiplyAdd(cvec.s0, aval, bvec.s0); 113 | MultiplyAdd(cvec.s1, aval, bvec.s1); 114 | MultiplyAdd(cvec.s2, aval, bvec.s2); 115 | MultiplyAdd(cvec.s3, aval, bvec.s3); 116 | MultiplyAdd(cvec.s4, aval, bvec.s4); 117 | MultiplyAdd(cvec.s5, aval, bvec.s5); 118 | MultiplyAdd(cvec.s6, aval, bvec.s6); 119 | MultiplyAdd(cvec.s7, aval, bvec.s7); 120 | MultiplyAdd(cvec.s8, aval, bvec.s8); 121 | MultiplyAdd(cvec.s9, aval, bvec.s9); 122 | MultiplyAdd(cvec.sA, aval, bvec.sA); 123 | MultiplyAdd(cvec.sB, aval, bvec.sB); 124 | 
MultiplyAdd(cvec.sC, aval, bvec.sC);
125 | MultiplyAdd(cvec.sD, aval, bvec.sD);
126 | MultiplyAdd(cvec.sE, aval, bvec.sE);
127 | MultiplyAdd(cvec.sF, aval, bvec.sF);
128 | #endif
129 | return cvec;
130 | }
131 | 
132 | // =================================================================================================
133 | 
134 | // End of the C++11 raw string literal
135 | 
136 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xasum.cl: --------------------------------------------------------------------------------
1 | 
2 | // =================================================================================================
3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 | // width of 100 characters per line.
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the Xasum kernel. It implements an absolute sum computation using reduction
11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
12 | // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
13 | // is executed with a single workgroup only, computing the final result.
14 | //
15 | // =================================================================================================
16 | 
17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
18 | // literal). Comment-out this line for syntax-highlighting when developing.
19 | 
20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
21 | // this kernel file is used outside of the CLBlast library.
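// A typical launch geometry, inferred from the epilogue below: Xasum runs with 2*WGS2
// work-groups of WGS1 threads each, and XasumEpilogue then runs as a single work-group
// of WGS2 threads to combine the 2*WGS2 partial results.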
22 | #ifndef WGS1 23 | #define WGS1 64 // The local work-group size of the main kernel 24 | #endif 25 | #ifndef WGS2 26 | #define WGS2 64 // The local work-group size of the epilogue kernel 27 | #endif 28 | 29 | // ================================================================================================= 30 | 31 | // The main reduction kernel, performing the loading and the majority of the operation 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) 33 | void Xasum(const int n, 34 | const __global real* restrict xgm, const int x_offset, const int x_inc, 35 | __global real* output) { 36 | __local real lm[WGS1]; 37 | const int lid = get_local_id(0); 38 | const int wgid = get_group_id(0); 39 | const int num_groups = get_num_groups(0); 40 | 41 | // Performs loading and the first steps of the reduction 42 | real acc; 43 | SetToZero(acc); 44 | int id = wgid*WGS1 + lid; 45 | while (id < n) { 46 | real x = xgm[id*x_inc + x_offset]; 47 | #if defined(ROUTINE_SUM) // non-absolute version 48 | #else 49 | AbsoluteValue(x); 50 | #endif 51 | Add(acc, acc, x); 52 | id += WGS1*num_groups; 53 | } 54 | lm[lid] = acc; 55 | barrier(CLK_LOCAL_MEM_FENCE); 56 | 57 | // Performs reduction in local memory 58 | #pragma unroll 59 | for (int s=WGS1/2; s>0; s=s>>1) { 60 | if (lid < s) { 61 | Add(lm[lid], lm[lid], lm[lid + s]); 62 | } 63 | barrier(CLK_LOCAL_MEM_FENCE); 64 | } 65 | 66 | // Stores the per-workgroup result 67 | if (lid == 0) { 68 | output[wgid] = lm[0]; 69 | } 70 | } 71 | 72 | // ================================================================================================= 73 | 74 | // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to 75 | // be launched with a single workgroup only. 76 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) 77 | void XasumEpilogue(const __global real* restrict input, 78 | __global real* asum, const int asum_offset) { 79 | __local real lm[WGS2]; 80 | const int lid = get_local_id(0); 81 | 82 | // Performs the first step of the reduction while loading the data 83 | Add(lm[lid], input[lid], input[lid + WGS2]); 84 | barrier(CLK_LOCAL_MEM_FENCE); 85 | 86 | // Performs reduction in local memory 87 | #pragma unroll 88 | for (int s=WGS2/2; s>0; s=s>>1) { 89 | if (lid < s) { 90 | Add(lm[lid], lm[lid], lm[lid + s]); 91 | } 92 | barrier(CLK_LOCAL_MEM_FENCE); 93 | } 94 | 95 | // Computes the absolute value and stores the final result 96 | if (lid == 0) { 97 | #if PRECISION == 3232 || PRECISION == 6464 98 | asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number 99 | #else 100 | asum[asum_offset] = lm[0]; 101 | #endif 102 | } 103 | } 104 | 105 | // ================================================================================================= 106 | 107 | // End of the C++11 raw string literal 108 | 109 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xaxpy.cl: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
12 | // support vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xaxpy(const int n,
26 | const __global real* arg_alpha,
27 | const __global real* restrict xgm, const int x_offset, const int x_inc,
28 | __global real* ygm, const int y_offset, const int y_inc) {
29 | const real alpha = GetRealArg(arg_alpha[0]);
30 | 
31 | // Loops over the work that needs to be done (allows for an arbitrary number of threads)
32 | #pragma unroll
33 | for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
34 | MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xgm[id*x_inc + x_offset]);
35 | }
36 | }
37 | 
38 | // =================================================================================================
39 | 
40 | // End of the C++11 raw string literal
41 | 
42 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xcopy.cl: --------------------------------------------------------------------------------
1 | 
2 | // =================================================================================================
3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 | // width of 100 characters per line.
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
12 | // support vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xcopy(const int n,
26 | const __global real* restrict xgm, const int x_offset, const int x_inc,
27 | __global real* ygm, const int y_offset, const int y_inc) {
28 | 
29 | // Loops over the work that needs to be done (allows for an arbitrary number of threads)
30 | #pragma unroll
31 | for (int id = get_global_id(0); id < n; id += get_global_size(0)) {
32 | ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
33 | }
34 | }
35 | 
36 | // =================================================================================================
37 | 
38 | // End of the C++11 raw string literal
39 | 
40 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xdot.cl: --------------------------------------------------------------------------------
1 | 
2 | // =================================================================================================
3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
5 | // width of 100 characters per line.
6 | //
7 | // Author(s):
8 | // Cedric Nugteren
9 | //
10 | // This file contains the Xdot kernel. It implements a dot-product computation using reduction
11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are
12 | // multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
13 | // is executed with a single workgroup only, computing the final result.
14 | //
15 | // =================================================================================================
16 | 
17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
18 | // literal). Comment-out this line for syntax-highlighting when developing.
19 | 
20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
21 | // this kernel file is used outside of the CLBlast library.
22 | #ifndef WGS1 23 | #define WGS1 64 // The local work-group size of the main kernel 24 | #endif 25 | #ifndef WGS2 26 | #define WGS2 64 // The local work-group size of the epilogue kernel 27 | #endif 28 | 29 | // ================================================================================================= 30 | 31 | // The main reduction kernel, performing the multiplication and the majority of the sum operation 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) 33 | void Xdot(const int n, 34 | const __global real* restrict xgm, const int x_offset, const int x_inc, 35 | const __global real* restrict ygm, const int y_offset, const int y_inc, 36 | __global real* output, const int do_conjugate) { 37 | __local real lm[WGS1]; 38 | const int lid = get_local_id(0); 39 | const int wgid = get_group_id(0); 40 | const int num_groups = get_num_groups(0); 41 | 42 | // Performs multiplication and the first steps of the reduction 43 | real acc; 44 | SetToZero(acc); 45 | int id = wgid*WGS1 + lid; 46 | while (id < n) { 47 | real x = xgm[id*x_inc + x_offset]; 48 | real y = ygm[id*y_inc + y_offset]; 49 | if (do_conjugate) { COMPLEX_CONJUGATE(x); } 50 | MultiplyAdd(acc, x, y); 51 | id += WGS1*num_groups; 52 | } 53 | lm[lid] = acc; 54 | barrier(CLK_LOCAL_MEM_FENCE); 55 | 56 | // Performs reduction in local memory 57 | #pragma unroll 58 | for (int s=WGS1/2; s>0; s=s>>1) { 59 | if (lid < s) { 60 | Add(lm[lid], lm[lid], lm[lid + s]); 61 | } 62 | barrier(CLK_LOCAL_MEM_FENCE); 63 | } 64 | 65 | // Stores the per-workgroup result 66 | if (lid == 0) { 67 | output[wgid] = lm[0]; 68 | } 69 | } 70 | 71 | // ================================================================================================= 72 | 73 | // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to 74 | // be launched with a single workgroup only. 75 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) 76 | void XdotEpilogue(const __global real* restrict input, 77 | __global real* dot, const int dot_offset) { 78 | __local real lm[WGS2]; 79 | const int lid = get_local_id(0); 80 | 81 | // Performs the first step of the reduction while loading the data 82 | Add(lm[lid], input[lid], input[lid + WGS2]); 83 | barrier(CLK_LOCAL_MEM_FENCE); 84 | 85 | // Performs reduction in local memory 86 | #pragma unroll 87 | for (int s=WGS2/2; s>0; s=s>>1) { 88 | if (lid < s) { 89 | Add(lm[lid], lm[lid], lm[lid + s]); 90 | } 91 | barrier(CLK_LOCAL_MEM_FENCE); 92 | } 93 | 94 | // Stores the final result 95 | if (lid == 0) { 96 | dot[dot_offset] = lm[0]; 97 | } 98 | } 99 | 100 | // ================================================================================================= 101 | 102 | // End of the C++11 raw string literal 103 | 104 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xnrm2.cl: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 
6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction 11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared, 12 | // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel 13 | // is executed with a single workgroup only, computing the final result. 14 | // 15 | // ================================================================================================= 16 | 17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string 18 | // literal). Comment-out this line for syntax-highlighting when developing. 19 | 20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case 21 | // this kernel file is used outside of the CLBlast library. 22 | #ifndef WGS1 23 | #define WGS1 64 // The local work-group size of the main kernel 24 | #endif 25 | #ifndef WGS2 26 | #define WGS2 64 // The local work-group size of the epilogue kernel 27 | #endif 28 | 29 | // ================================================================================================= 30 | 31 | // The main reduction kernel, performing the multiplication and the majority of the operation 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1))) 33 | void Xnrm2(const int n, 34 | const __global real* restrict xgm, const int x_offset, const int x_inc, 35 | __global real* output) { 36 | __local real lm[WGS1]; 37 | const int lid = get_local_id(0); 38 | const int wgid = get_group_id(0); 39 | const int num_groups = get_num_groups(0); 40 | 41 | // Performs multiplication and the first steps of the reduction 42 | real acc; 43 | SetToZero(acc); 44 | int id = wgid*WGS1 + lid; 45 | while (id < n) { 46 | real x1 = xgm[id*x_inc + x_offset]; 47 | real x2 = x1; 48 | COMPLEX_CONJUGATE(x2); 49 | MultiplyAdd(acc, x1, x2); 50 | id += WGS1*num_groups; 51 | } 52 | lm[lid] = acc; 53 | barrier(CLK_LOCAL_MEM_FENCE); 54 | 55 | // Performs reduction in local memory 56 | #pragma unroll 57 | for (int s=WGS1/2; s>0; s=s>>1) { 58 | if (lid < s) { 59 | Add(lm[lid], lm[lid], lm[lid + s]); 60 | } 61 | barrier(CLK_LOCAL_MEM_FENCE); 62 | } 63 | 64 | // Stores the per-workgroup result 65 | if (lid == 0) { 66 | output[wgid] = lm[0]; 67 | } 68 | } 69 | 70 | // ================================================================================================= 71 | 72 | // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to 73 | // be launched with a single workgroup only. 
74 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1))) 75 | void Xnrm2Epilogue(const __global real* restrict input, 76 | __global real* nrm2, const int nrm2_offset) { 77 | __local real lm[WGS2]; 78 | const int lid = get_local_id(0); 79 | 80 | // Performs the first step of the reduction while loading the data 81 | Add(lm[lid], input[lid], input[lid + WGS2]); 82 | barrier(CLK_LOCAL_MEM_FENCE); 83 | 84 | // Performs reduction in local memory 85 | #pragma unroll 86 | for (int s=WGS2/2; s>0; s=s>>1) { 87 | if (lid < s) { 88 | Add(lm[lid], lm[lid], lm[lid + s]); 89 | } 90 | barrier(CLK_LOCAL_MEM_FENCE); 91 | } 92 | 93 | // Computes the square root and stores the final result 94 | if (lid == 0) { 95 | #if PRECISION == 3232 || PRECISION == 6464 96 | nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number 97 | #else 98 | nrm2[nrm2_offset] = sqrt(lm[0]); 99 | #endif 100 | } 101 | } 102 | 103 | // ================================================================================================= 104 | 105 | // End of the C++11 raw string literal 106 | 107 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xscal.cl: -------------------------------------------------------------------------------- 1 | 2 | // ================================================================================================= 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max- 5 | // width of 100 characters per line. 6 | // 7 | // Author(s): 8 | // Cedric Nugteren 9 | // 10 | // This file contains the Xscal kernel. It contains one fast vectorized version in case of unit 11 | // strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support 12 | // vector data-types. 13 | // 14 | // This kernel uses the level-1 BLAS common tuning parameters. 15 | // 16 | // ================================================================================================= 17 | 18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string 19 | // literal). Comment-out this line for syntax-highlighting when developing. 20 | 21 | // ================================================================================================= 22 | 23 | // Full version of the kernel with offsets and strided accesses 24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) 25 | void Xscal(const int n, const __global real* arg_alpha, 26 | __global real* xgm, const int x_offset, const int x_inc) { 27 | const real alpha = GetRealArg(arg_alpha[0]); 28 | 29 | // Loops over the work that needs to be done (allows for an arbitrary number of threads) 30 | #pragma unroll 31 | for (int id = get_global_id(0); id 9 | // 10 | // This file contains the Xswap kernel. It contains one fast vectorized version in case of unit 11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't 12 | // support vector data-types. 13 | // 14 | // This kernel uses the level-1 BLAS common tuning parameters. 
15 | // 16 | // ================================================================================================= 17 | 18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string 19 | // literal). Comment-out this line for syntax-highlighting when developing. 20 | 21 | // ================================================================================================= 22 | 23 | // Full version of the kernel with offsets and strided accesses 24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1))) 25 | void Xswap(const int n, 26 | __global real* xgm, const int x_offset, const int x_inc, 27 | __global real* ygm, const int y_offset, const int y_inc) { 28 | 29 | // Loops over the work that needs to be done (allows for an arbitrary number of threads) 30 | #pragma unroll 31 | for (int id = get_global_id(0); id 9 | // 10 | // This file contains the common functions and parameters specific for level 3 BLAS kernels. 11 | // 12 | // ================================================================================================= 13 | 14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string 15 | // literal). Comment-out this line for syntax-highlighting when developing. 16 | 17 | // ================================================================================================= 18 | 19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case 20 | // this kernel file is used outside of the CLBlast library. 21 | 22 | // For the 'fast' copy kernel 23 | #ifndef COPY_DIMX 24 | #define COPY_DIMX 8 // Local workgroup size in the first dimension (x) 25 | #endif 26 | #ifndef COPY_DIMY 27 | #define COPY_DIMY 8 // Local workgroup size in the second dimension (y) 28 | #endif 29 | #ifndef COPY_WPT 30 | #define COPY_WPT 1 // Work per thread in the first dimension (x) 31 | #endif 32 | #ifndef COPY_VW 33 | #define COPY_VW 1 // Vector width in the second dimension (y) 34 | #endif 35 | 36 | // For the padding/copy kernels and the conversion kernels 37 | #ifndef PAD_DIMX 38 | #define PAD_DIMX 8 // Local workgroup size in the first dimension (x) 39 | #endif 40 | #ifndef PAD_DIMY 41 | #define PAD_DIMY 8 // Local workgroup size in the second dimension (y) 42 | #endif 43 | #ifndef PAD_WPTX 44 | #define PAD_WPTX 1 // Work per thread in the first dimension (x) 45 | #endif 46 | #ifndef PAD_WPTY 47 | #define PAD_WPTY 1 // Work per thread in the second dimension (y) 48 | #endif 49 | 50 | // For the 'fast' transpose kernel 51 | #ifndef TRA_DIM 52 | #define TRA_DIM 8 // Number of local threads in the two dimensions (x,y) 53 | #endif 54 | #ifndef TRA_WPT 55 | #define TRA_WPT 1 // Work per thread in one dimension and vector-width in the other 56 | #endif 57 | #ifndef TRA_PAD 58 | #define TRA_PAD 0 // Padding of the local memory to avoid bank-conflicts 59 | #endif 60 | #ifndef TRA_SHUFFLE 61 | #define TRA_SHUFFLE 0 // Shuffling of the global indices to avoid global memory bank-conflicts 62 | #endif 63 | 64 | // For the padding/transpose kernels 65 | #ifndef PADTRA_TILE 66 | #define PADTRA_TILE 8 // Number of local threads in the two dimensions (x,y) 67 | #endif 68 | #ifndef PADTRA_WPT 69 | #define PADTRA_WPT 1 // Amount of work per thread 70 | #endif 71 | #ifndef PADTRA_PAD 72 | #define PADTRA_PAD 0 // Padding of the local memory to avoid bank-conflicts 73 | #endif 74 | 75 | // ================================================================================================= 76 | 77 | // End of 
the C++11 raw string literal 78 | 79 | // ================================================================================================= -------------------------------------------------------------------------------- /crates/parenchyma-blas/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Parenchyma extension package for backend-agnostic BLAS operations. 2 | //! 3 | //! Provides backend-agnostic [BLAS] operations for [Parenchyma]. 4 | //! 5 | //! BLAS (Basic Linear Algebra Subprograms) is a specification that prescribes a set of low-level 6 | //! routines for performing common linear algebra operations such as vector addition, scalar 7 | //! multiplication, dot products, linear combinations, and matrix multiplication. They are the de 8 | //! facto standard low-level routines for linear algebra libraries; the routines have bindings for 9 | //! both C and Fortran. Although the BLAS specification is general, BLAS implementations are often 10 | //! optimized for speed on a particular machine, so using them can bring substantial performance 11 | //! benefits. BLAS implementations will take advantage of special floating point hardware such as 12 | //! vector registers or SIMD instructions.
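Before the overview that follows, it helps to pin down the scalar semantics of the level-1 routines this crate exposes. A minimal illustrative Rust sketch (not part of the crate) of three of them; the backends below implement the same contracts over shared tensors instead of slices:

```rust
/// Illustrative scalar reference semantics for three BLAS level-1 routines.
/// asum: sum of absolute values; axpy: y <- alpha * x + y; dot: sum of x[i]*y[i].
fn asum(x: &[f32]) -> f32 {
    x.iter().map(|v| v.abs()).sum()
}

fn axpy(alpha: f32, x: &[f32], y: &mut [f32]) {
    for (yi, xi) in y.iter_mut().zip(x) {
        *yi += alpha * xi;
    }
}

fn dot(x: &[f32], y: &[f32]) -> f32 {
    x.iter().zip(y).map(|(a, b)| a * b).sum()
}

fn main() {
    let mut y = [1.0, 2.0, 3.0];
    axpy(2.0, &[1.0, 2.0, 3.0], &mut y);
    assert_eq!(y, [3.0, 6.0, 9.0]);           // matches the axpy spec below
    assert_eq!(asum(&[1.0, -2.0, 3.0]), 6.0); // matches the asum spec below
    assert_eq!(dot(&[1.0, 2.0, 3.0], &[1.0, 2.0, 3.0]), 14.0);
}
```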
13 | //! 14 | //! # Overview 15 | //! 16 | //! A Parenchyma extension package provides functionality through two items: 17 | //! 18 | //! * __Package__ 19 | //! This enum provides the actual initialized functions. 20 | //! 21 | //! * __Extension__ 22 | //! This trait provides methods that specify the exact backend-agnostic behavior of a collection of 23 | //! operations. Since a shared tensor completely manages memory, tensors can simply be passed in as 24 | //! arguments for the fastest possible execution. 25 | //! 26 | //! Aside from the generic functionality provided by these two items, the extension can be further 27 | //! extended. 28 | //! 29 | //! For more information, read the documentation. 30 | //! 31 | //! # Example Usage 32 | //! 33 | //! ```ignore 34 | //! #[macro_use(array)] 35 | //! extern crate parenchyma; 36 | //! extern crate parenchyma_blas as blas; 37 | //! 38 | //! use parenchyma::frameworks::Native; 39 | //! use parenchyma::prelude::*; 40 | //! 41 | //! let backend: Backend = Backend::new::<Native>()?; 42 | //! let ref x: SharedTensor = array![[1.5, 2.5, 3.5], [4.5, 5.5, 6.6]].into(); 43 | //! let ref mut result: SharedTensor = array![0.0].into(); 44 | //! 45 | //! backend.asum(x, result)?; 46 | //! 47 | //! println!("{:?}", result); 48 | //! ``` 49 | //! 50 | //! [BLAS]: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms 51 | //! [Parenchyma]: https://github.com/lychee-eng/parenchyma 52 | #![allow(unused_variables)] 53 | #![feature(non_modrs_mods, type_ascription)] 54 | 55 | extern crate ocl; 56 | extern crate parenchyma; 57 | extern crate rblas; 58 | 59 | pub use self::extension_package::{Extension, GenericMatrix, Package, Transposition}; 60 | pub mod frameworks; 61 | 62 | mod extension_package; -------------------------------------------------------------------------------- /crates/parenchyma-blas/tests/blas_specs.rs: 1 | #![feature(rustc_private)] 2 | 3 | #[macro_use] 4 | extern crate lazy_static; 5 | #[macro_use(array)] 6 | extern crate parenchyma; 7 | extern crate parenchyma_blas; 8 | 9 | #[cfg(test)] 10 | mod blas_specification_native { 11 | use parenchyma::frameworks::Native; 12 | use parenchyma::prelude::*; 13 | use parenchyma_blas::*; 14 | 15 | struct TestBackend(Backend<Package>); 16 | 17 | impl ::std::ops::Deref for TestBackend { 18 | type Target = Backend<Package>; 19 | fn deref(&self) -> &Self::Target { 20 | &self.0 21 | } 22 | } 23 | unsafe impl Sync for TestBackend { } 24 | 25 | lazy_static!
{ 26 | static ref BACKEND: TestBackend = TestBackend(Backend::new::<Native<Package>>().unwrap()); 27 | } 28 | 29 | #[test] 30 | fn it_computes_correct_asum_on_native_for_f32() { 31 | let ref x = array![1., -2., 3.].into(); 32 | let ref mut result = SharedTensor::scalar(0.0); 33 | BACKEND.asum(x, result).unwrap(); 34 | assert_eq!(&[6.], result.as_slice().unwrap()); 35 | } 36 | 37 | #[test] 38 | fn it_computes_correct_axpy_on_native_for_f32() { 39 | let ref a = SharedTensor::scalar(2.0); 40 | let ref x = array![1., 2., 3.].into(); 41 | let ref mut y = array![1., 2., 3.].into(); 42 | BACKEND.axpy(a, x, y).unwrap(); 43 | assert_eq!(&[3., 6., 9.], y.as_slice().unwrap()); 44 | } 45 | 46 | #[test] 47 | fn it_computes_correct_copy_on_native_for_f32() { 48 | let ref mut x = array![1., 2., 3.].into(); 49 | let ref mut y = SharedTensor::from([3]); 50 | BACKEND.copy(x, y).unwrap(); 51 | assert_eq!(&[1., 2., 3.], y.as_slice().unwrap()); 52 | } 53 | 54 | #[test] 55 | fn it_computes_correct_dot_on_native_for_f32() { 56 | let ref x = array![1., 2., 3.].into(); 57 | let ref y = array![1., 2., 3.].into(); 58 | let ref mut result = SharedTensor::from([]); 59 | BACKEND.dot(x, y, result).unwrap(); 60 | assert_eq!(&[14.], result.as_slice().unwrap()); 61 | } 62 | 63 | #[test] 64 | fn it_computes_correct_nrm2_on_native_for_f32() { 65 | let ref x = array![1., 2., 2.].into(); 66 | let ref mut result = SharedTensor::from([]); 67 | BACKEND.nrm2(x, result).unwrap(); 68 | assert_eq!(&[3.], result.as_slice().unwrap()); 69 | } 70 | 71 | #[test] 72 | fn it_computes_correct_scal_on_native_for_f32() { 73 | let ref a = array![2.].into(); 74 | let ref mut x = array![1., 2., 3.].into(); 75 | BACKEND.scal(a, x).unwrap(); 76 | assert_eq!(&[2., 4., 6.], x.as_slice().unwrap()); 77 | } 78 | 79 | #[test] 80 | fn it_computes_correct_swap_on_native_for_f32() { 81 | let ref mut x = array![1., 2., 3.].into(); 82 | let ref mut y = array![3., 2., 1.].into(); 83 | BACKEND.swap(x, y).unwrap(); 84 | assert_eq!(&[3., 2., 1.], x.as_slice().unwrap()); 85 | assert_eq!(&[1., 2., 3.], y.as_slice().unwrap()); 86 | } 87 | 88 | #[test] 89 | fn it_computes_correct_gemm_on_native_for_f32() { 90 | 91 | let ref alpha = array![1.0].into(); 92 | let ref amat = 93 | array![ 94 | [2.0, 5.0], 95 | [2.0, 5.0], 96 | [2.0, 5.0] 97 | ].into(); 98 | 99 | let ref beta = array![0.0].into(); 100 | let ref bmat = 101 | array![ 102 | [4.0, 1.0, 1.0], 103 | [4.0, 1.0, 1.0] 104 | ].into(); 105 | 106 | let ref mut cmat = SharedTensor::from([3, 3]); 107 | let transposition = Transposition::NoTranspose; 108 | 109 | BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap(); 110 | 111 | assert_eq!(&[28., 7., 7., 28., 7., 7., 28., 7., 7.], cmat.as_slice().unwrap()); 112 | } 113 | 114 | #[test] 115 | fn it_computes_correct_transpose_gemm_on_native_for_f32() { 116 | 117 | let ref alpha = array![1.0].into(); 118 | let ref amat = 119 | array![ 120 | [2.0, 5.0], 121 | [2.0, 5.0], 122 | [2.0, 5.0] 123 | ].into(); 124 | 125 | let ref beta = array![0.0].into(); 126 | let ref bmat = 127 | array![ 128 | [4.0, 1.0, 1.0], 129 | [4.0, 1.0, 1.0] 130 | ].into(); 131 | 132 | let ref mut cmat = SharedTensor::from([2, 2]); 133 | let transposition = Transposition::Transpose; 134 | 135 | BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap(); 136 | 137 | assert_eq!(&[12., 12., 30., 30.], cmat.as_slice().unwrap()); 138 | } 139 | } 140 | 141 | #[cfg(test)] 142 | mod blas_specification_opencl { 143 | use parenchyma::frameworks::OpenCL; 144 | use
parenchyma::hardware::{Hardware, HardwareKind}; 145 | use parenchyma::prelude::*; 146 | use parenchyma_blas::*; 147 | 148 | struct TestBackend(Backend<Package>); 149 | 150 | impl ::std::ops::Deref for TestBackend { 151 | type Target = Backend<Package>; 152 | fn deref(&self) -> &Self::Target { 153 | &self.0 154 | } 155 | } 156 | unsafe impl Sync for TestBackend { } 157 | 158 | lazy_static! { 159 | static ref BACKEND: TestBackend = { 160 | let mut backend: Backend<Package> = Backend::new::<OpenCL<Package>>().unwrap(); 161 | // required here! 162 | backend.select(&|hardware| hardware.kind == HardwareKind::GPU); 163 | TestBackend(backend) 164 | }; 165 | } 166 | 167 | #[test] 168 | fn it_computes_correct_axpy_on_opencl_for_f32() { 169 | let ref a = SharedTensor::scalar(2.0); 170 | let ref x = array![1., 2., 3.].into(); 171 | let ref mut y = array![1., 2., 3.].into(); 172 | BACKEND.axpy(a, x, y).unwrap(); 173 | assert_eq!(&[3., 6., 9.], y.as_slice().unwrap()); 174 | } 175 | 176 | #[test] 177 | fn it_computes_correct_copy_on_opencl_for_f32() { 178 | let ref mut x = array![1., 2., 3.].into(); 179 | let ref mut y = SharedTensor::from([3]); 180 | BACKEND.copy(x, y).unwrap(); 181 | assert_eq!(&[1., 2., 3.], y.as_slice().unwrap()); 182 | } 183 | 184 | #[test] 185 | fn it_computes_correct_gemm_on_opencl_for_f32() { 186 | 187 | let ref alpha = array![1.0].into(); 188 | let ref amat = 189 | array![ 190 | [2.0, 5.0], 191 | [2.0, 5.0], 192 | [2.0, 5.0] 193 | ].into(); 194 | 195 | let ref beta = array![0.0].into(); 196 | let ref bmat = 197 | array![ 198 | [4.0, 1.0, 1.0], 199 | [4.0, 1.0, 1.0] 200 | ].into(); 201 | 202 | let ref mut cmat = SharedTensor::from([3, 3]); 203 | let transposition = Transposition::NoTranspose; 204 | 205 | BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap(); 206 | 207 | assert_eq!(&[28., 7., 7., 28., 7., 7., 28., 7., 7.], cmat.as_slice().unwrap()); 208 | } 209 | 210 | #[test] 211 | fn it_computes_correct_transpose_gemm_on_opencl_for_f32() { 212 | 213 | let ref alpha = array![1.0].into(); 214 | let ref amat = 215 | array![ 216 | [2.0, 5.0], 217 | [2.0, 5.0], 218 | [2.0, 5.0] 219 | ].into(); 220 | 221 | let ref beta = array![0.0].into(); 222 | let ref bmat = 223 | array![ 224 | [4.0, 1.0, 1.0], 225 | [4.0, 1.0, 1.0] 226 | ].into(); 227 | 228 | let ref mut cmat = SharedTensor::from([2, 2]); 229 | let transposition = Transposition::Transpose; 230 | 231 | BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap(); 232 | 233 | assert_eq!(&[12., 12., 30., 30.], cmat.as_slice().unwrap()); 234 | } 235 | 236 | #[test] 237 | fn it_computes_correct_scal_on_opencl_for_f32() { 238 | let ref a = array![2.].into(); 239 | let ref mut x = array![1., 2., 3.].into(); 240 | BACKEND.scal(a, x).unwrap(); 241 | assert_eq!(&[2., 4., 6.], x.as_slice().unwrap()); 242 | } 243 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/.gitignore: 1 | target 2 | Cargo.lock -------------------------------------------------------------------------------- /crates/parenchyma-deep/Cargo.toml: 1 | [package] 2 | name = "parenchyma-deep" 3 | version = "0.1.0" 4 | authors = ["Jony "] 5 | license = "MIT/Apache-2.0" 6 | 7 | [dependencies] 8 | ocl = "0.16.0" 9 | 10 | [dependencies.parenchyma] 11 | path = "../../" 12 | version = "0.0.4" 13 | 14 | [dev-dependencies] 15 | lazy_static = "1.1.0"
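The GEMM expectations asserted in the specs above follow directly from the row-major shapes involved: `amat` is 3x2 with rows `[2, 5]`, `bmat` is 2x3 with rows `[4, 1, 1]`, so `amat * bmat` is 3x3 and `amat^T * bmat^T` is 2x2. A standalone sketch (illustrative only, not part of the test suite) that recomputes both expected results with plain loops:

```rust
/// Naive row-major matrix multiply: c[i][j] = sum over l of a[i][l] * b[l][j].
fn matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
    let mut c = vec![0.0; m * n];
    for i in 0..m {
        for j in 0..n {
            for l in 0..k {
                c[i * n + j] += a[i * k + l] * b[l * n + j];
            }
        }
    }
    c
}

fn main() {
    let a = [2.0, 5.0, 2.0, 5.0, 2.0, 5.0]; // 3x2, rows of [2, 5]
    let b = [4.0, 1.0, 1.0, 4.0, 1.0, 1.0]; // 2x3, rows of [4, 1, 1]
    // a (3x2) * b (2x3): every row is [2*4 + 5*4, 2*1 + 5*1, 2*1 + 5*1].
    assert_eq!(matmul(&a, &b, 3, 2, 3),
               [28.0, 7.0, 7.0, 28.0, 7.0, 7.0, 28.0, 7.0, 7.0]);
    // Transposing both: a^T is 2x3 (rows [2, 2, 2] and [5, 5, 5]),
    // b^T is 3x2 (rows [4, 4], [1, 1], [1, 1]), giving a 2x2 result.
    let at = [2.0, 2.0, 2.0, 5.0, 5.0, 5.0];
    let bt = [4.0, 4.0, 1.0, 1.0, 1.0, 1.0];
    assert_eq!(matmul(&at, &bt, 2, 3, 2), [12.0, 12.0, 30.0, 30.0]);
}
```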
-------------------------------------------------------------------------------- /crates/parenchyma-deep/README.md: 1 | # parenchyma-deep 2 | 3 | This package provides full neural network (NN) support for Parenchyma, so you can run NNs on servers, desktops or 4 | mobiles, on GPUs, FPGAs or CPUs, without worrying about OpenCL or CUDA support on the machine. 5 | 6 | ## Provided Operations 7 | 8 | This package provides the following operations to Parenchyma backends: 9 | 10 | | | CUDA (cuDNN) | OpenCL | Native (Rust) | 11 | |--- |--- |--- |--- | 12 | | Sigmoid | (collenchyma) | - | ✓ | 13 | | Sigmoid (pointwise) | (collenchyma) | - | | 14 | | ReLU | (collenchyma) | - | ✓ | 15 | | ReLU (pointwise) | (collenchyma) | - | | 16 | | Tanh | (collenchyma) | - | ✓ | 17 | | Tanh (pointwise) | (collenchyma) | - | | 18 | | | | | | 19 | | Normalization (LRN) | (collenchyma) | - | - | 20 | | | | | | 21 | | Convolution | (collenchyma) | - | - | 22 | | | | | | 23 | | Softmax | (collenchyma) | - | ✓ | 24 | | Log Softmax | (collenchyma) | - | ✓ | 25 | | | | | | 26 | | Pooling Max | (collenchyma) | - | - | 27 | | Pooling Avg | (collenchyma) | - | - | -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/extension_package/backward.rs: 1 | use parenchyma::error::Result; 2 | use parenchyma::prelude::SharedTensor; 3 | use super::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration}; 4 | 5 | pub trait Backward { 6 | /// Computes the gradient of a [CNN convolution] over the input tensor `x` with respect 7 | /// to the data. 8 | /// 9 | /// Saves the result to `result_diff`. 10 | /// 11 | /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network 12 | fn convolution_grad_data( 13 | self: &Self, 14 | filter: &SharedTensor, 15 | x_diff: &SharedTensor, 16 | result_diff: &mut SharedTensor, 17 | workspace: &mut SharedTensor, 18 | configuration: &ConvolutionConfiguration) -> Result { 19 | unimplemented!() 20 | } 21 | /// Computes the gradient of a [CNN convolution] with respect to the filter. 22 | /// 23 | /// Saves the result to `filter_diff`. 24 | /// 25 | /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network 26 | fn convolution_grad_filter( 27 | self: &Self, 28 | src_data: &SharedTensor, 29 | dest_diff: &SharedTensor, 30 | filter_diff: &mut SharedTensor, 31 | workspace: &mut SharedTensor, 32 | configuration: &ConvolutionConfiguration) -> Result { 33 | unimplemented!() 34 | } 35 | /// Computes the gradient of a logarithmic softmax over the input tensor `x`. 36 | /// 37 | /// Saves the result to `result_diff`. 38 | fn log_softmax_grad( 39 | self: &Self, 40 | x: &SharedTensor, 41 | x_diff: &SharedTensor, 42 | result_diff: &mut SharedTensor) -> Result { 43 | unimplemented!() 44 | } 45 | /// Computes the gradient of an [LRN][lrn] over the input Tensor `x` with complete memory management. 46 | /// [lrn]: https://en.wikipedia.org/wiki/Convolutional_neural_network 47 | /// 48 | /// Saves the result to `result_diff`. 49 | /// 50 | /// For a version without memory management, see `lrn_grad_plain`.
51 | fn lrn_grad( 52 | self: &Self, 53 | x: &SharedTensor, 54 | x_diff: &SharedTensor, 55 | result: &SharedTensor, 56 | result_diff: &mut SharedTensor, 57 | configuration: &LrnConfiguration) -> Result { 58 | unimplemented!() 59 | } 60 | /// Computes the gradient of [max pooling] over the input Tensor `x`. 61 | /// 62 | /// Saves the result to `result_diff`. 63 | /// 64 | /// [max pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer 65 | fn pooling_max_grad( 66 | self: &Self, 67 | x: &SharedTensor, 68 | x_diff: &SharedTensor, 69 | result: &SharedTensor, 70 | result_diff: &mut SharedTensor, 71 | configuration: &PoolingConfiguration) -> Result { 72 | unimplemented!() 73 | } 74 | /// Computes the gradient of [ReLU] over the input tensor `x`. 75 | /// 76 | /// Saves the result to `result_diff`. 77 | /// 78 | /// [ReLU]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 79 | fn relu_grad( 80 | self: &Self, 81 | x: &SharedTensor, 82 | x_diff: &SharedTensor, 83 | result: &SharedTensor, 84 | result_diff: &mut SharedTensor) -> Result { 85 | unimplemented!() 86 | } 87 | /// Computes the gradient of [ReLU] over the input tensor `x`. 88 | /// 89 | /// Saves the result back to `x_diff`. 90 | /// 91 | /// [ReLU]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 92 | fn relu_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result { 93 | unimplemented!() 94 | } 95 | /// Computes the gradient of a [sigmoid function] over the input tensor `x`. 96 | /// 97 | /// Saves the result to `result_diff`. 98 | /// 99 | /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function 100 | fn sigmoid_grad( 101 | self: &Self, 102 | x: &SharedTensor, 103 | x_diff: &SharedTensor, 104 | result: &SharedTensor, 105 | result_diff: &mut SharedTensor) -> Result { 106 | unimplemented!() 107 | } 108 | /// Computes the gradient of a [sigmoid function] over the input tensor `x`. 109 | /// 110 | /// Saves the result back to `x_diff`. 111 | /// 112 | /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function 113 | fn sigmoid_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result { 114 | unimplemented!() 115 | } 116 | /// Computes the gradient of a [softmax] over the input tensor `x`. 117 | /// 118 | /// Saves the result to `result_diff`. 119 | /// 120 | /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function 121 | fn softmax_grad( 122 | self: &Self, 123 | x: &SharedTensor, 124 | x_diff: &SharedTensor, 125 | result_diff: &mut SharedTensor) -> Result { 126 | unimplemented!() 127 | } 128 | /// Computes the gradient of [tanh] over the input Tensor `x`. 129 | /// 130 | /// Saves the result to `result_diff`. 131 | /// 132 | /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function 133 | fn tanh_grad( 134 | self: &Self, 135 | x: &SharedTensor, 136 | x_diff: &SharedTensor, 137 | result: &SharedTensor, 138 | result_diff: &mut SharedTensor) -> Result { 139 | unimplemented!() 140 | } 141 | /// Computes the gradient of [tanh] over the input Tensor `x`. 142 | /// 143 | /// Saves the result back to `x_diff`. 
144 | /// 145 | /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function 146 | fn tanh_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result { 147 | unimplemented!() 148 | } 149 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/extension_package/configuration.rs: -------------------------------------------------------------------------------- 1 | #[derive(Clone, Copy, Debug)] 2 | pub struct ConvolutionConfiguration; 3 | 4 | // impl ConvolutionConfiguration { 5 | // /// Creates a new convolution configuration, which needs to be passed to further 6 | // /// convolution operations. 7 | // pub fn new
<P>( 8 | // backend: &Backend<P>
, 9 | // src: &SharedTensor, 10 | // dest: &SharedTensor, 11 | // filter: &mut SharedTensor, 12 | // algo_forward: ConvForwardAlgo, 13 | // algo_backward_filter: ConvBackwardFilterAlgo, 14 | // algo_backward_data: ConvBackwardDataAlgo, 15 | // stride: &[i32], 16 | // zero_padding: &[i32]) -> Result { 17 | 18 | // unimplemented!() 19 | // } 20 | // } 21 | 22 | #[derive(Clone, Copy, Debug)] 23 | pub struct LrnConfiguration; 24 | 25 | #[derive(Clone, Copy, Debug)] 26 | pub struct PoolingConfiguration; -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/extension_package/convolution.rs: 1 | /// Different algorithms to compute the gradient with respect to the data. 2 | #[derive(Clone, Copy, Debug)] 3 | pub enum ConvBackwardDataAlgo { 4 | /// Attempt to automatically find the best algorithm of all the other available ones. 5 | Auto, 6 | /// Compute the convolution as matrix product without forming the matrix that holds the input data. 7 | /// 8 | /// Does not need any memory workspace. 9 | /// 10 | /// The results are deterministic. 11 | ImplicitGemm, 12 | /// Compute the convolution as sum of matrix product without forming the matrix that holds the input data. 13 | /// 14 | /// Does not need any memory workspace. 15 | /// 16 | /// The results are non-deterministic. 17 | ImplicitGemmSum, 18 | /// Compute the convolution as Fast-Fourier Transform. 19 | /// 20 | /// Needs a significant memory workspace. 21 | /// 22 | /// The results are deterministic. 23 | Fft, 24 | /// Compute the convolution as Fast-Fourier Transform with 32x32 tiles. 25 | /// 26 | /// Needs a significant memory workspace. 27 | /// 28 | /// The results are deterministic. 29 | FftTiling, 30 | } 31 | 32 | /// Different algorithms to compute the gradient with respect to the filter. 33 | #[derive(Clone, Copy, Debug)] 34 | pub enum ConvBackwardFilterAlgo { 35 | /// Attempt to automatically find the best algorithm of all the other available ones. 36 | Auto, 37 | /// Compute the convolution as matrix product without forming the matrix that holds the input data. 38 | /// 39 | /// Does not need any memory workspace. 40 | /// 41 | /// The results are deterministic. 42 | ImplicitGemm, 43 | /// Compute the convolution as sum of matrix product without forming the matrix that holds the input data. 44 | /// 45 | /// Does not need any memory workspace. 46 | /// 47 | /// The results are non-deterministic. 48 | ImplicitGemmSum, 49 | /// Similar to `ImplicitGEMMSum` but needs some workspace to precompute the implicit indices. 50 | /// 51 | /// The results are non-deterministic. 52 | ImplicitPrecompiledGemmSum, 53 | /// Compute the convolution as Fast-Fourier Transform. 54 | /// 55 | /// Needs a significant memory workspace. 56 | /// 57 | /// The results are deterministic. 58 | Fft, 59 | } 60 | 61 | /// Different algorithms to compute the forward convolution. 62 | #[derive(Clone, Copy, Debug)] 63 | pub enum ConvForwardAlgo { 64 | /// Attempt to automatically find the best algorithm of all the other available ones. 65 | Auto, 66 | /// Compute the convolution as explicit matrix product. 67 | /// 68 | /// Needs a significant memory workspace. 69 | Gemm, 70 | /// Compute the convolution as matrix product without forming the matrix that holds the input data. 71 | /// 72 | /// Does not need any memory workspace.
73 | ImplicitGemm, 74 | /// Similar to `ImplicitGEMM` but needs some workspace to precompute the implicit indices. 75 | ImplicitPrecompiledGemm, 76 | /// Compute the convolution as Fast-Fourier Transform. 77 | /// 78 | /// Needs a significant memory workspace. 79 | Fft, 80 | /// Compute the convolution as Fast-Fourier Transform with 32x32 tiles. 81 | /// 82 | /// Needs a significant memory workspace. 83 | FftTiling, 84 | /// Compute the convolution without implicit or explicit matrix-multiplication. **Do not try to use this**. 85 | /// 86 | /// Listed in cuDNN docs but cuDNN does not provide an implementation. 87 | Direct, 88 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/extension_package/forward.rs: 1 | use parenchyma::error::Result; 2 | use parenchyma::prelude::SharedTensor; 3 | use super::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration}; 4 | 5 | pub trait Forward { 6 | /// Computes a [CNN convolution] over the input tensor `x`, and then saves the `result`. 7 | /// 8 | /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network 9 | fn convolution( 10 | self: &Self, 11 | filter: &SharedTensor, 12 | x: &SharedTensor, 13 | result: &mut SharedTensor, 14 | workspace: &mut SharedTensor, 15 | configuration: &ConvolutionConfiguration) -> Result { 16 | unimplemented!() 17 | } 18 | /// Computes the exponential linear unit (ELU) over tensor `x`. 19 | /// 20 | /// Saves the `result`. 21 | fn elu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 22 | unimplemented!() 23 | } 24 | /// Computes a logarithmic softmax over the input tensor `x`, and then saves the `result`. 25 | fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 26 | unimplemented!() 27 | } 28 | /// Computes a [local response normalization] over the input tensor `x`. 29 | /// 30 | /// Saves the result to `result`. 31 | /// 32 | /// [local response normalization]: https://en.wikipedia.org/wiki/Convolutional_neural_network 33 | fn lrn( 34 | self: &Self, 35 | x: &SharedTensor, 36 | result: &mut SharedTensor, 37 | configuration: &LrnConfiguration) -> Result { 38 | unimplemented!() 39 | } 40 | /// Computes non-linear down-sampling ([max pooling]) over the input tensor `x`. 41 | /// 42 | /// Saves the result to `result`. 43 | /// 44 | /// [max pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer 45 | fn pooling_max( 46 | self: &Self, 47 | x: &SharedTensor, 48 | result: &mut SharedTensor, 49 | configuration: &PoolingConfiguration) -> Result { 50 | unimplemented!() 51 | } 52 | /// Computes the [rectified linear units] over tensor `x`. 53 | /// 54 | /// Saves the `result`. 55 | /// 56 | /// [rectified linear units]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 57 | fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 58 | unimplemented!() 59 | } 60 | /// Computes the [rectified linear units] over the input Tensor `x`. 61 | /// 62 | /// note: pointwise operations overwrite the input with the result of the operation. 63 | /// 64 | /// [rectified linear units]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks) 65 | fn relu_pointwise(&self, x: &mut SharedTensor) -> Result { 66 | unimplemented!() 67 | } 68 | /// Computes the [sigmoid function] over tensor `x`. 69 | /// 70 | /// Saves the `result`.
71 | /// 72 | /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function 73 | fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 74 | unimplemented!() 75 | } 76 | /// Computes the [sigmoid function] over the input tensor `x`. 77 | /// 78 | /// note: pointwise operations overwrite the input with the result of the operation. 79 | /// 80 | /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function 81 | fn sigmoid_pointwise(&self, x: &mut SharedTensor) -> Result { 82 | unimplemented!() 83 | } 84 | /// Computes a [softmax] over the input tensor `x`. 85 | /// 86 | /// Saves the result to `result`. 87 | /// 88 | /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function 89 | fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 90 | unimplemented!() 91 | } 92 | /// Computes the [hyperbolic tangent] over tensor `x`. 93 | /// 94 | /// Saves the `result`. 95 | /// 96 | /// [hyperbolic tangent]: https://en.wikipedia.org/wiki/Hyperbolic_function 97 | fn tanh(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 98 | unimplemented!() 99 | } 100 | /// Computes the [hyperbolic tangent] over the input Tensor `x`. 101 | /// 102 | /// note: pointwise operations overwrite the input with the result of the operation. 103 | /// 104 | /// [hyperbolic tangent]: https://en.wikipedia.org/wiki/Hyperbolic_function 105 | fn tanh_pointwise(&self, x: &mut SharedTensor) -> Result { 106 | unimplemented!() 107 | } 108 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/extension_package/mod.rs: 1 | pub use self::backward::Backward; 2 | pub use self::configuration::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration}; 3 | pub use self::convolution::{ConvBackwardDataAlgo, ConvBackwardFilterAlgo, ConvForwardAlgo}; 4 | pub use self::forward::Forward; 5 | 6 | mod backward; 7 | mod configuration; 8 | mod convolution; 9 | mod forward; 10 | 11 | use parenchyma::extension_package::ExtensionPackage; 12 | 13 | /// The deep learning package. 14 | pub enum Package { 15 | OpenCL(::frameworks::open_cl::OpenCLPackage), 16 | } 17 | 18 | impl Package { 19 | pub fn open_cl(&self) -> &::frameworks::open_cl::OpenCLPackage { 20 | match self { 21 | &Package::OpenCL(ref package) => package 22 | } 23 | } 24 | } 25 | 26 | /// Provides the functionality for a backend to support DNN-related operations. 27 | pub trait Extension: Backward + Forward { 28 | // .. 29 | } 30 | 31 | impl ExtensionPackage for Package { 32 | type Extension = Extension; 33 | 34 | fn package_name(&self) -> &'static str { 35 | return "parenchyma/deep"; 36 | } 37 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/mod.rs: 1 | pub mod native; 2 | pub mod open_cl; -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/native/mod.rs: 1 | use parenchyma::error::Result; 2 | use parenchyma::extension_package::Dependency; 3 | use parenchyma::frameworks::NativeContext as Context; 4 | use parenchyma::tensor::SharedTensor; 5 | use super::super::{Extension, Package}; 6 | use super::super::extension_package::{Backward, Forward}; 7 | 8 | impl
<P> Backward for Context<P>
where 9 | P: Dependency { 10 | fn log_softmax_grad( 11 | self: &Self, 12 | x: &SharedTensor, 13 | x_diff: &SharedTensor, 14 | result_diff: &mut SharedTensor) -> Result { 15 | let x_slice = x.as_slice().unwrap(); 16 | let x_diff_slice = x_diff.as_slice().unwrap(); 17 | let mut sum = 0.0; 18 | for &grad_val in x_diff_slice.iter() { 19 | sum += grad_val; 20 | } 21 | let res = x_slice.iter().zip(x_diff_slice.iter()) 22 | .map(|(x_val, x_diff_val)| { 23 | x_diff_val - x_val.exp() * sum 24 | }); 25 | result_diff.write_iter(res)?; 26 | Ok(()) 27 | } 28 | 29 | fn relu_grad( 30 | self: &Self, 31 | x: &SharedTensor, 32 | x_diff: &SharedTensor, 33 | result: &SharedTensor, 34 | result_diff: &mut SharedTensor) -> Result { 35 | let res = x.as_slice().unwrap().iter() 36 | .zip(x_diff.as_slice().unwrap().iter()) 37 | .map(|(x, dx)| if *x > 0.0 { *dx } else { 0.0 }); 38 | result_diff.write_iter(res)?; 39 | Ok(()) 40 | } 41 | 42 | fn sigmoid_grad( 43 | self: &Self, 44 | x: &SharedTensor, 45 | x_diff: &SharedTensor, 46 | result: &SharedTensor, 47 | result_diff: &mut SharedTensor) -> Result { 48 | let res = x.as_slice().unwrap().iter().zip(x_diff.as_slice().unwrap().iter()) 49 | .map(|(t, dt)| *t * (1.0 -*t) * *dt); 50 | result_diff.write_iter(res)?; 51 | Ok(()) 52 | } 53 | 54 | fn softmax_grad( 55 | self: &Self, 56 | x: &SharedTensor, 57 | x_diff: &SharedTensor, 58 | result_diff: &mut SharedTensor) -> Result { 59 | let mut dot = 0.0; 60 | let sig_data_slice = x.as_slice().unwrap(); 61 | let sig_dx_slice = x_diff.as_slice().unwrap(); 62 | for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) { 63 | dot += t * dt; 64 | } 65 | let res = sig_data_slice.iter().zip(sig_dx_slice.iter()).map(|(t, dt)| t * (dt - dot)); 66 | result_diff.write_iter(res)?; 67 | Ok(()) 68 | } 69 | 70 | fn tanh_grad( 71 | self: &Self, 72 | x: &SharedTensor, 73 | x_diff: &SharedTensor, 74 | result: &SharedTensor, 75 | result_diff: &mut SharedTensor) -> Result { 76 | let res = x.as_slice().unwrap().iter() 77 | .zip(x_diff.as_slice().unwrap().iter()) 78 | .map(|(x, dx)| (1.0 - x.powi(2)) * *dx); 79 | result_diff.write_iter(res)?; 80 | Ok(()) 81 | } 82 | } 83 | 84 | impl
<P> Forward for Context<P>
where 85 | P: Dependency { 86 | fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 87 | let mut max_input = ::std::f32::NEG_INFINITY; 88 | for &input in x.as_slice().unwrap() { 89 | max_input = max_input.max(input); 90 | } 91 | let mut logsum = 0.; 92 | for exp in x.as_slice().unwrap().iter().map(|t| (-(max_input - t)).exp()) { 93 | logsum += exp; 94 | } 95 | logsum = max_input + logsum.ln(); 96 | let res = x.as_slice().unwrap().iter().map(|t| t - logsum); 97 | result.write_iter(res)?; 98 | Ok(()) 99 | } 100 | 101 | fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 102 | let res = x.as_slice().unwrap().iter().map(|elem| elem.max(0.0)); 103 | result.write_iter(res)?; 104 | Ok(()) 105 | } 106 | 107 | fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 108 | let res = x.as_slice().unwrap().iter().map(|x| 1.0 / (1.0 + (-*x).exp())); 109 | result.write_iter(res)?; 110 | Ok(()) 111 | } 112 | 113 | fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 114 | let mut exps = Vec::with_capacity(x.shape().capacity()); 115 | let mut sum = 0.0; 116 | for exp in x.as_slice().unwrap().iter().map(|t| t.exp()) { 117 | exps.push(exp); 118 | sum += exp; 119 | } 120 | let res = exps.iter().map(|t| t / sum); 121 | result.write_iter(res)?; 122 | Ok(()) 123 | } 124 | 125 | fn tanh(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 126 | let res = x.as_slice().unwrap().iter().map(|elem| elem.tanh()); 127 | result.write_iter(res)?; 128 | Ok(()) 129 | } 130 | } 131 | 132 | impl
<P> Extension for Context<P>
where 133 | P: Dependency { 134 | // .. 135 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/_build.rs: 1 | use package::ParenchymaDeep; 2 | use parenchyma::{Build, Result}; 3 | use parenchyma::opencl::OpenCLContext; 4 | use parenchyma::utility::Uninitialized; 5 | use super::Package; 6 | 7 | impl Build<OpenCLContext<Uninitialized>> for ParenchymaDeep { 8 | 9 | fn build(cx: &mut OpenCLContext) -> Result { 10 | 11 | let program = cx.create_program(&[ 12 | include_str!("source/activation.cl"), 13 | include_str!("source/activationBackward.cl"), 14 | include_str!("source/convolution.cl") 15 | ])?; 16 | 17 | let cl_package = Package { 18 | tanh: program.create_kernel("tanh_float")?, 19 | sigmoid: program.create_kernel("sigmoid_float")?, 20 | relu: program.create_kernel("relu_float")?, 21 | elu: program.create_kernel("elu_float")?, 22 | 23 | tanh_backward: program.create_kernel("tanh_backward_float")?, 24 | sigmoid_backward: program.create_kernel("sigmoid_backward_float")?, 25 | relu_backward: program.create_kernel("relu_backward_float")?, 26 | elu_backward: program.create_kernel("elu_backward_float")?, 27 | 28 | convolution: program.create_kernel("convolve_ints")?, 29 | 30 | program, 31 | }; 32 | 33 | Ok(ParenchymaDeep { cl: cl_package }) 34 | } 35 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/_mod.rs: 1 | mod build; 2 | 3 | use extension::{ActivationMode, Backward, Deep, Forward}; 4 | use package::ParenchymaDeep; 5 | 6 | use parenchyma::{Result, SharedTensor}; 7 | use parenchyma::opencl::OpenCLContext; 8 | use parenchyma::opencl::high; 9 | use parenchyma::utility::Uninitialized; 10 | 11 | #[derive(Debug)] 12 | pub struct Package { 13 | program: high::Program, 14 | 15 | // === activation 16 | 17 | tanh: high::Kernel, 18 | sigmoid: high::Kernel, 19 | relu: high::Kernel, 20 | elu: high::Kernel, 21 | 22 | // === activation backward 23 | 24 | tanh_backward: high::Kernel, 25 | sigmoid_backward: high::Kernel, 26 | relu_backward: high::Kernel, 27 | elu_backward: high::Kernel, 28 | 29 | // == conv 30 | convolution: high::Kernel, 31 | } 32 | 33 | impl Deep for OpenCLContext { } 34 | 35 | impl Forward for OpenCLContext { 36 | 37 | fn activation( 38 | &self, 39 | mode: ActivationMode, 40 | input: &SharedTensor, 41 | output: &mut SharedTensor) -> Result { 42 | 43 | use extension::ActivationMode::*; 44 | 45 | let kernel = match mode { 46 | Tanh => unsafe { &self.package().cl.tanh }, 47 | Sigmoid => unsafe { &self.package().cl.sigmoid }, 48 | ReLu => unsafe { &self.package().cl.relu }, 49 | Elu => unsafe { &self.package().cl.elu }, 50 | }; 51 | 52 | let length = input.shape.capacity(); 53 | 54 | kernel.set_arg(0, input.read(self)?)?; 55 | kernel.set_arg(1, output.write(self)?)?; 56 | kernel.set_arg(2, &length)?; 57 | 58 | let global_work = &[length]; 59 | let local_work = &[]; 60 | 61 | // TODO event_wait_list 62 | let events = &[]; 63 | 64 | // TODO 65 | let event = self.device().queue() 66 | .enqueue_nd_range_kernel(kernel, global_work, local_work, events)?; 67 | 68 | Ok(()) 69 | } 70 | } 71 | 72 | impl Backward for OpenCLContext { 73 | 74 | fn activation_backward( 75 | &self, 76 | mode: ActivationMode, 77 | input: &SharedTensor, 78 | input_diff: &SharedTensor, 79 | output_diff: &mut SharedTensor) -> Result {
80 | 81 | use extension::ActivationMode::*; 82 | 83 | let kernel = match mode { 84 | Tanh => unsafe { &self.package().cl.tanh_backward }, 85 | Sigmoid => unsafe { &self.package().cl.sigmoid_backward }, 86 | ReLu => unsafe { &self.package().cl.relu_backward }, 87 | Elu => unsafe { &self.package().cl.elu_backward }, 88 | }; 89 | 90 | let length = input.shape.capacity(); 91 | 92 | kernel.set_arg(0, input.read(self)?)?; 93 | kernel.set_arg(1, input_diff.read(self)?)?; 94 | kernel.set_arg(2, output_diff.write(self)?)?; 95 | kernel.set_arg(3, &length)?; 96 | 97 | let global_work = &[length]; 98 | let local_work = &[]; 99 | 100 | // TODO event_wait_list 101 | let events = &[]; 102 | 103 | 104 | // TODO 105 | let event = self.device().queue() 106 | .enqueue_nd_range_kernel(kernel, global_work, local_work, events)?; 107 | 108 | Ok(()) 109 | } 110 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/mod.rs: 1 | pub use self::package::OpenCLPackage; 2 | 3 | mod package; 4 | 5 | use super::super::{Extension, Package}; 6 | use super::super::extension_package::{Backward, Forward}; 7 | 8 | use ocl; 9 | use parenchyma::error::Result; 10 | use parenchyma::extension_package::{Dependency, ExtensionPackageCtor}; 11 | use parenchyma::frameworks::{OpenCLContext as Context, OpenCLMemory as Memory}; 12 | use parenchyma::tensor::{self, SharedTensor}; 13 | 14 | impl ExtensionPackageCtor<Context<()>> for super::super::Package { 15 | fn package(target: &mut Context<()>) -> Result { 16 | OpenCLPackage::compile(target).map(Package::OpenCL) 17 | } 18 | } 19 | 20 | impl
<P> Backward for Context<P>
where 21 | P: Dependency { 22 | fn log_softmax_grad( 23 | &self, 24 | x: &SharedTensor, 25 | x_diff: &SharedTensor, 26 | result: &mut SharedTensor) -> Result { 27 | 28 | let n = x.shape().capacity; 29 | let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 30 | let x_diff: &Memory<_> = tensor::reference(x_diff, /*on:*/ self.device())?; 31 | let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 32 | 33 | unsafe { 34 | ocl::Kernel::new("log_softmax_backward_float", &self.extension_package().dependency().open_cl().program)? 35 | .arg_buf(x) 36 | .arg_buf(x_diff) 37 | .arg_buf(result) 38 | .arg_scl(n as i32) 39 | 40 | .gws([1, 1, 1]) 41 | .lws([1, 1, 1]) 42 | .queue(self.device().queue().clone()) 43 | .enq()?; 44 | } 45 | 46 | Ok(()) 47 | } 48 | 49 | // fn relu_grad( 50 | // self: &Self, 51 | // x: &SharedTensor, 52 | // x_diff: &SharedTensor, 53 | // result: &SharedTensor, 54 | // result_diff: &mut SharedTensor) -> Result { 55 | // let res = x.as_slice().unwrap().iter() 56 | // .zip(x_diff.as_slice().unwrap().iter()) 57 | // .map(|(x, dx)| if *x > 0.0 { *dx } else { 0.0 }); 58 | // result_diff.write_iter(res)?; 59 | // Ok(()) 60 | // } 61 | 62 | fn sigmoid_grad( 63 | self: &Self, 64 | x: &SharedTensor, 65 | x_diff: &SharedTensor, 66 | _: &SharedTensor, 67 | result_diff: &mut SharedTensor) -> Result { 68 | 69 | let n = x.shape().capacity; 70 | let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 71 | let x_diff: &Memory<_> = tensor::reference(x_diff, /*on:*/ self.device())?; 72 | // let result: &Memory<_> = tensor::reference(result, /*on:*/ self.device())?; 73 | let result_diff: &mut Memory<_> = tensor::mut_reference(result_diff, /*on:*/ self.device())?; 74 | 75 | unsafe { 76 | ocl::Kernel::new("sigmoid_backward_float", &self.extension_package().dependency().open_cl().program)? 77 | .arg_buf(x) 78 | .arg_buf(x_diff) 79 | .arg_buf(result_diff) 80 | .arg_scl(n as i32) 81 | 82 | .gws([n]) 83 | .queue(self.device().queue().clone()) 84 | .enq()?; 85 | } 86 | 87 | Ok(()) 88 | } 89 | 90 | // fn softmax_grad( 91 | // self: &Self, 92 | // x: &SharedTensor, 93 | // x_diff: &SharedTensor, 94 | // result_diff: &mut SharedTensor) -> Result { 95 | // let mut dot = 0.0; 96 | // let sig_data_slice = x.as_slice().unwrap(); 97 | // let sig_dx_slice = x_diff.as_slice().unwrap(); 98 | // for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) { 99 | // dot += t * dt; 100 | // } 101 | // let res = sig_data_slice.iter().zip(sig_dx_slice.iter()).map(|(t, dt)| t * (dt - dot)); 102 | // result_diff.write_iter(res)?; 103 | // Ok(()) 104 | // } 105 | 106 | // fn tanh_grad( 107 | // self: &Self, 108 | // x: &SharedTensor, 109 | // x_diff: &SharedTensor, 110 | // result: &SharedTensor, 111 | // result_diff: &mut SharedTensor) -> Result { 112 | // let res = x.as_slice().unwrap().iter() 113 | // .zip(x_diff.as_slice().unwrap().iter()) 114 | // .map(|(x, dx)| (1.0 - x.powi(2)) * *dx); 115 | // result_diff.write_iter(res)?; 116 | // Ok(()) 117 | // } 118 | } 119 | 120 | impl
<P> Forward for Context<P>
where 121 | P: Dependency { 122 | // fn elu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 123 | // let n = x.shape().capacity; 124 | // let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 125 | // let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 126 | 127 | // unsafe { 128 | // ocl::Kernel::new("elu_float", &self.extension_package().dependency().open_cl().program)? 129 | // .arg_buf(x) 130 | // .arg_buf(result) 131 | // .arg_scl(n as i32) 132 | 133 | // .gws([n]) 134 | // .queue(self.device().queue().clone()) 135 | // .enq()?; 136 | // } 137 | 138 | // Ok(()) 139 | // } 140 | 141 | fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 142 | let n = x.shape().capacity; 143 | let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 144 | let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 145 | 146 | unsafe { 147 | ocl::Kernel::new("log_softmax_float", &self.extension_package().dependency().open_cl().program)? 148 | .arg_buf(x) 149 | .arg_buf(result) 150 | .arg_scl(n as i32) 151 | 152 | .gws([1, 1, 1]) 153 | .lws([1, 1, 1]) 154 | .queue(self.device().queue().clone()) 155 | .enq()?; 156 | } 157 | 158 | Ok(()) 159 | } 160 | 161 | // fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 162 | // let n = x.shape().capacity; 163 | // let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 164 | // let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 165 | 166 | // unsafe { 167 | // ocl::Kernel::new("relu_float", &self.extension_package().dependency().open_cl().program)? 168 | // .arg_buf(x) 169 | // .arg_buf(result) 170 | // .arg_scl(n as i32) 171 | 172 | // .gws([n]) 173 | // .queue(self.device().queue().clone()) 174 | // .enq()?; 175 | // } 176 | 177 | // Ok(()) 178 | // } 179 | 180 | fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 181 | let n = x.shape().capacity; 182 | let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 183 | let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 184 | 185 | unsafe { 186 | ocl::Kernel::new("sigmoid_float", &self.extension_package().dependency().open_cl().program)? 187 | .arg_buf(x) 188 | .arg_buf(result) 189 | .arg_scl(n as i32) 190 | 191 | .gws([n]) 192 | .queue(self.device().queue().clone()) 193 | .enq()?; 194 | } 195 | 196 | Ok(()) 197 | } 198 | 199 | // fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result { 200 | // let n = x.shape().capacity; 201 | // let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?; 202 | // let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?; 203 | 204 | // unsafe { 205 | // ocl::Kernel::new("softmax_float", &self.extension_package().dependency().open_cl().program)? 206 | // .arg_buf(x) 207 | // .arg_buf(result) 208 | // .arg_scl(n as i32) 209 | 210 | // .gws([1, 1, 1]) 211 | // .lws([1, 1, 1]) 212 | // .queue(self.device().queue().clone()) 213 | // .enq()?; 214 | // } 215 | 216 | // Ok(()) 217 | // } 218 | } 219 | 220 | impl
<P> Extension for Context<P>
where 221 | P: Dependency { 222 | // .. 223 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/package.rs: -------------------------------------------------------------------------------- 1 | use ocl; 2 | use std::ffi::CString; 3 | use parenchyma::error::Result; 4 | use parenchyma::frameworks::OpenCLContext; 5 | 6 | /// Caches instances of `Kernel` 7 | #[derive(Debug)] 8 | pub struct OpenCLPackage { 9 | pub(in frameworks::open_cl) program: ocl::Program, 10 | } 11 | 12 | impl OpenCLPackage { 13 | pub fn compile(cx: &mut OpenCLContext<()>) -> Result { 14 | let program = cx.program(vec![ 15 | CString::new(include_str!("source/activation.cl")).unwrap(), 16 | CString::new(include_str!("source/activationBackward.cl")).unwrap(), 17 | CString::new(include_str!("source/convolution.cl")).unwrap(), 18 | CString::new(include_str!("source/softmax.cl")).unwrap() 19 | ])?; 20 | 21 | // let cl_package = Package { 22 | // tanh: program.create_kernel("tanh_float")?, 23 | // sigmoid: program.create_kernel("sigmoid_float")?, 24 | // relu: program.create_kernel("relu_float")?, 25 | // elu: program.create_kernel("elu_float")?, 26 | 27 | // tanh_backward: program.create_kernel("tanh_backward_float")?, 28 | // sigmoid_backward: program.create_kernel("sigmoid_backward_float")?, 29 | // relu_backward: program.create_kernel("relu_backward_float")?, 30 | // elu_backward: program.create_kernel("elu_backward_float")?, 31 | 32 | // convolution: program.create_kernel("convolve_ints")?, 33 | 34 | // program, 35 | // }; 36 | 37 | Ok(OpenCLPackage { program }) 38 | } 39 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/source/activation.cl: -------------------------------------------------------------------------------- 1 | #define ACTIVATION_TYPE(function, type) \ 2 | kernel void function##_##type(global const type* in, global type* out, const uintptr_t len) \ 3 | { \ 4 | const uintptr_t current = get_global_id(0); \ 5 | if(current >= len) { \ 6 | return void(); \ 7 | } \ 8 | out[current] = function(in[current]); \ 9 | } \ 10 | 11 | #define ACTIVATION(function) ACTIVATION_TYPE(function, float) ACTIVATION_TYPE(function, double) \ 12 | 13 | // ================================================================================================= 14 | 15 | ACTIVATION(tanh) 16 | 17 | #define sigmoid(x) (1 / (1 + exp(-x))) 18 | ACTIVATION(sigmoid) 19 | 20 | #define relu(x) (x > 0 ? x : 0) 21 | ACTIVATION(relu) 22 | 23 | #define elu(x) (x > 0 ? x : exp(x) - 1) 24 | ACTIVATION(elu) -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/source/activationBackward.cl: -------------------------------------------------------------------------------- 1 | 2 | // TODO newline required for some reason.. 
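For reference, the `*_float` kernels generated by the ACTIVATION macros above apply simple pointwise functions; a scalar Rust sketch (illustrative only, not part of the crate) makes the definitions explicit. `tanh` is supplied by the platform math library, so only the three `#define`d functions are spelled out here.

```rust
/// Scalar equivalents of the pointwise functions behind `sigmoid_float`,
/// `relu_float`, and `elu_float` in activation.cl.
fn sigmoid(x: f32) -> f32 { 1.0 / (1.0 + (-x).exp()) }
fn relu(x: f32) -> f32 { if x > 0.0 { x } else { 0.0 } }
fn elu(x: f32) -> f32 { if x > 0.0 { x } else { x.exp() - 1.0 } }

fn main() {
    // Each generated kernel is elementwise: out[i] = f(in[i]).
    let input = [-1.0_f32, 0.0, 2.0];
    let relu_out: Vec<f32> = input.iter().map(|&x| relu(x)).collect();
    assert_eq!(relu_out, [0.0, 0.0, 2.0]);
    assert!((sigmoid(0.0) - 0.5).abs() < 1e-6);
    assert!((elu(-1.0) - (-0.632_120_5)).abs() < 1e-6); // e^-1 - 1
}
```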
3 | #define BACKWARD_WITH_TYPE(name, type, activationDeriv) \ 4 | kernel void name##_backward_##type(global const type* in, global const type* inDiff, global type* outDiff, const uintptr_t len) \ 5 | { \ 6 | const uintptr_t current = get_global_id(0); \ 7 | if(current >= len) { \ 8 | return void(); \ 9 | } \ 10 | outDiff[current] = activationDeriv(in[current]) * inDiff[current]; \ 11 | } \ 12 | 13 | #define BACKWARD(name, deriv) \ 14 | BACKWARD_WITH_TYPE(name, float, deriv) BACKWARD_WITH_TYPE(name, double, deriv) \ 15 | 16 | // ================================================================================================= 17 | 18 | #define tanhDeriv(x) (1 - x * x) 19 | BACKWARD(tanh, tanhDeriv) 20 | 21 | #define sigmoidDeriv(x) (x * (1 - x)) 22 | BACKWARD(sigmoid, sigmoidDeriv) 23 | 24 | #define reluDeriv(x) (x > 0 ? 1 : 0) 25 | BACKWARD(relu, reluDeriv) 26 | 27 | #define eluDeriv(x) (x > 0 ? 1 : x + 1) 28 | BACKWARD(elu, eluDeriv) -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/source/convolution.cl: -------------------------------------------------------------------------------- 1 | 2 | // TODO newline required for some reason.. 3 | 4 | // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail 5 | // 6 | // This Source Code Form is subject to the terms of the Mozilla Public License, 7 | // v. 2.0. If a copy of the MPL was not distributed with this file, You can 8 | // obtain one at http://mozilla.org/MPL/2.0/. 9 | 10 | // expected defines: 11 | // one of: [ TANH | RELU | LINEAR ] 12 | // BIASED (or not) 13 | 14 | void kernel convolve_ints(global const int *p_imageSize, global const int *p_filterSize, 15 | global const int *image, global const int *filter, global int *result) { 16 | int id = get_global_id(0); 17 | int imageSize = p_imageSize[0]; 18 | int filterSize = p_filterSize[0]; 19 | int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize); 20 | int localid = id % (imageSize * imageSize); 21 | int row = localid / imageSize; 22 | int col = localid % imageSize; 23 | int halfFilterSize = filterSize >> 1; 24 | int sum = 0; 25 | int minm = max(-halfFilterSize, -row); 26 | int maxm = min(halfFilterSize, imageSize - 1 - row); 27 | int minn = max(-halfFilterSize, -col); 28 | int maxn = min(halfFilterSize, imageSize - 1 - col); 29 | int m = minm; 30 | while(m <= maxm) { 31 | int x = (row + m); 32 | int ximage = imageOffset + x * imageSize; 33 | int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize; 34 | int n = minn; 35 | while(n <= maxn) { 36 | int y = col + n; 37 | sum += image[ ximage + y] * filter[ filterrowoffset + n ]; 38 | n++; 39 | } 40 | m++; 41 | } 42 | result[id] = sum; 43 | } 44 | 45 | void kernel convolve_floats(global const int *p_imageSize, global const int *p_filterSize, 46 | global const float *image, global const float *filter, global float *result) { 47 | int id = get_global_id(0); 48 | int imageSize = p_imageSize[0]; 49 | int filterSize = p_filterSize[0]; 50 | int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize); 51 | int localid = id % (imageSize * imageSize); 52 | int row = localid / imageSize; 53 | int col = localid % imageSize; 54 | int halfFilterSize = filterSize >> 1; 55 | float sum = 0; 56 | int minm = max(-halfFilterSize, -row); 57 | int maxm = min(halfFilterSize, imageSize - 1 - row); 58 | int minn = max(-halfFilterSize, -col); 59 | int maxn = min(halfFilterSize, imageSize - 1 - col); 60 | int m = minm; 61 | while(m <= maxm) { 62 | 
int x = (row + m); 63 | int ximage = imageOffset + x * imageSize; 64 | int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize; 65 | int n = minn; 66 | while(n <= maxn) { 67 | int y = col + n; 68 | sum += image[ ximage + y] * filter[ filterrowoffset + n ]; 69 | n++; 70 | } 71 | m++; 72 | } 73 | result[id] = sum; 74 | } 75 | 76 | void kernel convolve_imagecubes_int(global const int *p_numInputPlanes, global const int *p_numFilters, 77 | global const int *p_imageSize, global const int *p_filterSize, 78 | global const int *images, global const int *filters, global int *output) { 79 | int globalId = get_global_id(0); 80 | 81 | int numInputPlanes = p_numInputPlanes[0]; 82 | int numFilters = p_numFilters[0]; 83 | int imageSize = p_imageSize[0]; 84 | int filterSize = p_filterSize[0]; 85 | int imageSizeSquared = imageSize * imageSize; 86 | 87 | int outputImage2Id = globalId / imageSizeSquared; 88 | int filterId = outputImage2Id % numFilters; 89 | int inputImage3Id = outputImage2Id / numFilters; 90 | 91 | int filterOffset = filterId * filterSize * filterSize; 92 | int inputImage3Offset = inputImage3Id * numInputPlanes * imageSizeSquared; 93 | 94 | // intraimage coords 95 | int localid = globalId % imageSizeSquared; 96 | int row = localid / imageSize; 97 | int col = localid % imageSize; 98 | 99 | int halfFilterSize = filterSize >> 1; 100 | int sum = 0; 101 | int minm = max(-halfFilterSize, -row); 102 | int maxm = min(halfFilterSize, imageSize - 1 - row); 103 | int minn = max(-halfFilterSize, -col); 104 | int maxn = min(halfFilterSize, imageSize - 1 - col); 105 | int plane = 0; 106 | while(plane < numInputPlanes) { 107 | int inputImageOffset = inputImage3Offset + plane * imageSizeSquared; 108 | int filterPlaneOffset = filterOffset + plane * filterSize * filterSize; 109 | int m = minm; 110 | while(m <= maxm) { 111 | int y = row + m; 112 | int inputimagerowoffset = inputImageOffset + y * imageSize; 113 | int filterrowoffset = filterPlaneOffset + (m+halfFilterSize) * filterSize + halfFilterSize; 114 | int n = minn; 115 | while(n <= maxn) { 116 | int x = col + n; 117 | sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ]; 118 | n++; 119 | } 120 | m++; 121 | } 122 | plane++; 123 | } 124 | output[globalId] = sum; 125 | } 126 | 127 | // receive images as a stack of images 128 | // globalid = n * numfilters * imagesize * imagesize + filter * imagesize * imagesize + imagerow * imagesize + imagecol 129 | // globalid globalid 130 | // inputimage3 1 inputimage2 1----filter 1 -> outputimage2 1 outputimage3 1 131 | // inputimage2 2_/\_filter 2 -> outputimage2 2 132 | // inputimage3 2 inputimage2 3 filter 1 -> outputimage2 3 outputimage3 2 133 | // inputimage2 4 filter 2 -> outputimage2 4 134 | // 135 | // each outputimage is only written once, by a combination of: 136 | // - one inputimage3 137 | // - one filter 138 | // each inputimage3 is mapped to each filter once, each time writing to one outputimage 139 | // 140 | // images is: 141 | // numimages * numinputplanes * imagesizesquared 142 | // filters is: 143 | // numfilters * numinputplanes * filtersizesquared 144 | // outputs is: 145 | // numimages * numfilters * outputimagesizesquared 146 | 147 | // images are organized like [imageId][plane][row][col] 148 | // filters are organized like [filterid][plane][filterrow][filtercol] 149 | // output are organized like [imageid][filterid][row][col] 150 | void kernel convolve_imagecubes_float( 151 | const int numInputPlanes, const int numFilters, 152 | const int imageSize, const int 
filterSize, 153 | global const float *images, global const float *filters, global float *output) { 154 | int globalId = get_global_id(0); 155 | 156 | int imageSizeSquared = imageSize * imageSize; 157 | 158 | int outputImage2Id = globalId / imageSizeSquared; 159 | int filterId = outputImage2Id % numFilters; 160 | int inputImage3Id = outputImage2Id / numFilters; 161 | 162 | int filterOffset = filterId * filterSize * filterSize; 163 | int inputImage3Offset = inputImage3Id * numInputPlanes * imageSizeSquared; 164 | 165 | // intraimage coords 166 | int localid = globalId % imageSizeSquared; 167 | int row = localid / imageSize; 168 | int col = localid % imageSize; 169 | 170 | int halfFilterSize = filterSize >> 1; 171 | float sum = 0; 172 | // m should vary from -halfFilterSize through 0 to halfFilterSize 173 | // n too... 174 | int minm = max(-halfFilterSize, -row); 175 | int maxm = min(halfFilterSize, imageSize - 1 - row); 176 | int minn = max(-halfFilterSize, -col); 177 | int maxn = min(halfFilterSize, imageSize - 1 - col); 178 | int inputPlane = 0; 179 | while(inputPlane < numInputPlanes) { 180 | int inputImageOffset = inputImage3Offset + inputPlane * imageSizeSquared; 181 | int m = minm; 182 | while(m <= maxm) { 183 | int y = row + m; 184 | int inputimagerowoffset = inputImageOffset + y * imageSize; 185 | int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize; 186 | int n = minn; 187 | while(n <= maxn) { 188 | int x = col + n; 189 | sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ]; 190 | n++; 191 | } 192 | m++; 193 | } 194 | inputPlane++; 195 | } 196 | 197 | output[globalId] = sum; 198 | } 199 | 200 | void kernel convolve_imagecubes_float_nopadzeros( 201 | const int numInputPlanes, const int numFilters, 202 | const int inputSize, const int filterSize, 203 | global const float *images, global const float *filters, global float *output) { 204 | int globalId = get_global_id(0); 205 | 206 | int inputSizeSquared = inputSize * inputSize; 207 | int outputSize = inputSize - filterSize + 1; 208 | int outputSizeSquared = outputSize * outputSize; 209 | 210 | int outputImage2Id = globalId / outputSizeSquared; 211 | int filterId = outputImage2Id % numFilters; 212 | int inputImage3Id = outputImage2Id / numFilters; 213 | 214 | int filterOffset = filterId * filterSize * filterSize; 215 | int inputImage3Offset = inputImage3Id * numInputPlanes * inputSizeSquared; 216 | 217 | // intraimage coords 218 | int localid = globalId % outputSizeSquared; 219 | int outputRow = localid / outputSize; 220 | int outputCol = localid % outputSize; 221 | 222 | int halfFilterSize = filterSize >> 1; 223 | float sum = 0; 224 | int minm = -halfFilterSize; 225 | int maxm = halfFilterSize; 226 | int minn = -halfFilterSize; 227 | int maxn = halfFilterSize; 228 | int inputPlane = 0; 229 | while(inputPlane < numInputPlanes) { 230 | int inputImageOffset = inputImage3Offset + inputPlane * inputSizeSquared; 231 | int m = minm; 232 | while(m <= maxm) { 233 | int inputRow = outputRow + m + halfFilterSize; 234 | int inputimagerowoffset = inputImageOffset + inputRow * inputSize; 235 | int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize; 236 | int n = minn; 237 | while(n <= maxn) { 238 | int inputCol = outputCol + n + halfFilterSize; 239 | sum += images[ inputimagerowoffset + inputCol] * filters[ filterrowoffset + n ]; 240 | n++; 241 | } 242 | m++; 243 | } 244 | inputPlane++; 245 | } 246 | output[globalId] = sum; 247 | } 248 | 249 | 
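The index arithmetic above is the part most likely to drift out of sync with host code, so a minimal host-side reference in Rust is sketched below. It is an illustration only, not part of the crate: it assumes the row-major `[imageId][plane][row][col]` image layout documented above and follows the documented `[filterid][plane][filterrow][filtercol]` filter layout, i.e. it advances the filter offset per input plane the way `convolve_imagecubes_int` does (note that `convolve_imagecubes_float` as written reuses the first filter plane for every input plane).

```rust
/// CPU reference for the zero-padded image-cube convolution above. `images` is
/// `[num_images][num_planes][size][size]` and `filters` is
/// `[num_filters][num_planes][filter][filter]`, both flattened row-major; the
/// result is `[num_images][num_filters][size][size]`.
fn convolve_imagecubes_reference(
    images: &[f32], filters: &[f32],
    num_images: usize, num_planes: usize, num_filters: usize,
    size: usize, filter: usize,
) -> Vec<f32> {
    let half = (filter >> 1) as isize;
    let mut output = vec![0.0f32; num_images * num_filters * size * size];
    for (global_id, out) in output.iter_mut().enumerate() {
        // Decompose the flat index exactly the way the kernels decompose `globalId`.
        let col = global_id % size;
        let row = (global_id / size) % size;
        let filter_id = (global_id / (size * size)) % num_filters;
        let image_id = global_id / (size * size * num_filters);
        let mut sum = 0.0f32;
        for plane in 0..num_planes {
            let image_offset = (image_id * num_planes + plane) * size * size;
            let filter_offset = (filter_id * num_planes + plane) * filter * filter;
            for m in -half..=half {
                let y = row as isize + m;
                if y < 0 || y >= size as isize { continue; } // implicit zero padding
                for n in -half..=half {
                    let x = col as isize + n;
                    if x < 0 || x >= size as isize { continue; }
                    sum += images[image_offset + y as usize * size + x as usize]
                         * filters[filter_offset
                             + (m + half) as usize * filter
                             + (n + half) as usize];
                }
            }
        }
        *out = sum;
    }
    output
}
```

With `num_planes = num_filters = 1` the sketch reduces to exactly what `convolve_floats` computes.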
-------------------------------------------------------------------------------- /crates/parenchyma-deep/src/frameworks/open_cl/source/softmax.cl: -------------------------------------------------------------------------------- 1 | 2 | // TODO write a parallel reduction: 3 | // 4 | // * find the max element in the array 5 | // * .. 6 | 7 | #define SOFTMAX(interfn) \ 8 | __kernel __attribute((reqd_work_group_size(1, 1, 1))) \ 9 | void interfn##_float(__global float* x, __global float* result, const uintptr_t len) { \ 10 | float in_max = -MAXFLOAT; \ 11 | float sum = 0.0; \ 12 | uintptr_t i; \ 13 | for(i = 0; i < len; i++) { \ 14 | float current = x[i]; \ 15 | in_max = (in_max > current) ? in_max : current; \ 16 | } \ 17 | for(i = 0; i < len; i++) { \ 18 | float current = exp(x[i] - in_max); \ 19 | sum += current; \ 20 | result[i] = current; \ 21 | } \ 22 | for(i = 0; i < len; i++) { \ 23 | result[i] = interfn(result[i] / sum); \ 24 | } \ 25 | } \ 26 | 27 | #define softmax(x) (x) 28 | SOFTMAX(softmax) 29 | 30 | #define log_softmax(x) (log(x)) 31 | SOFTMAX(log_softmax) 32 | 33 | __kernel void log_softmax_backward_float( 34 | __global float* x, 35 | __global float* x_diff, 36 | __global float* result, 37 | const uintptr_t len) 38 | { 39 | float sum = 0.0; 40 | uintptr_t i; 41 | for(i = 0; i < len; i++) { 42 | sum += x_diff[i]; 43 | } 44 | for(i = 0; i < len; i++) { 45 | result[i] = x_diff[i] - exp(x[i]) * sum; 46 | } 47 | } -------------------------------------------------------------------------------- /crates/parenchyma-deep/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Parenchyma extension package for backend-agnostic deep neural network (NN) operations. 2 | 3 | #![allow(unused_variables)] 4 | #![feature(non_modrs_mods)] 5 | 6 | extern crate ocl; 7 | extern crate parenchyma; 8 | 9 | pub use self::extension_package::{Extension, Package}; 10 | pub mod frameworks; 11 | 12 | mod extension_package; -------------------------------------------------------------------------------- /crates/parenchyma-deep/tests/deep_specs.rs: -------------------------------------------------------------------------------- 1 | #![feature(rustc_private)] 2 | 3 | #[macro_use] 4 | extern crate lazy_static; 5 | extern crate parenchyma; 6 | extern crate parenchyma_deep; 7 | 8 | #[cfg(test)] 9 | mod deep_specification_native { 10 | use parenchyma::frameworks::Native; 11 | use parenchyma::prelude::*; 12 | use parenchyma_deep::*; 13 | 14 | struct TestBackend(Backend); 15 | impl ::std::ops::Deref for TestBackend { 16 | type Target = Backend; 17 | fn deref(&self) -> &Self::Target { &self.0 } 18 | } 19 | unsafe impl Sync for TestBackend { } 20 | 21 | lazy_static! 
{ 22 | static ref BACKEND: TestBackend = TestBackend(Backend::new::>().unwrap()); 23 | } 24 | 25 | fn get_memory() -> (SharedTensor, SharedTensor) { 26 | let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap(); 27 | let result: SharedTensor = SharedTensor::from([1, 1, 3]); 28 | (x, result) 29 | } 30 | 31 | fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){ 32 | let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 33 | let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 34 | let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 35 | let result_diff = SharedTensor::from([1, 1, 3]); 36 | (x, x_diff, result, result_diff) 37 | } 38 | 39 | fn get_memory_softmax() -> (SharedTensor, SharedTensor) { 40 | let x = SharedTensor::with([1, 1, 4], vec![1.0; 4]).unwrap(); 41 | let result: SharedTensor = SharedTensor::from([1, 1, 4]); 42 | (x, result) 43 | } 44 | 45 | #[test] 46 | fn it_computes_correct_log_softmax_on_for_f32() { 47 | let (mut x, mut result) = get_memory_softmax(); 48 | BACKEND.log_softmax(&mut x, &mut result).unwrap(); 49 | assert_eq!(&[-1.3862944, -1.3862944, -1.3862944, -1.3862944], result.as_slice().unwrap()); 50 | } 51 | 52 | #[test] 53 | fn it_computes_correct_log_softmax_grad_on_for_f32() { 54 | let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory(); 55 | BACKEND.log_softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap(); 56 | assert_eq!(&[-9.873127, -9.873127, -27.556225], result_diff.as_slice().unwrap()); 57 | } 58 | 59 | #[test] 60 | fn it_computes_correct_relu_on_for_f32() { 61 | let (mut x, mut result) = get_memory(); 62 | BACKEND.relu(&mut x, &mut result).unwrap(); 63 | assert_eq!(&[1., 1., 2.], result.as_slice().unwrap()); 64 | } 65 | 66 | #[test] 67 | fn it_computes_correct_relu_grad_on_for_f32() { 68 | let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory(); 69 | BACKEND.relu_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap(); 70 | assert_eq!(&[1., 1., 2.], result_diff.as_slice().unwrap()); 71 | } 72 | 73 | #[test] 74 | fn it_computes_correct_sigmoid_on_for_f32() { 75 | let (mut x, mut result) = get_memory(); 76 | BACKEND.sigmoid(&mut x, &mut result).unwrap(); 77 | assert_eq!(&[0.7310585786, 0.7310586, 0.880797], result.as_slice().unwrap()); 78 | } 79 | 80 | #[test] 81 | fn it_computes_correct_sigmoid_grad_on_for_f32() { 82 | let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory(); 83 | BACKEND.sigmoid_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap(); 84 | assert_eq!(&[0., 0., -4.], result_diff.as_slice().unwrap()); 85 | } 86 | 87 | #[test] 88 | fn it_computes_correct_softmax_on_for_f32() { 89 | let (mut x, mut result) = get_memory_softmax(); 90 | BACKEND.softmax(&mut x, &mut result).unwrap(); 91 | assert_eq!(&[0.25, 0.25, 0.25, 0.25], result.as_slice().unwrap()); 92 | } 93 | 94 | #[test] 95 | fn it_computes_correct_softmax_grad_on_for_f32() { 96 | let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory(); 97 | BACKEND.softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap(); 98 | assert_eq!(&[-5., -5., -8.], result_diff.as_slice().unwrap()); 99 | } 100 | } 101 | 102 | #[cfg(test)] 103 | mod deep_specification_opencl { 104 | use parenchyma::frameworks::OpenCL; 105 | use parenchyma::hardware::{Hardware, HardwareKind}; 106 | use parenchyma::prelude::*; 107 | use parenchyma_deep::*; 108 | 109 | struct TestBackend(Backend); 110 | impl ::std::ops::Deref for TestBackend { 111 | type Target 
= Backend; 112 | fn deref(&self) -> &Self::Target { &self.0 } 113 | } 114 | unsafe impl Sync for TestBackend { } 115 | 116 | lazy_static! { 117 | static ref BACKEND: TestBackend = { 118 | let mut backend: Backend = Backend::new::>().unwrap(); 119 | // required here! 120 | backend.select(&|hardware| hardware.kind == HardwareKind::GPU); 121 | TestBackend(backend) 122 | }; 123 | } 124 | 125 | fn get_memory() -> (SharedTensor, SharedTensor) { 126 | let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap(); 127 | let result: SharedTensor = SharedTensor::from([1, 1, 3]); 128 | (x, result) 129 | } 130 | 131 | fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){ 132 | let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 133 | let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 134 | let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 135 | let result_diff = SharedTensor::from([1, 1, 3]); 136 | (x, x_diff, result, result_diff) 137 | } 138 | 139 | fn get_memory_softmax() -> (SharedTensor, SharedTensor) { 140 | let x = SharedTensor::with([1, 1, 4], vec![1.0; 4]).unwrap(); 141 | let result: SharedTensor = SharedTensor::from([1, 1, 4]); 142 | (x, result) 143 | } 144 | 145 | #[test] 146 | fn it_computes_correct_log_softmax_on_for_f32() { 147 | let (mut x, mut result) = get_memory_softmax(); 148 | BACKEND.log_softmax(&mut x, &mut result).unwrap(); 149 | assert_eq!(&[-1.3862944, -1.3862944, -1.3862944, -1.3862944], result.as_slice().unwrap()); 150 | } 151 | 152 | #[test] 153 | fn it_computes_correct_relu_on_for_f32() { 154 | let (mut x, mut result) = get_memory(); 155 | BACKEND.relu(&mut x, &mut result).unwrap(); 156 | assert_eq!(&[1., 1., 2.], result.as_slice().unwrap()); 157 | } 158 | 159 | #[test] 160 | fn it_computes_correct_sigmoid_on_for_f32() { 161 | let (mut x, mut result) = get_memory(); 162 | BACKEND.sigmoid(&mut x, &mut result).unwrap(); 163 | assert_eq!(&[0.7310585786, 0.7310586, 0.880797], result.as_slice().unwrap()); 164 | } 165 | 166 | #[test] 167 | fn it_computes_correct_softmax_on_for_f32() { 168 | let (mut x, mut result) = get_memory_softmax(); 169 | BACKEND.softmax(&mut x, &mut result).unwrap(); 170 | assert_eq!(&[0.25, 0.25, 0.25, 0.25], result.as_slice().unwrap()); 171 | } 172 | } 173 | 174 | #[cfg(test)] 175 | mod deep_specification_backward_opencl { 176 | use parenchyma::frameworks::OpenCL; 177 | use parenchyma::hardware::{Hardware, HardwareKind}; 178 | use parenchyma::prelude::*; 179 | use parenchyma_deep::*; 180 | 181 | struct TestBackend(Backend); 182 | impl ::std::ops::Deref for TestBackend { 183 | type Target = Backend; 184 | fn deref(&self) -> &Self::Target { &self.0 } 185 | } 186 | unsafe impl Sync for TestBackend { } 187 | 188 | lazy_static! { 189 | static ref BACKEND: TestBackend = { 190 | let mut backend: Backend = Backend::new::>().unwrap(); 191 | // required here! 
192 | backend.select(&|hardware| hardware.kind == HardwareKind::GPU); 193 | TestBackend(backend) 194 | }; 195 | } 196 | 197 | fn get_memory() -> (SharedTensor, SharedTensor) { 198 | let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap(); 199 | let result: SharedTensor = SharedTensor::from([1, 1, 3]); 200 | (x, result) 201 | } 202 | 203 | fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){ 204 | let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 205 | let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 206 | let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap(); 207 | let result_diff = SharedTensor::from([1, 1, 3]); 208 | (x, x_diff, result, result_diff) 209 | } 210 | 211 | #[test] 212 | fn it_computes_correct_log_softmax_grad_on_for_f32() { 213 | let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory(); 214 | BACKEND.log_softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap(); 215 | assert_eq!(&[-9.873127, -9.873127, -27.556223], result_diff.as_slice().unwrap()); 216 | } 217 | 218 | #[test] 219 | fn it_computes_correct_sigmoid_grad_on_for_f32() { 220 | let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory(); 221 | BACKEND.sigmoid_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap(); 222 | assert_eq!(&[0., 0., -4.], result_diff.as_slice().unwrap()); 223 | } 224 | } -------------------------------------------------------------------------------- /crates/parenchyma-ml/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target 3 | **/*.rs.bk 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /crates/parenchyma-ml/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "parenchyma-ml" 3 | version = "0.1.0" 4 | authors = ["Jony "] 5 | license = "MIT/Apache-2.0" 6 | 7 | [dependencies.parenchyma] 8 | path = "../../" 9 | version = "0.0.4" 10 | 11 | [dependencies.parenchyma-blas] 12 | path = "../parenchyma-blas" 13 | 14 | [dependencies.parenchyma-deep] 15 | path = "../parenchyma-deep" -------------------------------------------------------------------------------- /crates/parenchyma-ml/src/extension_package.rs: -------------------------------------------------------------------------------- 1 | use parenchyma::extension_package::{Dependency, ExtensionPackage}; 2 | use super::{parenchyma_blas, parenchyma_deep}; 3 | 4 | /// The machine learning package. 5 | pub struct Package { 6 | /// The BLAS package. 7 | pub(crate) blas: parenchyma_blas::Package, 8 | /// The Deep NN package. 9 | pub(crate) deep: parenchyma_deep::Package, 10 | } 11 | 12 | impl Dependency for Package { 13 | fn dependency(&self) -> &parenchyma_blas::Package { 14 | &self.blas 15 | } 16 | } 17 | 18 | impl Dependency for Package { 19 | fn dependency(&self) -> &parenchyma_deep::Package { 20 | &self.deep 21 | } 22 | } 23 | 24 | /// **note**: should be replaced with an actual trait alias ([RFC#1733]). 25 | /// 26 | /// [RFC#1733]: https://github.com/rust-lang/rfcs/pull/1733 27 | pub trait Dependencies: 28 | Dependency + 29 | Dependency { 30 | //.. 31 | } 32 | 33 | impl Dependencies for D 34 | where D: 35 | Dependency + 36 | Dependency { 37 | // .. 38 | } 39 | 40 | pub trait Extension 41 | where Self: 42 | parenchyma_blas::Extension + 43 | parenchyma_deep::Extension { 44 | // .. 
45 | } 46 | 47 | impl ExtensionPackage for Package { 48 | type Extension = Extension; 49 | fn package_name(&self) -> &'static str { 50 | return "parenchyma/ml"; 51 | } 52 | } -------------------------------------------------------------------------------- /crates/parenchyma-ml/src/frameworks/mod.rs: -------------------------------------------------------------------------------- 1 | mod native; 2 | mod open_cl; -------------------------------------------------------------------------------- /crates/parenchyma-ml/src/frameworks/native.rs: -------------------------------------------------------------------------------- 1 | use parenchyma::frameworks::NativeContext as Context; 2 | use super::super::{Dependencies, Extension}; 3 | 4 | impl
<P> Extension for Context<P>
where P: Dependencies { } -------------------------------------------------------------------------------- /crates/parenchyma-ml/src/frameworks/open_cl.rs: -------------------------------------------------------------------------------- 1 | use super::super::{Dependencies, Extension, Package}; 2 | 3 | use parenchyma::error::Result; 4 | use parenchyma::extension_package::ExtensionPackageCtor; 5 | use parenchyma::frameworks::OpenCLContext as Context; 6 | use parenchyma_blas::Package as BLASPackage; 7 | use parenchyma_blas::frameworks::open_cl::OpenCLPackage as OpenCLBLASPackage; 8 | use parenchyma_deep::Package as DeepPackage; 9 | use parenchyma_deep::frameworks::open_cl::OpenCLPackage as OpenCLDeepPackage; 10 | 11 | impl
<P> Extension for Context<P>
where P: Dependencies { } 12 | 13 | impl ExtensionPackageCtor<Context<()>> for Package { 14 | fn package(target: &mut Context<()>) -> Result<Self> { 15 | let blas = OpenCLBLASPackage::compile(target).map(BLASPackage::OpenCL)?; 16 | let deep = OpenCLDeepPackage::compile(target).map(DeepPackage::OpenCL)?; 17 | 18 | Ok(Package { blas, deep }) 19 | } 20 | } -------------------------------------------------------------------------------- /crates/parenchyma-ml/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Parenchyma extension bundle for backend-agnostic operations related to machine learning (ML). 2 | //! 3 | //! A Parenchyma package that bundles the BLAS and Deep NN packages together to make one convenient 4 | //! ML package. 5 | //! 6 | //! # Example Usage 7 | //! 8 | //! ```ignore 9 | //! extern crate parenchyma; 10 | //! extern crate parenchyma_ml; 11 | //! 12 | //! #[macro_use] 13 | //! use parenchyma::prelude::*; 14 | //! use extension_package::package::Package as MachLrnPackage; 15 | //! 16 | //! // Initialize an OpenCL or CUDA backend packaged with the NN extension. 17 | //! let backend = BackendConfig::::new::()?; 18 | //! 19 | //! // Initialize two tensors. 20 | //! let ref x: SharedTensor = array![3.5, 12.4, 0.5, 6.5].into(); 21 | //! let ref mut result: SharedTensor = x.shape().into(); 22 | //! 23 | //! // Run the sigmoid operation, provided by the NN extension, on your OpenCL/CUDA enabled 24 | //! // GPU (or CPU, which is possible through OpenCL) 25 | //! backend.sigmoid(x, result)?; 26 | //! 27 | //! // Print the result: `[0.97068775, 0.9999959, 0.62245935, 0.9984988] shape=[4], strides=[1]` 28 | //! println!("{:?}", result); 29 | //! ``` 30 | 31 | extern crate parenchyma; 32 | extern crate parenchyma_blas; 33 | extern crate parenchyma_deep; 34 | 35 | pub use self::extension_package::{Dependencies, Extension, Package}; 36 | 37 | mod extension_package; 38 | mod frameworks; -------------------------------------------------------------------------------- /crates/parenchyma-tr/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock -------------------------------------------------------------------------------- /crates/parenchyma-tr/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "parenchyma-tr" 3 | version = "0.1.0" 4 | authors = ["Jony "] 5 | license = "MIT/Apache-2.0" 6 | 7 | [dependencies] 8 | image = "0.17.0" 9 | 10 | [dependencies.parenchyma] 11 | path = "../parenchyma" -------------------------------------------------------------------------------- /crates/parenchyma-tr/README.md: -------------------------------------------------------------------------------- 1 | # parenchyma-tr 2 | 3 | **parenchyma-tr**: processing data for machine learning tasks and making the output data 4 | easier to work with. -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/lib.rs: -------------------------------------------------------------------------------- 1 | //!
A framework for pre- and post-processing machine-intelligence-based data 2 | 3 | extern crate image; 4 | extern crate parenchyma; 5 | 6 | pub use self::transformer::Transformer; 7 | 8 | mod transformer; 9 | mod transformers; -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/transformer.rs: -------------------------------------------------------------------------------- 1 | use parenchyma::error::Result; 2 | use parenchyma::prelude::SharedTensor; 3 | 4 | /// A trait for dealing with transformers so that any transformable data type can be 5 | /// transformed into a `SharedTensor`. 6 | pub trait Transformer { 7 | /// Returns the non-numeric data as a vector. 8 | fn as_vector(&self) -> Vec<f32>; 9 | /// Transforms (possibly non-numeric) data into a numeric `SharedTensor` with the provided 10 | /// `shape`. 11 | /// 12 | /// # Returns 13 | /// 14 | /// An `Error` is returned if the expected capacity (defined by the `shape`) differs from the 15 | /// observed one. 16 | fn transform(&self, shape: &[usize]) -> Result<SharedTensor> { 17 | SharedTensor::with(shape, self.as_vector()) 18 | } 19 | } -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/transformers/audio.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonysy/parenchyma/d6043971a0b4cdea0430b4d0face7be9cf2ccde9/crates/parenchyma-tr/src/transformers/audio.rs -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/transformers/image.rs: -------------------------------------------------------------------------------- 1 | use image::DynamicImage; 2 | 3 | use super::super::Transformer; 4 | 5 | impl Transformer for DynamicImage { 6 | fn as_vector(&self) -> Vec<f32> { 7 | self.raw_pixels().iter().map(|&elem| elem as f32).collect() 8 | } 9 | } -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/transformers/mod.rs: -------------------------------------------------------------------------------- 1 | mod audio; 2 | mod image; 3 | mod word; -------------------------------------------------------------------------------- /crates/parenchyma-tr/src/transformers/word.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonysy/parenchyma/d6043971a0b4cdea0430b4d0face7be9cf2ccde9/crates/parenchyma-tr/src/transformers/word.rs -------------------------------------------------------------------------------- /license/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Project Developers & Contributors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /license/README.md: -------------------------------------------------------------------------------- 1 | Licensed under either 2 | 3 | * Apache license, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 4 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) -------------------------------------------------------------------------------- /src/backend.rs: -------------------------------------------------------------------------------- 1 | //! The `Backend` is the heart of Parenchyma. It provides an interface for running parallel 2 | //! computations on one or more devices. It is the main and highest struct of Parenchyma. 3 | //! 4 | //! The `Backend` type is an abstraction over a [framework](./trait.Framework.html) and is used as 5 | //! a way to interact with your devices. You can create a backend for computation by first choosing 6 | //! a specific [framework](./trait.Framework.html) such as Open CL and afterwards selecting one or 7 | //! many available hardware to create a backend. A backend provides you with the functionality of 8 | //! managing the memory of the devices and copying memory objects to/from the host. Additionally, 9 | //! backends allow you to execute operations in parallel through kernel functions on the device(s) 10 | //! of the backend. 11 | //! 12 | //! # Architecture 13 | //! 14 | //! Backends are initialized by providing a framework and a selection of devices compatible with 15 | //! the framework to the [`Backend::new`](#method.new) associated function, or by simply 16 | //! calling [`Backend::default`](#method.default). The framework determines which devices are 17 | //! available and how parallel kernel functions can be executed. 18 | //! 19 | //! # Example 20 | //! 21 | //! ``` 22 | //! extern crate parenchyma; 23 | //! 24 | //! use parenchyma::frameworks::Native; 25 | //! use parenchyma::prelude::*; 26 | //! 27 | //! // The `new` function initializes the framework on which it's called. 28 | //! let framework: Native = Native::new().unwrap(); 29 | //! // The available frameworks can be obtained through the chosen `framework`. 30 | //! let hardware = framework.hardware().to_vec(); 31 | //! // A ready to go backend can be created from the framework and hardware. It's worth noting that 32 | //! // configuration options will be available in future versions. 33 | //! let backend: Backend = Backend::with(framework, hardware).unwrap(); 34 | //! ``` 35 | 36 | use std::fmt; 37 | use std::ops::Deref; 38 | 39 | use super::compute_device::ComputeDevice; 40 | use super::context::{Context, ContextCtor}; 41 | use super::error::{Error, ErrorKind, Result}; 42 | use super::extension_package::ExtensionPackage; 43 | use super::framework::{Framework, FrameworkCtor}; 44 | use super::hardware::Hardware; 45 | 46 | /// The representation of the backend. 47 | pub struct Backend { 48 | /// Provides the Framework. 49 | /// 50 | /// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be 51 | /// used and determines which hardwares will be available and how parallel kernel 52 | /// functions can be executed. 
53 | framework: Box<Framework>, 54 | /// The context associated with the `framework`. 55 | /// 56 | /// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one 57 | /// or more devices that are capable of executing methods and synchronizing memory. See 58 | /// the `Context` trait for more information. 59 | context: Box<Context<Package = P>>, 60 | /// All _activatable_ hardware provided to the context. 61 | /// 62 | /// A cache of the hardware selection which is used as a representation of each framework's 63 | /// list of available devices when selecting a new active device. 64 | selection: Vec<Hardware> 65 | } 66 | 67 | impl
<P> Backend<P>
where P: ExtensionPackage { 68 | /// Constructs a backend of the provided type with its default configurations. 69 | /// 70 | /// # Return value 71 | /// 72 | /// The return value is a backend if the process goes well; otherwise, it returns 73 | /// a simple error. 74 | pub fn new<F>() -> Result<Self> 75 | where F: FrameworkCtor, 76 | F::Context: ContextCtor<P, F = F> { 77 | 78 | let framework = F::new()?; 79 | let hardware = framework.hardware().to_vec(); 80 | Self::with(framework, hardware) 81 | } 82 | 83 | /// Constructs a backend from the specified `framework` and `selection`. 84 | /// 85 | /// # Arguments 86 | /// 87 | /// * `framework` - One of the available frameworks. 88 | /// * `selection` - A selection of hardware provided by the specified `framework`. 89 | /// 90 | /// # Return value 91 | /// 92 | /// The return value is a backend if the process goes well; otherwise, it returns 93 | /// a simple error. 94 | pub fn with<F>(framework: F, selection: Vec<Hardware>) -> Result<Self> 95 | where F: FrameworkCtor, 96 | F::Context: ContextCtor<P, F = F>, { 97 | 98 | info!("[PARENCHYMA] Constructing a backend using the {} framework", framework.name()); 99 | let context = box F::Context::new(&framework, &selection)? as Box<Context<Package = P>>; 100 | let framework = box framework as Box<Framework>; 101 | Ok(Self { framework, context, selection }) 102 | } 103 | } 104 | 105 | impl
<P> Backend<P>
where P: ExtensionPackage { 106 | /// Returns the active framework's active context's active device. 107 | pub fn active_device(&self) -> &dyn ComputeDevice { 108 | self.context.active_codev() 109 | } 110 | 111 | /// Simply returns the selected hardware. 112 | pub fn selection(&self) -> &[Hardware] { 113 | &self.selection 114 | } 115 | 116 | /// Select the first device that meets the specified requirements. 117 | /// 118 | /// # Example 119 | /// 120 | /// ```rust 121 | /// use parenchyma::{Backend, HardwareKind, Native}; 122 | /// 123 | /// let mut native: Backend = Backend::new::().unwrap(); 124 | /// assert!(native.select(|hardware| hardware.kind == HardwareKind::CPU).is_ok()); 125 | /// ``` 126 | pub fn select(&mut self, pred: &Fn(&Hardware) -> bool) -> Result { 127 | 128 | let nth = { 129 | self.selection().iter().enumerate() 130 | .filter(|&(_, h)| pred(h)).map(|(i, _)| i).nth(0) 131 | }; 132 | 133 | match nth { 134 | Some(n) => self.context.activate(n), 135 | _ => { 136 | let message = "There are no devices matching the specified criteria."; 137 | Err(Error::new(ErrorKind::Other, message)) 138 | } 139 | } 140 | } 141 | 142 | /// Synchronizes backend. 143 | pub fn synchronize(&self) -> Result { 144 | Ok(()) 145 | } 146 | } 147 | 148 | impl
<P> Deref for Backend<P>
where P: ExtensionPackage { 149 | type Target = P::Extension; 150 | 151 | fn deref<'a>(&'a self) -> &'a Self::Target { 152 | self.context.extension() 153 | } 154 | } 155 | 156 | impl fmt::Debug for Backend { 157 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 158 | write!(f, "A backend provided by the {} framework", self.framework.name()) 159 | } 160 | } -------------------------------------------------------------------------------- /src/changelog.rs: -------------------------------------------------------------------------------- 1 | //! Project changelog (YEAR-MONTH-DAY) 2 | 3 | /// Release 0.0.4 (2017-11-08) 4 | /// 5 | /// * Partially sketched out a transfer matrix addressing issue#23 6 | /// * Simplified the complicated extension/build system resolving issue#25 7 | /// * The new extension/build system allows for framework specific backends. 8 | /// * Worked on a OpenCL solution to issue#16 9 | /// * Removed ndarray as it's not needed, which closes issue#20 10 | /// * Mapped memory process doesn't work well with ndarray + reshaping a tensor means reshaping 11 | /// the native array 12 | /// * Lazy synchronization via auto-sync has been fully integrated 13 | /// * Implemented logic around pinned memory with unpinned memory fallback 14 | pub mod r0_0_4 {} 15 | 16 | /// Release 0.0.3 (2017-03-04) 17 | /// 18 | /// * Implemented an OpenCL API wrapper 19 | /// * Partially implemented a CUDA API wrapper 20 | /// * Partially implemented native support 21 | /// * Worked on a fallback mechanism (see issue#15) 22 | /// * No longer requires framework related feature flags (from the original Collenchyma project) 23 | /// * No longer requires backends parameterized by a framework 24 | /// * New memory access API 25 | /// * Implemented auto-sync 26 | /// * Use a tensor lib (ndarray) as the underlying native memory representation 27 | /// * Add `Bundle` logic 28 | /// * Removed `IBinary`/`HashMap` technique. Use structs instead 29 | pub mod r0_0_3 {} -------------------------------------------------------------------------------- /src/compute_device.rs: -------------------------------------------------------------------------------- 1 | //! Provides a representation for one or many ready to use compute devices. 2 | 3 | use std::any::{Any, TypeId}; 4 | 5 | use super::error::Result; 6 | use super::memory::Memory; 7 | use super::tensor::TensorShape; 8 | 9 | /// An device capable of processing data. 10 | /// 11 | /// A compute device can be a single device, or multiple devices treated as a single device. 12 | /// 13 | /// ## Load Balancing Multiple Devices 14 | /// 15 | /// todo.. 16 | pub trait ComputeDevice: Any + Allocate + Allocate { } 17 | 18 | /// Implemented by allocators. 19 | pub trait Allocate { 20 | /// Allocates memory on the device. 21 | fn allocate(&self, shape: &TensorShape) -> Result>>; 22 | } 23 | 24 | impl ComputeDevice { 25 | /// Returns `true` if the boxed type is the same as `T`. 26 | #[inline] 27 | pub fn is(&self) -> bool where T: ComputeDevice { 28 | // Get TypeId of the type this function is instantiated with 29 | let t = TypeId::of::(); 30 | // Get TypeId of the type in the trait object 31 | let boxed = self.get_type_id(); 32 | // Compare both TypeIds on equality 33 | t == boxed 34 | } 35 | 36 | /// Returns some reference to the boxed value if it is of type `T`, or 37 | /// `None` if it isn't. 
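///
/// A hedged usage sketch (illustrative only; `NativeDevice` is the host device type
/// exported from `frameworks::native`):
///
/// ```ignore
/// use parenchyma::frameworks::NativeDevice;
///
/// fn inspect(device: &ComputeDevice) {
///     if let Some(native) = device.downcast_ref::<NativeDevice>() {
///         // `native` is a concrete `&NativeDevice` from here on.
///         println!("running on the host: {:?}", native);
///     }
/// }
/// ```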
38 | #[inline] 39 | pub fn downcast_ref(&self) -> Option<&T> where T: ComputeDevice { 40 | if self.is::() { 41 | unsafe { 42 | Some(&*(self as *const ComputeDevice as *const T)) 43 | } 44 | } else { 45 | None 46 | } 47 | } 48 | } -------------------------------------------------------------------------------- /src/context.rs: -------------------------------------------------------------------------------- 1 | //! Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for 2 | //! objects such as memory, command-queues, programs/modules and kernels. 3 | //! 4 | //! You can create a context encapsulating a selection of hardware via a [`Backend`]. 5 | //! 6 | //! [`Backend`]: ./struct.Backend.html 7 | 8 | use super::compute_device::ComputeDevice; 9 | use super::error::Result; 10 | use super::extension_package::ExtensionPackage; 11 | use super::hardware::Hardware; 12 | 13 | /// A trait implemented by all contexts. 14 | pub trait Context: 'static { 15 | /// The extension package built for the framework's context. 16 | type Package: ExtensionPackage; 17 | /// Returns the active device. 18 | fn active_codev(&self) -> &ComputeDevice; 19 | /// Returns the package extension. 20 | fn extension(&self) -> &::Extension; 21 | /// Set the device at the specified `index` as the active device. 22 | /// 23 | /// Only one device can be the _active_ device - the device in which operations are executed - 24 | /// if used through the context. 25 | fn activate(&mut self, index: usize) -> Result; 26 | } 27 | 28 | /// The non-object-safe part of the `Context`. 29 | /// 30 | /// todo: generic associated types may help here.. 31 | pub trait ContextCtor 32 | where Self: Context + Sized, 33 | Package: ExtensionPackage { 34 | /// The framework representation for the context. 35 | type F; 36 | /// Constructs a new context from the `framework` and the `selection` of hardware. 37 | fn new(framework: &Self::F, selection: &[Hardware]) -> Result; 38 | } -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | //! Types for working with errors. 2 | 3 | use std::{error, fmt}; 4 | use std::ops::Deref; 5 | 6 | /// A specialized `Result` typedef. 7 | pub type Result = ::std::result::Result; 8 | 9 | /// The error structure used by the Parenchyma crate. 10 | #[derive(Debug)] 11 | pub struct Error { 12 | kind: ErrorKind, 13 | /// A boxed sendable, syncable `Error`. 14 | inner: Option>, 15 | } 16 | 17 | /// A set of general categories. 18 | #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 19 | pub enum ErrorKind { 20 | /// A framework-specific error. 21 | /// 22 | /// Consider creating an framework-specific error by calling the `Error::from_framework` 23 | /// function, rather than constructing an `Error` using this variant. 24 | Framework(&'static str), 25 | /// An error returned when attempting to access uninitialized memory. 26 | UninitializedMemory, 27 | /// Unable to drop the provided device because a memory allocation was not found for it. 28 | AllocatedMemoryNotFoundForDevice, 29 | /// An error occurred while attempting to synchronize memory. 30 | MemorySynchronizationFailed, 31 | /// A memory synchronization route was requested, but no available synchronization route was found. 32 | NoAvailableSynchronizationRouteFound, 33 | /// An error occurred while attempting to allocate memory. 
34 | MemoryAllocationFailed, 35 | /// An error occurred while downcasting 36 | MemoryDowncasting, 37 | 38 | // MARK: - A set of tensor error categories 39 | 40 | /// Maximum number of backing memories has been reached (`BitMap` - type alias for `u64`). 41 | CapacityExceeded, 42 | /// The tensor shape is incompatible with the shape of some data. 43 | IncompatibleShape, 44 | /// Invalid reshaped tensor size. 45 | InvalidReshapedTensorSize, 46 | 47 | /// Any error not part of this list. 48 | Other, 49 | /// A marker variant that tells the compiler that users of this enum cannot match 50 | /// it exhaustively ([related RFC](https://github.com/rust-lang/rust/issues/32770)). 51 | #[doc(hidden)] 52 | _NonExhaustive, 53 | } 54 | 55 | impl ErrorKind { 56 | fn as_str(&self) -> &'static str { 57 | use self::ErrorKind::*; 58 | 59 | match *self { 60 | Framework(name) => name, 61 | CapacityExceeded => "the maximum number of backing memories has been reached", 62 | IncompatibleShape => "the tensor shape is incompatible with the shape of the data", 63 | InvalidReshapedTensorSize => "size of the provided shape is not equal to the size of the current shape", 64 | UninitializedMemory => "uninitialized memory", 65 | AllocatedMemoryNotFoundForDevice => "memory allocation was not found for the provided device", 66 | MemorySynchronizationFailed => "memory synchronization failed", 67 | NoAvailableSynchronizationRouteFound => "no available memory synchronization route", 68 | MemoryAllocationFailed => "memory allocation failed", 69 | MemoryDowncasting => "something went wrong while downcasting", 70 | Other => "other error", 71 | _ => unreachable!(), 72 | } 73 | } 74 | } 75 | 76 | impl Error { 77 | /// Creates a new error from a known kind of error as well as an arbitrary error error. 78 | pub fn new(kind: K, error: E) -> Error 79 | where K: Into, 80 | E: Into> { 81 | Self::_new(kind.into(), Some(error.into())) 82 | } 83 | 84 | /// Returns a reference to the inner error wrapped by this error (if any). 85 | pub fn get_ref(&self) -> Option<&(error::Error + Send + Sync + 'static)> { 86 | match self.inner { 87 | Some(ref error) => Some(error.deref()), 88 | _ => None 89 | } 90 | } 91 | /// Returns the corresponding `ErrorKind` for this error. 92 | pub fn kind(&self) -> ErrorKind { 93 | self.kind 94 | } 95 | } 96 | 97 | impl Error { 98 | // "De-generization" technique.. 
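// The generic `new` above funnels into this private, non-generic `_new` so the
// construction code is compiled once instead of being monomorphized for every
// `(K, E)` pair; the standard library's `std::io::Error` uses the same technique.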
99 | fn _new(kind: ErrorKind, error: Option>) -> Error { 100 | Error { kind, inner: error } 101 | } 102 | } 103 | 104 | impl fmt::Display for Error { 105 | fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result { 106 | write!(fmt, "{}", self.kind.as_str()) 107 | } 108 | } 109 | 110 | impl error::Error for Error { 111 | fn description(&self) -> &str { 112 | if let Some(ref error) = self.inner { 113 | error.description() 114 | } else { 115 | self.kind.as_str() 116 | } 117 | } 118 | 119 | fn cause(&self) -> Option<&error::Error> { 120 | match self.inner { 121 | Some(ref error) => error.cause(), 122 | _ => None, 123 | } 124 | } 125 | } 126 | 127 | impl From for Error { 128 | /// Creates a new error from a known kind of error 129 | fn from(kind: ErrorKind) -> Error { 130 | Error::_new(kind, None) 131 | } 132 | } 133 | 134 | #[cfg(test)] 135 | mod test { 136 | use super::{Error, ErrorKind}; 137 | use std::{error, fmt}; 138 | 139 | #[test] 140 | fn test_downcasting() { 141 | #[derive(Debug)] 142 | struct TestError; 143 | impl fmt::Display for TestError { fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result { 144 | Ok(()) 145 | }} 146 | impl error::Error for TestError { fn description(&self) -> &str { "abc" } } 147 | let err = Error::new(ErrorKind::Other, TestError); 148 | assert!(err.get_ref().unwrap().is::()); 149 | assert_eq!("abc", err.get_ref().unwrap().description()); 150 | } 151 | } -------------------------------------------------------------------------------- /src/extension_package.rs: -------------------------------------------------------------------------------- 1 | //! A package can be a binary, a source file, c code, a single kernel, etc., or a collective which 2 | //! share related functionalities. A package is provided by a specific library such as BLAS. Notice 3 | //! that packages are analogous to those of Rust (i.e., crates): 4 | //! 5 | //! compiled crate <-> package 6 | //! library (one or more modules) <-> bundle 7 | //! 8 | //! A package needs to be _built_, which is handled by the specific implementation of a binary 9 | //! representation, and returns initialized operations based on the library. Interacting directly 10 | //! with the package itself is possible, but it should be used to construct the backend-agnostic 11 | //! operations, which can then be executed and parallelized via a unified interface. 12 | //! 13 | //! ## Extensions 14 | //! 15 | //! A library can be a binary, a source file, c code, a single kernel, etc., or a collective. 16 | //! 17 | //! A backend is a Rust struct like any other, therefore you probably would like to implement 18 | //! certain methods for the Backend. As the whole purpose of a Backend is to provide an 19 | //! abstraction over various computation devices and computation languages, these implemented 20 | //! methods will than be able to execute on different devices and use the full power of 21 | //! the machine's underlying hardware. 22 | //! 23 | //! Extending the backend with operations is easy. In Parenchyma we call crates, which provide 24 | //! operations for the backend, _extensions_. Extensions are usually a group of related 25 | //! operations of a common field. Two examples for Parenchyma extensions 26 | //! are [BLAS][parenchyma-blas] and [NN][parenchyma-nn]. 27 | //! 28 | //! An extension provides generic traits and the explicit implementation of these traits for 29 | //! one or (even better) all available Parenchyma frameworks - common host CPU, OpenCL, CUDA. 30 | //! 31 | //! 
The structure of an extension is pretty simple with as little overhead as possible. Macros 32 | //! and build-scripts make implementations even easier. If you would like to use a specific 33 | //! extension for your backend, all you need to do is set it as a dependency in your Cargo 34 | //! file in addition to the Parenchyma crate. The extension then automatically extends the 35 | //! backend provided by Parenchyma. 36 | //! 37 | //! Extending the backend with your own extension is a straightforward process. For now, we 38 | //! recommend that you take a look at the general code structure 39 | //! of [Parenchyma-BLAS][parenchyma-blas] or its documentation. Let us know about your extension 40 | //! on the Gitter chat; we are happy to feature your Parenchyma extension on the README. 41 | 42 | use super::context::Context; 43 | use super::error::Result; 44 | 45 | /// Represents a package dependency. 46 | pub trait Dependency
<P>
: ExtensionPackage { 47 | /// Returns the dependency. 48 | fn dependency(&self) -> &P; 49 | } 50 | 51 | impl
<P> Dependency<P>
for P where P: ExtensionPackage { 52 | fn dependency(&self) -> &P { 53 | &self 54 | } 55 | } 56 | 57 | /// Provides the generic functionality for a backend-specific implementation of a library. 58 | pub trait ExtensionPackage: 'static { 59 | type Extension: ?Sized; 60 | 61 | /// The name of the package. 62 | /// 63 | /// This associated constant is primarily used for logging/debugging purposes. The naming 64 | /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn"). 65 | fn package_name(&self) -> &'static str; 66 | } 67 | 68 | /// Builds a package and provides the functionality for turning a library into backend-specific, 69 | /// executable operations, and tailored for the target framework. 70 | /// 71 | /// note: the `Context` trait is used here simply as a marker trait. 72 | pub trait ExtensionPackageCtor: Sized 73 | /*where Self: ExtensionPackage + Sized, 74 | TargetContext: Context*/ { 75 | /// Compiles the library into a package after initializing and configuring the library. 76 | /// 77 | /// This associated constant is primarily used for logging/debugging purposes. The naming 78 | /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn"). 79 | fn package(target: &mut TargetContext) -> Result; 80 | } 81 | 82 | impl ExtensionPackage for () { 83 | type Extension = ::std::any::Any; 84 | /// The default package. 85 | fn package_name(&self) -> &'static str { 86 | return "parenchyma/default"; 87 | } 88 | } 89 | 90 | impl ExtensionPackageCtor for () where T: Context { 91 | fn package(_target: &mut T) -> Result { 92 | return Ok(()); 93 | } 94 | } -------------------------------------------------------------------------------- /src/framework.rs: -------------------------------------------------------------------------------- 1 | //! Provides the generic functionality of a hardware supporting frameworks such 2 | //! as native CPU, Open CL, CUDA, etc.. 3 | //! 4 | //! The default framework is simply the host CPU for common computation. To make use 5 | //! of other devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to 6 | //! access the processing capabilities of the device(s). To start backend-agnostic and highly 7 | //! parallel computation, you start by initializing one of the `Framework` implementations, 8 | //! resulting in an initialized Framework, that contains, among other things, a list of all 9 | //! available hardwares through that framework. 10 | //! 11 | //! # Example 12 | //! 13 | //! ``` 14 | //! extern crate parenchyma; 15 | //! 16 | //! use parenchyma::frameworks::Native; 17 | //! use parenchyma::prelude::*; 18 | //! 19 | //! // A ready to go backend can be created by simply providing the framework type. 20 | //! let backend: Backend = Backend::new::().unwrap(); 21 | //! ``` 22 | 23 | use super::error::Result; 24 | use super::hardware::Hardware; 25 | 26 | /// A trait implemented for all frameworks. `Framework`s contain a list of all available 27 | /// devices as well as other objects specific to the implementor. 28 | pub trait Framework: 'static { 29 | /// Returns the name of the framework, which is mainly used for the purposes of debugging 30 | /// and reporting errors. 31 | fn name(&self) -> &'static str; 32 | /// Returns the cached and available hardware. 33 | /// 34 | /// note: this method will likely be replaced 35 | /// with a [field](https://github.com/rust-lang/rfcs/pull/1546). 
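///
/// A hedged usage sketch (names as defined in this crate; illustrative only):
///
/// ```ignore
/// let framework: Native = Native::new().unwrap();
/// // Count the compute units across every CPU the framework reported.
/// let cpu_units: usize = framework.hardware().iter()
///     .filter(|h| h.kind == HardwareKind::CPU)
///     .map(|h| h.compute_units)
///     .sum();
/// ```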
36 | fn hardware(&self) -> &[Hardware]; 37 | } 38 | 39 | /// The non-object-safe part of the framework trait. 40 | /// 41 | /// A separate trait is used because it violates object-safety rules, i.e., `Framework` is the 42 | /// object-safe version of `FrameworkCtor` (or `FrameworkCtor` is the non-object-safe 43 | /// version of `Framework`). `FrameworkCtor` is simply a constructor (hence the name `*Ctor`). In 44 | /// other words, this trait is split into object-safe and non-object-safe parts. 45 | /// 46 | /// todo: generic associated types may help here.. 47 | pub trait FrameworkCtor: Framework + Sized { 48 | /// The context representation for the framework. 49 | type Context; 50 | /// Initializes a `Framework`. 51 | fn new() -> Result; 52 | } -------------------------------------------------------------------------------- /src/frameworks/mod.rs: -------------------------------------------------------------------------------- 1 | //! Exposes the specific framework implementations. 2 | 3 | pub use self::native::{HOST, Native, NativeContext, NativeDevice, NativeMemory}; 4 | pub use self::open_cl::{OpenCL, OpenCLBuf, OpenCLContext, OpenCLDevice, OpenCLMemory}; 5 | 6 | mod native; 7 | mod open_cl; -------------------------------------------------------------------------------- /src/frameworks/native/context.rs: -------------------------------------------------------------------------------- 1 | use std::marker::PhantomData; 2 | 3 | use std::marker::Unsize; 4 | use super::Native; 5 | use super::super::super::compute_device::ComputeDevice; 6 | use super::super::super::context::{Context, ContextCtor}; 7 | use super::super::super::error::Result; 8 | use super::super::super::extension_package::ExtensionPackage; 9 | use super::super::super::hardware::Hardware; 10 | 11 | /// Defines a Native context. 12 | pub struct NativeContext
<P>(PhantomData<P>
); 13 | 14 | impl<Package> Context for NativeContext<Package> 15 | where Package: ExtensionPackage, 16 | NativeContext<Package>: Unsize<Package::Extension> { 17 | 18 | type Package = Package; 19 | 20 | fn active_codev(&self) -> &ComputeDevice { 21 | &super::HOST 22 | } 23 | 24 | fn activate(&mut self, _: usize) -> Result { 25 | Ok(()) 26 | } 27 | 28 | fn extension(&self) -> &<Self::Package as ExtensionPackage>::Extension { 29 | self 30 | } 31 | } 32 | 33 | impl
<P> ContextCtor<P> for NativeContext<P>
34 | where P: 'static + ExtensionPackage, 35 | NativeContext
<P>: Unsize<P::Extension> { 36 | 37 | type F = Native
<P>
; 38 | 39 | fn new(_: &Self::F, _: &[Hardware]) -> Result<Self> { 40 | Ok(NativeContext(PhantomData)) 41 | } 42 | } -------------------------------------------------------------------------------- /src/frameworks/native/device.rs: -------------------------------------------------------------------------------- 1 | use ndarray::Array; 2 | 3 | use super::NativeMemory; 4 | use super::super::super::compute_device::{Allocate, ComputeDevice}; 5 | use super::super::super::error::Result; 6 | use super::super::super::memory::Memory; 7 | use super::super::super::tensor::TensorShape; 8 | 9 | /// The native device. 10 | #[derive(Debug)] 11 | pub struct NativeDevice; 12 | 13 | impl ComputeDevice for NativeDevice { } 14 | 15 | impl Allocate<f32> for NativeDevice { 16 | fn allocate(&self, shape: &TensorShape) -> Result<Box<Memory<f32>>> { 17 | let mut v = Vec::with_capacity(shape.capacity()); 18 | 19 | unsafe { 20 | v.set_len(shape.capacity()); 21 | } 22 | 23 | let array = Array::from_shape_vec(shape.dimensions(), v).unwrap(); 24 | let memory = NativeMemory(array); 25 | 26 | return Ok(Box::new(memory)); 27 | } 28 | } -------------------------------------------------------------------------------- /src/frameworks/native/framework.rs: -------------------------------------------------------------------------------- 1 | use super::NativeContext; 2 | use super::super::super::error::Result; 3 | use super::super::super::framework::{Framework, FrameworkCtor}; 4 | use super::super::super::hardware::{Hardware, HardwareKind}; 5 | 6 | use std::marker::PhantomData; 7 | 8 | /// The native framework 9 | #[derive(Debug)] 10 | pub struct Native
<P>
{ 11 | hardware: [Hardware; 1], 12 | package: PhantomData
<P>
, 13 | } 14 | 15 | impl
<P> Native<P>
{ 16 | const ID: &'static str = "native/host"; 17 | } 18 | 19 | impl
<P> Framework for Native<P>
where P: 'static { 20 | fn name(&self) -> &'static str { 21 | return Native::
<P>
::ID; 22 | } 23 | 24 | fn hardware(&self) -> &[Hardware] { 25 | &self.hardware 26 | } 27 | } 28 | 29 | impl
<P> FrameworkCtor for Native<P>
where P: 'static { 30 | type Context = NativeContext
<P>
; 31 | 32 | fn new() -> Result<Self> { 33 | Ok(Native { 34 | hardware: [Hardware { 35 | id: 0usize, 36 | framework: Native::
<P>
::ID, 37 | kind: HardwareKind::CPU, 38 | name: String::from("Host CPU"), 39 | compute_units: 1, 40 | }], 41 | package: PhantomData, 42 | }) 43 | } 44 | } -------------------------------------------------------------------------------- /src/frameworks/native/memory.rs: -------------------------------------------------------------------------------- 1 | use ndarray::{Array, IxDyn}; 2 | use std::ops::{Deref, DerefMut}; 3 | 4 | // use super::super::super::{Device, Memory, TransferDirection}; 5 | // use super::super::super::error::Result; 6 | 7 | use super::NativeDevice; 8 | use super::super::super::compute_device::ComputeDevice; 9 | use super::super::super::memory::Memory; 10 | 11 | /// A newtype (with an internal type of an n-dimensional array) representing a native memory buffer. 12 | /// 13 | /// note: named `Memory` for consistency across frameworks. 14 | pub struct NativeMemory(pub(in crate) Array); 15 | 16 | impl Memory for NativeMemory { 17 | fn synchronized(&self, compute_device: &ComputeDevice) -> bool { 18 | compute_device.is::() 19 | } 20 | } 21 | 22 | impl Deref for NativeMemory { 23 | type Target = Array; 24 | fn deref(&self) -> &Self::Target { 25 | &self.0 26 | } 27 | } 28 | 29 | impl DerefMut for NativeMemory { 30 | fn deref_mut(&mut self) -> &mut Self::Target { 31 | &mut self.0 32 | } 33 | } -------------------------------------------------------------------------------- /src/frameworks/native/mod.rs: -------------------------------------------------------------------------------- 1 | pub use self::context::NativeContext; 2 | pub use self::device::NativeDevice; 3 | pub use self::framework::Native; 4 | pub use self::memory::NativeMemory; 5 | 6 | mod context; 7 | mod device; 8 | mod framework; 9 | mod memory; 10 | 11 | pub const HOST: NativeDevice = NativeDevice; -------------------------------------------------------------------------------- /src/frameworks/open_cl/context.rs: -------------------------------------------------------------------------------- 1 | use ocl; 2 | use std::ffi::CString; 3 | use std::marker::Unsize; 4 | use super::{OpenCL, OpenCLDevice}; 5 | use super::super::super::compute_device::ComputeDevice; 6 | use super::super::super::context::{Context, ContextCtor}; 7 | use super::super::super::error::{Error, ErrorKind, Result}; 8 | use super::super::super::extension_package::{ExtensionPackage, ExtensionPackageCtor}; 9 | use super::super::super::hardware::Hardware; 10 | 11 | /// Defines a Open CL context. 12 | /// 13 | /// A context is responsible for managing OpenCL objects and resources (command-queues, program 14 | /// objects, kernel objects, executing kernels, etc.). The usual configuration is a single context 15 | /// encapsulating multiple devices. The resources, such as [buffers][buffer] and [events][event], 16 | /// can be shared across multiple devices in a single context. Other possible setups include: 17 | /// 18 | /// * a single context for multiple devices 19 | /// * a single context for a single device 20 | /// * a context for each device 21 | /// 22 | /// note: multi-platform contexts are not supported in OpenCL. 23 | /// 24 | /// ## Programs 25 | /// 26 | /// An OpenCL context can have multiple programs associated with it. Programs can be compiled 27 | /// individually to avoid possible name clashes due to using packages from multiple package 28 | /// authors. 29 | /// 30 | /// [buffer]: ./frameworks/opencl/struct.Memory.html 31 | /// [event]: ./frameworks/opencl/struct.Event.html 32 | pub struct OpenCLContext
<P> { 33 | /// The context. 34 | context: ocl::Context, 35 | /// The index of the _active_ device. 36 | active: usize, 37 | /// A list of devices associated with the context. 38 | selected_devices: Vec<OpenCLDevice>, 39 | /// The `Device`s' corresponding `Hardware`. 40 | selected_hardware: Vec<Hardware>, 41 | // todo document this: 42 | // package is stored here because 43 | // a) the program depends on the selected devices 44 | // b) the lazy static would new the context 45 | // 1) mutating would be possible but wouldn't be worth the cost and trouble 46 | extension_package: P, 47 | } 48 | 49 | impl
<P>
OpenCLContext
<P>
{ 50 | pub fn device(&self) -> &OpenCLDevice { 51 | &self.selected_devices[self.active] 52 | } 53 | 54 | pub fn extension_package(&self) -> &P { 55 | &self.extension_package 56 | } 57 | 58 | /// Builds and returns a program. 59 | pub fn program(&self, src_strings: Vec<CString>) -> Result<ocl::Program> { 60 | let cmplr_opts = CString::new("").unwrap(); 61 | let device_ids: Vec<_> = self.selected_devices.iter().map(|d| d.device.clone()).collect(); 62 | 63 | Ok(ocl::Program::new( 64 | self.context.core(), 65 | src_strings, 66 | Some(&device_ids), 67 | cmplr_opts 68 | )?) 69 | } 70 | } 71 | 72 | impl<Package> Context for OpenCLContext<Package> 73 | where Package: ExtensionPackage, 74 | OpenCLContext<Package>: Unsize<Package::Extension> { 75 | 76 | type Package = Package; 77 | 78 | fn active_codev(&self) -> &ComputeDevice { 79 | &self.selected_devices[self.active] 80 | } 81 | 82 | fn extension(&self) -> &<Package as ExtensionPackage>::Extension { 83 | self 84 | } 85 | 86 | fn activate(&mut self, index: usize) -> Result { 87 | if index >= self.selected_devices.len() { 88 | return Err(Error::new(ErrorKind::Other, "device index out of range")); 89 | } 90 | 91 | self.active = index; 92 | 93 | Ok(()) 94 | } 95 | } 96 | 97 | impl
<P>
ContextCtor
<P>
for OpenCLContext
<P>
98 | where P: 'static + ExtensionPackage + ExtensionPackageCtor<OpenCLContext<()>>, 99 | OpenCLContext<P>
: Unsize<P::Extension> { 100 | 101 | type F = OpenCL<P>
; 102 | 103 | fn new(framework: &Self::F, selection: &[Hardware]) -> Result { 104 | 105 | let props = ocl::builders::ContextProperties::new().platform(framework.implementation); 106 | let s = ocl::builders::DeviceSpecifier::Indices(selection.iter().map(|h| h.id).collect()); 107 | let ctx = ocl::Context::new(Some(props), Some(s), None, None)?; 108 | 109 | let mut devices = vec![]; 110 | 111 | for hardware in selection.iter() { 112 | let d = ocl::Device::by_idx_wrap(framework.implementation, hardware.id); 113 | let queue = ocl::Queue::new(&ctx, d, Some(ocl::flags::QUEUE_PROFILING_ENABLE))?; 114 | 115 | devices.push(OpenCLDevice { 116 | device: d, 117 | context: ctx.clone(), 118 | queue, 119 | }); 120 | } 121 | 122 | let mut unpackaged = OpenCLContext { 123 | context: ctx, 124 | active: 0, 125 | selected_devices: devices, 126 | selected_hardware: selection.to_vec(), 127 | extension_package: (), 128 | }; 129 | 130 | let package = P::package(&mut unpackaged)?; 131 | 132 | Ok(OpenCLContext { 133 | context: unpackaged.context, 134 | active: unpackaged.active, 135 | selected_devices: unpackaged.selected_devices, 136 | selected_hardware: unpackaged.selected_hardware, 137 | extension_package: package, 138 | }) 139 | } 140 | } -------------------------------------------------------------------------------- /src/frameworks/open_cl/device.rs: -------------------------------------------------------------------------------- 1 | use ocl; 2 | 3 | use super::{OpenCLBuf, OpenCLMemory}; 4 | use super::super::super::compute_device::{Allocate, ComputeDevice}; 5 | use super::super::super::error::Result; 6 | use super::super::super::memory::Memory; 7 | use super::super::super::tensor::{TensorShape, TensorType}; 8 | 9 | /// Represents an Open CL device. 10 | #[derive(Clone, Debug)] 11 | pub struct OpenCLDevice { 12 | pub(in frameworks::open_cl) device: ocl::Device, 13 | pub(in frameworks::open_cl) context: ocl::Context, 14 | /// A command queue 15 | /// 16 | /// A command queue is the mechanism for interaction with the device. The queue is used for 17 | /// operations such as kernel launches and memory copies. At least one command queue per device 18 | /// is required. Queues are used by the host application to submit work to devices and 19 | /// associated with devices within a context. 20 | /// 21 | /// __commands__: 22 | /// 23 | /// - memory copy or mapping 24 | /// - device code execution 25 | /// - synchronization point 26 | /// 27 | /// __modes__: 28 | /// 29 | /// - in-order 30 | /// - out-of-order 31 | /// 32 | /// ## TODO 33 | /// 34 | /// * Use events to synchronize 35 | pub(in frameworks::open_cl) queue: ocl::Queue, 36 | } 37 | 38 | impl OpenCLDevice { 39 | pub fn queue(&self) -> &ocl::Queue { 40 | &self.queue 41 | } 42 | } 43 | 44 | impl ComputeDevice for OpenCLDevice { } 45 | 46 | impl Allocate for OpenCLDevice where T: TensorType + 'static { 47 | fn allocate(&self, shape: &TensorShape) -> Result>> { 48 | let ctx = &self.context; 49 | let flags_opt = Some(ocl::flags::MEM_READ_WRITE); 50 | let dims = ocl::SpatialDims::One(shape.capacity); 51 | let host_data = None; 52 | let buf: OpenCLBuf = OpenCLBuf { 53 | buf: ocl::Buffer::new(ctx, flags_opt, dims, host_data)? 
54 | }; 55 | let device = self.clone(); 56 | let memory = Box::new(OpenCLMemory { 57 | buf, 58 | device, 59 | }); 60 | 61 | return Ok(memory); 62 | } 63 | } -------------------------------------------------------------------------------- /src/frameworks/open_cl/error.rs: -------------------------------------------------------------------------------- 1 | use ocl::Error as OpenCLError; 2 | use error::{Error, ErrorKind}; 3 | 4 | impl From for Error { 5 | /// Creates a new error from a known kind of error 6 | fn from(e: OpenCLError) -> Error { 7 | Error::new(ErrorKind::Framework(super::OpenCL::<()>::ID), ::std::error::Error::description(&e)) 8 | } 9 | } -------------------------------------------------------------------------------- /src/frameworks/open_cl/framework.rs: -------------------------------------------------------------------------------- 1 | use ocl; 2 | use ocl::Platform as Implementation; 3 | use ocl::enums::{DeviceInfo, DeviceInfoResult}; 4 | use ocl::flags::{DEVICE_TYPE_ACCELERATOR, DEVICE_TYPE_CPU, DEVICE_TYPE_GPU}; 5 | use std::marker::PhantomData; 6 | 7 | use super::OpenCLContext; 8 | use super::super::super::error::Result; 9 | use super::super::super::framework::{Framework, FrameworkCtor}; 10 | use super::super::super::hardware::{Hardware, HardwareKind}; 11 | 12 | /// Provides the Open CL framework. 13 | /// 14 | /// # Flow 15 | /// 16 | /// Since multiple platforms can exist, the first available platform is selected during 17 | /// the initialization. A list of available devices are then provided for your choosing. Then, 18 | /// the provided selection of devices are used to create a context, with a command queue for each 19 | /// device. At this stage, a program(s) is compiled. A (host) program is essentially a collection 20 | /// of kernels. A kernel is the smallest unit of execution. 21 | /// 22 | /// In OpenCL, the host code can read in a kernel binary (i.e., compiled off-line) or a kernel 23 | /// source file (i.e., compile on-line). More information on on-line/off-line compilation can be 24 | /// found [here][1]. Kernels are expensive to start, so they're typically used to do a large amount 25 | /// of work. Memory allocated on an OpenCL device can be used when executing kernels, and then 26 | /// transfered back. 27 | /// 28 | /// Work-groups, a collection of work-items, are assigned to execute on compute-units. A work-item 29 | /// is an instance of a kernel as runtime. That kernel instance is at a point in an index, which 30 | /// can be thought of as a grid and the work-groups which contain the work-items can be thought of 31 | /// as sub-grids within the grid. The work-groups can be defined explicitly or implicitly by 32 | /// simply specifying the number of work-items, both dealing with data parallelism. In terms of task 33 | /// parallelism, kernels are executed independent of an index space. 34 | /// It should also be noted that there are [built-in scalar data types][2] along with 35 | /// [built-in functions][3]. 36 | /// 37 | /// [1]: https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/online-offline-compilation/ 38 | /// [2]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html 39 | /// [3]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/mathFunctions.html 40 | #[derive(Debug)] 41 | pub struct OpenCL
<P>
{ 42 | /// A list of available devices for the first platform found. 43 | available_hardware: Vec, 44 | /// The specific Open CL implementation (e.g., AMD APP, NVIDIA or Intel Open CL) 45 | /// 46 | /// Platforms are defined by the implementation. Platforms enables the host to interact with 47 | /// OpenCL-capable devices. 48 | pub(in frameworks::open_cl) implementation: Implementation, 49 | package: PhantomData
<P>
, 50 | } 51 | 52 | impl
<P>
OpenCL
<P>
{ 53 | pub(in frameworks::open_cl) const ID: &'static str = "Open CL"; 54 | } 55 | 56 | impl
<P>
Framework for OpenCL
<P>
where P: 'static { 57 | fn name(&self) -> &'static str { 58 | return OpenCL::
<P>
::ID; 59 | } 60 | 61 | fn hardware(&self) -> &[Hardware] { 62 | &self.available_hardware 63 | } 64 | } 65 | 66 | impl
<P>
FrameworkCtor for OpenCL
<P>
where P: 'static { 67 | type Context = OpenCLContext
<P>
; 68 | 69 | fn new() -> Result { 70 | let ignore_env_var = false; 71 | let implementation = Implementation::first(ignore_env_var)?; 72 | let devices = ocl::Device::list_all(implementation)?; 73 | 74 | let available_hardware = { 75 | 76 | devices.iter().enumerate() 77 | 78 | .filter(|&(_, d)| { 79 | use ocl::enums::DeviceInfo::{MaxComputeUnits, Type}; 80 | use ocl::enums::DeviceInfoResult::Error; 81 | 82 | let _1 = d.is_available().unwrap_or(false); 83 | // let _2 = match d.info(Type) { Error(_) => false, _ => true }; 84 | // let _3 = match d.info(MaxComputeUnits) { Error(_) => false, _ => true }; 85 | // TODO 86 | 87 | _1 88 | }) 89 | 90 | .map(|(i, d)| { 91 | 92 | let kind = { 93 | match d.info(DeviceInfo::Type) { 94 | DeviceInfoResult::Type(t) => match t { 95 | DEVICE_TYPE_ACCELERATOR => HardwareKind::Accelerator, 96 | DEVICE_TYPE_CPU => HardwareKind::CPU, 97 | DEVICE_TYPE_GPU => HardwareKind::GPU, 98 | _ => HardwareKind::Unknown, 99 | }, 100 | _ => unreachable!(), 101 | } 102 | }; 103 | 104 | let compute_units = { 105 | match d.info(DeviceInfo::MaxComputeUnits) { 106 | DeviceInfoResult::MaxComputeUnits(n) => n as usize, 107 | _ => unreachable!(), 108 | } 109 | }; 110 | 111 | Hardware { 112 | id: i, 113 | framework: OpenCL::
<P>
::ID, 114 | kind, 115 | name: d.name(), 116 | compute_units, 117 | } 118 | }) 119 | 120 | .collect::>() 121 | }; 122 | 123 | Ok(OpenCL { available_hardware, implementation, package: PhantomData }) 124 | } 125 | } -------------------------------------------------------------------------------- /src/frameworks/open_cl/memory.rs: -------------------------------------------------------------------------------- 1 | use ocl; 2 | use super::OpenCLDevice; 3 | use super::super::NativeMemory; 4 | use super::super::super::compute_device::ComputeDevice; 5 | use super::super::super::error::{ErrorKind, Result}; 6 | use super::super::super::memory::{Memory, TransferDirection}; 7 | use super::super::super::tensor::TensorType; 8 | 9 | /// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named 10 | /// as such for consistency's sake. 11 | /// 12 | /// Memory objects can be copied to host memory, from host memory, or to other memory objects. 13 | /// Copying from the host to a device is considered _writing_. Copying from a device to the host is 14 | /// considered _reading_. 15 | /// 16 | /// Unlike CUDA, OpenCL [buffers][1] are only context specific, not device specific. Also note: 17 | /// currently, lazy allocation is used on the NVIDIA driver. That is, the buffer object, in a sense, 18 | /// is located _nowhere_ when allocated. It only exists when needed. 19 | /// 20 | /// [1]: https://goo.gl/S9B3TL 21 | #[derive(Clone, Debug)] 22 | pub struct OpenCLBuf where T: TensorType { 23 | pub(in super) buf: ocl::Buffer, 24 | } 25 | 26 | /// Memory representation for Open CL 27 | pub struct OpenCLMemory where T: TensorType { 28 | pub(in super) buf: OpenCLBuf, 29 | pub(in super) device: OpenCLDevice, 30 | } 31 | 32 | impl Memory for OpenCLMemory where T: TensorType + 'static { 33 | fn synchronized(&self, device: &ComputeDevice) -> bool { 34 | if let Some(op) = device.downcast_ref::() { 35 | (self.device.device == op.device) && (self.device.context.core() == op.context.core()) 36 | } else { 37 | false 38 | } 39 | } 40 | 41 | fn transfer(&mut self, dir: TransferDirection, m: &mut Memory) -> Result { 42 | match dir { 43 | TransferDirection::TransferIn => { 44 | if let Some(na) = m.downcast_ref::>() { 45 | let buffer_write_cmd = unsafe { 46 | self.buf.buf.write( 47 | na.0.as_slice_memory_order() 48 | .expect("the array's data is not contiguous") // TODO 49 | ) 50 | .queue(&self.device.queue) 51 | .block(true) // TODO 52 | .len(na.0.len()) 53 | }; 54 | 55 | Ok(buffer_write_cmd.enq()?) 56 | } else { 57 | Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()) 58 | } 59 | }, 60 | 61 | TransferDirection::TransferOut => { 62 | if let Some(na) = m.downcast_mut::>() { 63 | let length = na.0.len(); 64 | 65 | let buffer_read_cmd = unsafe { 66 | self.buf.buf.read( 67 | na.0.as_slice_memory_order_mut() 68 | .expect("the array's data is not contiguous") // TODO 69 | ) 70 | .queue(&self.device.queue) 71 | .block(true) // TODO 72 | .len(length) 73 | }; 74 | 75 | Ok(buffer_read_cmd.enq()?) 
76 | } else { 77 | Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()) 78 | } 79 | } 80 | } 81 | } 82 | } 83 | 84 | impl ::ocl::core::AsMem for OpenCLMemory { 85 | fn as_mem(&self) -> &::ocl::core::Mem { 86 | self.buf.buf.as_mem() 87 | } 88 | } 89 | 90 | unsafe impl ::ocl::core::MemCmdAll for OpenCLMemory { } 91 | unsafe impl<'a, T: TensorType> ::ocl::core::MemCmdAll for &'a OpenCLMemory { } 92 | unsafe impl<'a, T: TensorType> ::ocl::core::MemCmdAll for &'a mut OpenCLMemory { } -------------------------------------------------------------------------------- /src/frameworks/open_cl/mod.rs: -------------------------------------------------------------------------------- 1 | pub use self::context::OpenCLContext; 2 | pub use self::device::OpenCLDevice; 3 | pub use self::framework::OpenCL; 4 | pub use self::memory::{OpenCLBuf, OpenCLMemory}; 5 | 6 | mod context; 7 | mod device; 8 | mod error; 9 | mod framework; 10 | mod memory; -------------------------------------------------------------------------------- /src/hardware.rs: -------------------------------------------------------------------------------- 1 | //! Hardware can be GPUs, multi-core CPUs or DSPs, Cell/B.E. processor or whatever else 2 | //! is supported by the provided framework. The struct holds all important information about 3 | //! the hardware. To execute code on hardware, turn hardware into a [`ComputeDevice`]. 4 | //! 5 | //! [`Device`]: [device]: ./compute_device/struct.Device.html 6 | 7 | /// Representation for hardware across frameworks. 8 | #[derive(Clone, Debug)] 9 | pub struct Hardware { 10 | /// The unique ID of the hardware. 11 | pub id: usize, 12 | /// Framework marker 13 | pub framework: &'static str, 14 | /// The type of compute device, such as a CPU or a GPU. 15 | pub kind: HardwareKind, 16 | /// The name. 17 | pub name: String, 18 | /// The number of compute units. 19 | /// 20 | /// A compute unit is the fundamental unit of computation. A compute device usually has 21 | /// multiple compute units. 22 | pub compute_units: usize, 23 | } 24 | 25 | /// General classes for devices, used to identify the type of a device. 26 | #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] 27 | pub enum HardwareKind { 28 | /// Used for accelerators. Accelerators can communicate with host processor using a peripheral 29 | /// interconnect such as PCIe. 30 | Accelerator, 31 | /// Used for cells. 32 | Cell, 33 | /// Used for devices that are host processors. The host processor runs the implementations 34 | /// and is a single or multi-core CPU. 35 | CPU, 36 | /// Used for digital signal processors. 37 | DSP, 38 | /// Used for GPU devices. 39 | GPU, 40 | /// Used for anything else. 41 | Unknown, 42 | } -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Provides a simple, unified API for running highly parallel computations on different 2 | //! devices across different GPGPU frameworks, allowing you to swap your backend at runtime. 3 | //! 4 | //! Parenchyma began as a hard fork of [Collenchyma], a now-defunct project started at [Autumn]. 5 | //! 6 | //! ## Abstract 7 | //! 8 | //! Code is often executed on the CPU, but can be executed on other devices, such as GPUs 9 | //! and accelerators. These devices are accessible through GPGPU frameworks. Most interfaces are 10 | //! complicated, making the use of these devices a painful experience. Some of the pain points when 11 | //! 
writing such code for a particular device are: 12 | //! 13 | //! * portability: not only do frameworks have different interfaces, devices support different 14 | //! versions and machines might have different hardware - all of this leads to code that will be 15 | //! executable only on a very specific set of machines and platforms. 16 | //! * learning curve: executing code on a device through a framework is quite different to 17 | //! running code on the native CPU and comes with a lot of hurdles. OpenCL's 1.2 specification, for 18 | //! example, has close to 400 pages. 19 | //! * custom code: integrating support for devices into your project requires the need for writing 20 | //! a lot of low-level code, e.g., kernels, memory management, and general business logic. 21 | //! 22 | //! Writing code for non-CPU devices is often a good choice, as these devices can execute 23 | //! operations a lot faster than native CPUs. GPUs, for example, can execute operations roughly 24 | //! one to two orders of magnitudes faster, thanks to better support of parallelizing operations. 25 | //! 26 | //! Parenchyma eliminates the pain points of writing device code, so you can run your code like any 27 | //! other code without needing to learn about kernels, events, or memory synchronization. Parenchyma 28 | //! also allows you to deploy your code with ease to servers, desktops and mobile device, all while 29 | //! enabling your code to make full use of the underlying hardware. 30 | //! 31 | //! ## Architecture 32 | //! 33 | //! The single entry point of Parenchyma is a [Backend](./struct.Backend.html). A 34 | //! backend is agnostic over the device it runs operations on. In order to be agnostic over the 35 | //! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to 36 | //! be agnostic over the framework as well. The framework is important, as it provides the interface 37 | //! to execute operations on devices, among other things. Since different vendors of hardware use 38 | //! different frameworks, it becomes important that the backend is agnostic over the framework. 39 | //! This allows us to run computations on any machine without having to worry about hardware 40 | //! availability, which gives us the freedom to write code once and deploy it on different machines 41 | //! where it will execute on the most potent hardware by default. 42 | //! 43 | //! ### Frameworks 44 | //! 45 | //! The default framework is simply the host CPU for common computation. To make use of other 46 | //! devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the 47 | //! processing capabilities of the device(s). 48 | //! 49 | //! ### Extensions 50 | //! 51 | //! Operations are introduced by a Parenchyma extension. An extension extends your backend with 52 | //! ready-to-execute operations. All you need to do is add the Parenchyma extension crate(s) 53 | //! to your `Cargo.toml` file alongside the Parenchyma crate. Your backend will then be extended with 54 | //! operations provided by the extension(s). The interface is simply the language you're using to 55 | //! work with Parenchyma. For example, you'd simply call `backend.dot(..)` using Rust-lang and 56 | //! a BLAS extension. Whether or not the dot operation is executed on one GPU, multiple GPUS or on 57 | //! a CPU device depends solely on how you configured the backend. 58 | //! 59 | //! ### Packages 60 | //! 61 | //! 
The concept of Parenchyma extensions has one more component - the [Package](./trait.ExtensionPackage.html) 62 | //! trait. As opposed to executing code on the native CPU, other devices need to compile and build 63 | //! the extension manually at runtime which makes up a significant part of a framework. We need an 64 | //! instance that's able to be initialized at runtime for holding the sate and compiled 65 | //! operations - which is the package's main purpose. 66 | //! 67 | //! ### Memory 68 | //! 69 | //! The last piece of Parenchyma is the memory. An operation happens over data, but this data needs 70 | //! to be accessible to the device on which the operation is executed. That memory space needs to be 71 | //! allocated on the device and then, in a later step, synced from the host to the device or from 72 | //! the device back to the host. Thanks to the [Tensor](./struct.SharedTensor.html) type, we do not 73 | //! have to care about memory management between devices for the execution of operations. The tensor 74 | //! tracks and automatically manages data and its memory across devices, which is often the host and 75 | //! the device. Memory can also be passed around to different backends. Operations take tensors 76 | //! as arguments while handling the synchronization and allocation for you. 77 | //! 78 | //! ## Development 79 | //! 80 | //! At the moment, Parenchyma itself will provide Rust APIs for the important 81 | //! frameworks - OpenCL and CUDA. 82 | //! 83 | //! If a framework isn't specified, the backend will try to use the most potent framework given 84 | //! the underlying hardware - which would probably be in this order: CUDA -> OpenCL -> Native. The 85 | //! process might take longer, as every framework needs to be checked and devices need to be loaded 86 | //! in order to identify the best setup. The time it takes to go through that process is a 87 | //! reasonable compromise as it would allow you to deploy a Parenchyma-backed application to almost 88 | //! any machine - server, desktops, mobiles, etc. 89 | //! 90 | //! ## Notes 91 | //! 92 | //! The `array` macro is re-exported as a way to allow the end-user to work directly with the 93 | //! crate instead of manually having to add `ndarray` for this single macro. 94 | //! 95 | //! [Collenchyma]: https://github.com/autumnai/collenchyma 96 | //! [Autumn]: https://github.com/autumnai 97 | #![feature(box_syntax, crate_in_paths, get_type_id, non_modrs_mods, unsize, use_extern_macros)] 98 | 99 | #[macro_use] 100 | extern crate log; 101 | #[macro_use(array)] 102 | extern crate ndarray; 103 | extern crate num; 104 | extern crate ocl; 105 | 106 | pub use self::ndarray::array; 107 | 108 | pub mod backend; 109 | pub mod changelog; 110 | pub mod compute_device; 111 | pub mod context; 112 | pub mod error; 113 | pub mod extension_package; 114 | pub mod framework; 115 | pub mod frameworks; 116 | pub mod hardware; 117 | pub mod memory; 118 | pub mod tensor; 119 | 120 | pub mod prelude { 121 | pub use super::backend::Backend; 122 | pub use super::framework::{Framework, FrameworkCtor}; 123 | pub use super::tensor::{IntoTensor, SharedTensor, TensorShape}; 124 | } -------------------------------------------------------------------------------- /src/memory.rs: -------------------------------------------------------------------------------- 1 | //! Provides a unified representation of memory across different frameworks. 
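//!
//! As an illustration, a minimal sketch of how a `Memory` trait object can be
//! inspected by hand (the `inspect` function, the `mem` handle, and the `f32`
//! element type here are hypothetical; in practice a `SharedTensor` performs
//! these downcasts for you):
//!
//! ```ignore
//! use parenchyma::frameworks::NativeMemory;
//!
//! fn inspect(mem: &Memory<f32>) {
//!     // `downcast_ref` succeeds only if the trait object wraps native memory
//!     if let Some(native) = mem.downcast_ref::<NativeMemory<f32>>() {
//!         // `NativeMemory` derefs to an `ndarray::Array`, so the data can be
//!         // read as a contiguous slice when the layout permits
//!         let _slice = native.as_slice_memory_order();
//!     }
//! }
//! ```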
2 | 3 | use std::any::{Any, TypeId}; 4 | use super::compute_device::ComputeDevice; 5 | use super::error::{ErrorKind, Result}; 6 | 7 | // TODO 8 | // pub struct Stacked<'p, T> { data: T, marker: PhantomData<&'p mut &'a ()> } 9 | // pub struct Boxed { data: Box } 10 | 11 | /// The transfer direction 12 | pub enum TransferDirection { 13 | /// Transfer data 14 | TransferIn, 15 | /// Transfer data out (i.e., _dumps_ data) 16 | TransferOut, 17 | } 18 | 19 | /// The struct `Memory` has generic type parameters representing memory and its location as seen 20 | /// by the shared tensor. 21 | /// 22 | /// **notes**: 23 | /// 24 | /// * The words _buf_ and _memory_ are used here (until a better term comes to 25 | /// mind (candidates: _allocation_, _partition_, etc.)) for the sake of simplification. 26 | /// * Each framework handles memory allocation differently (e.g., OpenCL allocates memory _lazily_ 27 | /// and isn't associated with any device within the context, even after it's used). 28 | /// * Downcast methods are provided, but normally you will want to use a [`SharedTensor`] which 29 | /// handles synchronization of the latest memory copy to the required device. 30 | pub trait Memory: Any { 31 | /// Specifies synchronization behavior for keeping data consistent across frameworks and contexts. 32 | /// 33 | /// **note** 34 | /// 35 | /// _Synch_ shouldn't be confused with the marker type `Sync` found in the standard library. 36 | /// The less common abbreviation for _synchronize_ (the extra _h_) is used here to 37 | /// avoid confusion. 38 | /// 39 | /// The `transfer` method handles the asynchronous data transfer behavior across 40 | /// frameworks and contexts. 41 | /// 42 | // # TODO: Transfer Matrix/Routes 43 | // 44 | // Host-GPU: Host <-> GPU 45 | // GPU-GPU: GPU -> HOST -> GPU 46 | // 47 | // ```{.text} 48 | // opencl device (context `a`) -> opencl device (context `b`) = true 49 | // opencl device -> native/host = true 50 | // opencl <-> cuda = false 51 | // native/host -> native/host = true 52 | // native/host -> cuda/opencl = false 53 | // ``` 54 | fn transfer(&mut self, TransferDirection, &mut Memory) -> Result { 55 | Err(ErrorKind::NoAvailableSynchronizationRouteFound.into()) 56 | } 57 | /// Determines whether or not the memory is allocated or pinned on the `backend`'s active device. 58 | /// 59 | /// # Arguments 60 | /// 61 | /// * `compute_device` - The computation device. 62 | /// 63 | /// **note**: 64 | /// 65 | /// Certain frameworks have a concept of _shared memory_, where the location of the 66 | /// memory is omnipresent (in a very abstract sense) as long as the devices are within the same 67 | /// context. 68 | #[allow(unused_variables)] 69 | fn synchronized(&self, compute_device: &ComputeDevice) -> bool { 70 | return false; 71 | } 72 | } 73 | 74 | impl Memory { 75 | /// Returns `true` if the boxed type is the same as `T`. 76 | #[inline] 77 | pub fn is>(&self) -> bool { 78 | // Get TypeId of the type this function is instantiated with 79 | let t = TypeId::of::(); 80 | 81 | // Get TypeId of the type in the trait object 82 | let boxed = self.get_type_id(); 83 | 84 | // Compare both TypeIds on equality 85 | t == boxed 86 | } 87 | 88 | /// Returns some reference to the boxed value if it is of type `T`, or 89 | /// `None` if it isn't. 
90 | #[inline] 91 | pub fn downcast_ref>(&self) -> Option<&M> { 92 | if self.is::() { 93 | unsafe { 94 | Some(&*(self as *const Memory as *const M)) 95 | } 96 | } else { 97 | None 98 | } 99 | } 100 | 101 | /// Returns some mutable reference to the boxed value if it is of type `T`, or 102 | /// `None` if it isn't. 103 | #[inline] 104 | pub fn downcast_mut>(&mut self) -> Option<&mut M> { 105 | if self.is::() { 106 | unsafe { 107 | Some(&mut *(self as *mut Memory as *mut M)) 108 | } 109 | } else { 110 | None 111 | } 112 | } 113 | } -------------------------------------------------------------------------------- /src/tensor/into_tensor.rs: -------------------------------------------------------------------------------- 1 | use ndarray::{Array, Dimension}; 2 | use std::cell::RefCell; 3 | 4 | use super::{SharedTensor, TensorMap, TensorShape}; 5 | use super::super::memory::Memory; 6 | use super::super::frameworks::NativeMemory; 7 | 8 | pub trait IntoTensor { 9 | fn into_tensor(self) -> SharedTensor; 10 | } 11 | 12 | impl IntoTensor for Array where D: Dimension { 13 | fn into_tensor(self) -> SharedTensor { 14 | SharedTensor::::from(self) 15 | } 16 | } 17 | 18 | impl From> for SharedTensor where 19 | T: 'static, 20 | Dim: Dimension { 21 | 22 | fn from(array: Array) -> Self { 23 | if !array.is_standard_layout() { 24 | panic!("Array data must be laid out in contiguous “C order” in memory"); 25 | } 26 | 27 | let shape = TensorShape::from(array.shape()); 28 | let n = NativeMemory(array.into_dyn()); 29 | 30 | let memories = RefCell::new(vec![ 31 | Box::new(n) as Box> 32 | ]); 33 | 34 | let synch_map = TensorMap::with(1 << 0); 35 | SharedTensor { memories, shape, synch_map } 36 | } 37 | } -------------------------------------------------------------------------------- /src/tensor/tensor_map.rs: -------------------------------------------------------------------------------- 1 | use std::cell::Cell; 2 | 3 | /// A "newtype" with an internal type of `Cell`. `TensorMap` uses [bit manipulation][1] to manage 4 | /// memory versions. 5 | /// 6 | /// [1]: http://stackoverflow.com/a/141873/2561805 7 | #[allow(non_camel_case_types)] 8 | #[derive(Debug)] 9 | pub(in super) struct TensorMap(Cell); 10 | 11 | impl TensorMap { 12 | /// The maximum number of bits in the bit map can contain. 13 | pub const CAPACITY: usize = 64; 14 | 15 | /// Constructs a new `TensorMap`. 16 | pub(in super) fn new() -> TensorMap { 17 | TensorMap::with(0) 18 | } 19 | 20 | /// Constructs a new `TensorMap` with the supplied `n`. 
21 | pub(in super) fn with(n: u64) -> TensorMap { 22 | TensorMap(Cell::new(n)) 23 | } 24 | 25 | // fn get(&self) -> u64 { 26 | // self.0.get() 27 | // } 28 | 29 | pub(in super) fn set(&self, v: u64) { 30 | self.0.set(v) 31 | } 32 | 33 | pub(in super) fn empty(&self) -> bool { 34 | self.0.get() == 0 35 | } 36 | 37 | pub(in super) fn insert(&self, k: usize) { 38 | self.0.set(self.0.get() | (1 << k)) 39 | } 40 | 41 | pub(in super) fn contains(&self, k: usize) -> bool { 42 | k < Self::CAPACITY && (self.0.get() & (1 << k) != 0) 43 | } 44 | 45 | pub(in super) fn latest(&self) -> u32 { 46 | self.0.get().trailing_zeros() 47 | } 48 | } -------------------------------------------------------------------------------- /src/tensor/tensor_memories.rs: -------------------------------------------------------------------------------- 1 | use std::cell::RefCell; 2 | use super::super::memory::Memory; 3 | 4 | pub type TensorMemories = RefCell>>>; -------------------------------------------------------------------------------- /src/tensor/tensor_shape.rs: -------------------------------------------------------------------------------- 1 | use super::super::error::{Error, ErrorKind, Result}; 2 | 3 | /// Describes the shape of a tensor. 4 | /// 5 | /// **note**: `From` conversion implementations are provided for low-rank shapes. 6 | #[derive(Clone, Debug, Eq, PartialEq)] 7 | pub struct TensorShape { 8 | /// The number of components the associated tensor can store. 9 | /// 10 | /// # Example 11 | /// 12 | /// ```{.text} 13 | /// // The following tensor has 9 components 14 | /// 15 | /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 16 | /// ``` 17 | pub capacity: usize, 18 | /// A list of numbers with each representing the dimension at each index. 19 | /// 20 | /// # Example 21 | /// 22 | /// The following tensor has a shape of `[2, 1]`: 23 | /// 24 | /// ```{.text} 25 | /// [[a], [b]] 26 | /// ``` 27 | pub dimsizes: Vec, 28 | // /// The stride tells the tensor how to interpret its flattened representation. 29 | // stride: Vec, 30 | } 31 | 32 | impl TensorShape { 33 | /// Checks that the shape of the provided `data` is compatible. 34 | pub fn check(&self, data: &[T]) -> Result { 35 | if self.capacity != data.len() { 36 | let message = format!( 37 | "TODO: incompatible shape. Capacity = {}, Length = {}", 38 | self.capacity, 39 | data.len()); 40 | let kind = ErrorKind::IncompatibleShape; 41 | let e = Error::new(kind, message); 42 | 43 | return Err(e); 44 | } 45 | 46 | Ok(()) 47 | } 48 | 49 | /// Returns the `dimensions`. 50 | pub fn dimensions(&self) -> &[usize] { 51 | &self.dimsizes 52 | } 53 | 54 | /// Returns the number of elements the tensor can hold without reallocating. 55 | pub fn capacity(&self) -> usize { 56 | self.capacity 57 | } 58 | 59 | /// Returns the total number of indices required to identify each component uniquely (i.e, the 60 | /// tensor's rank, degree, or order). 
61 | /// 62 | /// # Example 63 | /// 64 | /// The following tensor has a rank of 2: 65 | /// 66 | /// ```{.text} 67 | /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 68 | /// ``` 69 | pub fn rank(&self) -> usize { 70 | self.dimsizes.len() 71 | } 72 | } 73 | 74 | impl From> for TensorShape { 75 | 76 | fn from(vector: Vec) -> TensorShape { 77 | 78 | TensorShape { 79 | capacity: vector.iter().fold(1, |acc, &dims| acc * dims), 80 | dimsizes: vector, 81 | } 82 | } 83 | } 84 | 85 | impl<'slice> From<&'slice [usize]> for TensorShape { 86 | 87 | fn from(slice: &[usize]) -> TensorShape { 88 | TensorShape { 89 | capacity: slice.iter().fold(1, |acc, &dims| acc * dims), 90 | dimsizes: slice.to_owned(), 91 | } 92 | } 93 | } 94 | 95 | impl From for TensorShape { 96 | 97 | fn from(dimensions: usize) -> TensorShape { 98 | TensorShape { 99 | capacity: dimensions, 100 | dimsizes: vec![dimensions], 101 | } 102 | } 103 | } 104 | 105 | macro_rules! shape { 106 | ($($length:expr),*) => ($(impl From<[usize; $length]> for TensorShape { 107 | fn from(array: [usize; $length]) -> TensorShape { 108 | 109 | TensorShape { 110 | capacity: array.iter().fold(1, |acc, &dims| acc * dims), 111 | dimsizes: array.to_vec(), 112 | } 113 | } 114 | })*) 115 | } 116 | 117 | shape!(0, 1, 2, 3, 4, 5, 6); -------------------------------------------------------------------------------- /src/tensor/tensor_type.rs: -------------------------------------------------------------------------------- 1 | use ocl::traits::OclPrm as PrimitiveType; 2 | 3 | /// A marker trait implemented by primitive types that usable within kernels. 4 | pub trait TensorType: PrimitiveType { 5 | // .. 6 | } 7 | 8 | impl TensorType for T { 9 | // .. 10 | } -------------------------------------------------------------------------------- /src/tensor/utility.rs: -------------------------------------------------------------------------------- 1 | use std::mem; 2 | 3 | pub(in super) unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T 4 | where T: ?Sized { 5 | 6 | mem::transmute::<&'a T, &'b T>(t) 7 | } 8 | 9 | pub(in super) unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T 10 | where T: ?Sized { 11 | 12 | mem::transmute::<&'a mut T, &'b mut T>(t) 13 | } -------------------------------------------------------------------------------- /tests/backend_specs.rs: -------------------------------------------------------------------------------- 1 | extern crate parenchyma; 2 | 3 | #[cfg(test)] 4 | mod backend_spec { 5 | mod native { 6 | use std::rc::Rc; 7 | use parenchyma::backend::Backend; 8 | use parenchyma::frameworks::Native; 9 | 10 | #[test] 11 | fn it_can_create_default_backend() { 12 | let backend: Result = Backend::new::(); 13 | assert!(backend.is_ok()); 14 | } 15 | 16 | #[test] 17 | fn it_can_use_ibackend_trait_object() { 18 | let backend: Rc = Rc::new(Backend::new::().unwrap()); 19 | use_ibackend(backend); 20 | } 21 | 22 | fn use_ibackend(backend: Rc) { 23 | let backend: Rc = backend.clone(); 24 | } 25 | } 26 | 27 | // #[cfg(feature = "cuda")] 28 | // mod cuda { 29 | // use co::*; 30 | // #[test] 31 | // fn it_can_create_default_backend() { 32 | // assert!(Backend::new::().is_ok()); 33 | // } 34 | // } 35 | 36 | // mod opencl { 37 | // //use parenchyma::{Backend, Framework, FrameworkCtor, OpenCL}; 38 | // use parenchyma::backend::Backend; 39 | // use parenchyma::frameworks::OpenCL; 40 | // use parenchyma::prelude::*; 41 | 42 | // #[test] 43 | // fn it_can_create_default_backend() { 44 | // let backend: Result = Backend::new::(); 45 | // 
assert!(backend.is_ok()); 46 | // } 47 | 48 | // #[test] 49 | // fn it_can_manually_create_backend() { 50 | // let framework = OpenCL::new().unwrap(); 51 | // let hardware = framework.hardware().to_vec(); 52 | // let backend: Backend = Backend::with(framework, hardware).unwrap(); 53 | // println!("{:?}", backend); 54 | // } 55 | // } 56 | } -------------------------------------------------------------------------------- /tests/framework_native_specs.rs: -------------------------------------------------------------------------------- 1 | extern crate parenchyma; 2 | 3 | #[cfg(test)] 4 | mod framework_native_spec { 5 | use parenchyma::frameworks::Native; 6 | use parenchyma::prelude::{Framework, FrameworkCtor}; 7 | 8 | #[test] 9 | fn it_works() { 10 | let framework: Native = Native::new().unwrap(); 11 | assert_eq!(framework.hardware().len(), 1); 12 | } 13 | } -------------------------------------------------------------------------------- /tests/shared_memory_specs.rs: -------------------------------------------------------------------------------- 1 | // extern crate parenchyma as pa; 2 | 3 | // #[cfg(test)] 4 | // mod shared_memory_spec { 5 | // use pa::{Backend, ErrorKind, Memory, Native, OpenCL, SharedTensor}; 6 | 7 | // pub fn write(memory: &mut Memory, data: &[f32]) { 8 | // let ndarray = unsafe { memory.as_mut_native_unchecked() }; 9 | // let buf = ndarray.as_slice_memory_order_mut().unwrap(); 10 | 11 | // for (index, datum) in data.iter().enumerate() { 12 | // buf[index] = *datum; 13 | // } 14 | // } 15 | 16 | // #[test] 17 | // fn it_creates_new_shared_memory_for_native() { 18 | // let ref host: Backend = Backend::new::().unwrap(); 19 | // let mut shared_data = SharedTensor::::new(10); 20 | // let tensor = shared_data.write(host).unwrap(); 21 | // assert_eq!(tensor.as_native().unwrap().len(), 10); 22 | // } 23 | 24 | // #[test] 25 | // //#[cfg(feature = "opencl")] 26 | // fn it_creates_new_shared_memory_for_opencl() { 27 | // let ref backend: Backend = Backend::new::().unwrap(); 28 | // let mut shared_data: SharedTensor = SharedTensor::new(10); 29 | // assert!(shared_data.write(backend).is_ok()); 30 | // } 31 | 32 | // #[test] 33 | // fn it_fails_on_initialized_memory_read() { 34 | // let ref host: Backend = Backend::new::().unwrap(); 35 | // let mut shared_data = SharedTensor::::new(10); 36 | // assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); 37 | // assert_eq!(shared_data.read_write(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); 38 | 39 | // // initialize memory 40 | // let _ = shared_data.write(host).unwrap(); 41 | // let _ = shared_data.dealloc(host).unwrap(); 42 | 43 | // assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory); 44 | // } 45 | 46 | // #[test] 47 | // //#[cfg(feature = "opencl")] 48 | // fn it_syncs_from_native_to_opencl_and_back() { 49 | // let ref host: Backend = Backend::new::().unwrap(); 50 | // let ref backend: Backend = Backend::new::().unwrap(); 51 | 52 | // let mut sh = SharedTensor::::new(3); 53 | // write(sh.write(host).unwrap(), &[1.0f32, 2.0, 123.456]); 54 | // let _ = sh.read(backend).unwrap(); 55 | 56 | // // It has not successfully synced to the device. 57 | // // Not the other way around. 58 | 59 | // //let _ = sh.dealloc(host).unwrap();// TODO ? 
60 | // let _ = sh.dealloc(backend).unwrap(); 61 | 62 | // assert_eq!( 63 | // sh.read(host).unwrap().as_native().unwrap().as_slice_memory_order().unwrap(), 64 | // [1.0, 2.0, 123.456] 65 | // ); 66 | // } 67 | 68 | // #[test] 69 | // fn it_reshapes_correctly() { 70 | // let mut shared_data = SharedTensor::::new(10); 71 | // assert!(shared_data.reshape([5, 2]).is_ok()); 72 | // } 73 | 74 | // #[test] 75 | // fn it_returns_err_for_invalid_size_reshape() { 76 | // let mut shared_data = SharedTensor::::new(10); 77 | // assert!(shared_data.reshape([10, 2]).is_err()); 78 | // } 79 | // } --------------------------------------------------------------------------------
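
The commented-out `shared_memory_specs.rs` tests above outline the intended host-to-device
synchronization flow. The following is a minimal sketch of that flow against the pre-alpha
API exercised by those tests (`Backend::new`, `SharedTensor::new`, `write`/`read`); the
names and signatures follow the tests and may shift as the refactoring settles:

```rust
extern crate parenchyma;

use parenchyma::backend::Backend;
use parenchyma::frameworks::{Native, OpenCL};
use parenchyma::prelude::*;

fn main() {
    // a native (host CPU) backend and an OpenCL backend
    let ref host: Backend = Backend::new::<Native>().unwrap();
    let ref device: Backend = Backend::new::<OpenCL>().unwrap();

    // a shared tensor with three components, initially unallocated
    let mut tensor: SharedTensor<f32> = SharedTensor::new(3);

    // `write` allocates memory on the host backend's active device;
    // `read` on the OpenCL backend then transfers the latest copy over,
    // with the tensor tracking which memory copies are up to date
    let _ = tensor.write(host).unwrap();
    let _ = tensor.read(device).unwrap();
}
```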