├── .gitignore
├── Cargo.toml
├── README.md
├── benches
    └── shared_tensor.rs
├── crates
    ├── parenchyma-blas
    │   ├── .gitignore
    │   ├── Cargo.toml
    │   ├── README.md
    │   ├── src
    │   │   ├── extension_package
    │   │   │   ├── axpby.rs
    │   │   │   ├── level1.rs
    │   │   │   ├── level2.rs
    │   │   │   ├── level3.rs
    │   │   │   ├── mod.rs
    │   │   │   └── transpose.rs
    │   │   ├── frameworks
    │   │   │   ├── mod.rs
    │   │   │   ├── native
    │   │   │   │   └── mod.rs
    │   │   │   └── open_cl
    │   │   │   │   ├── implementation
    │   │   │   │       ├── level1.rs
    │   │   │   │       └── mod.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── package.rs
    │   │   │   │   └── source
    │   │   │   │       ├── common.cl
    │   │   │   │       ├── level1
    │   │   │   │           ├── level1.cl
    │   │   │   │           ├── xasum.cl
    │   │   │   │           ├── xaxpy.cl
    │   │   │   │           ├── xcopy.cl
    │   │   │   │           ├── xdot.cl
    │   │   │   │           ├── xnrm2.cl
    │   │   │   │           ├── xscal.cl
    │   │   │   │           └── xswap.cl
    │   │   │   │       └── level3
    │   │   │   │           ├── level3.cl
    │   │   │   │           ├── xgemm_direct_part1.cl
    │   │   │   │           ├── xgemm_direct_part2.cl
    │   │   │   │           └── xgemm_direct_part3.cl
    │   │   └── lib.rs
    │   └── tests
    │   │   └── blas_specs.rs
    ├── parenchyma-deep
    │   ├── .gitignore
    │   ├── Cargo.toml
    │   ├── README.md
    │   ├── src
    │   │   ├── extension_package
    │   │   │   ├── backward.rs
    │   │   │   ├── configuration.rs
    │   │   │   ├── convolution.rs
    │   │   │   ├── forward.rs
    │   │   │   └── mod.rs
    │   │   ├── frameworks
    │   │   │   ├── mod.rs
    │   │   │   ├── native
    │   │   │   │   └── mod.rs
    │   │   │   └── open_cl
    │   │   │   │   ├── _build.rs
    │   │   │   │   ├── _mod.rs
    │   │   │   │   ├── mod.rs
    │   │   │   │   ├── package.rs
    │   │   │   │   └── source
    │   │   │   │       ├── activation.cl
    │   │   │   │       ├── activationBackward.cl
    │   │   │   │       ├── convolution.cl
    │   │   │   │       └── softmax.cl
    │   │   └── lib.rs
    │   └── tests
    │   │   └── deep_specs.rs
    ├── parenchyma-ml
    │   ├── .gitignore
    │   ├── Cargo.toml
    │   └── src
    │   │   ├── extension_package.rs
    │   │   ├── frameworks
    │   │       ├── mod.rs
    │   │       ├── native.rs
    │   │       └── open_cl.rs
    │   │   └── lib.rs
    └── parenchyma-tr
    │   ├── .gitignore
    │   ├── Cargo.toml
    │   ├── README.md
    │   └── src
    │       ├── lib.rs
    │       ├── transformer.rs
    │       └── transformers
    │           ├── audio.rs
    │           ├── image.rs
    │           ├── mod.rs
    │           └── word.rs
├── license
    ├── LICENSE-APACHE
    ├── LICENSE-MIT
    └── README.md
├── src
    ├── backend.rs
    ├── changelog.rs
    ├── compute_device.rs
    ├── context.rs
    ├── error.rs
    ├── extension_package.rs
    ├── framework.rs
    ├── frameworks
    │   ├── mod.rs
    │   ├── native
    │   │   ├── context.rs
    │   │   ├── device.rs
    │   │   ├── framework.rs
    │   │   ├── memory.rs
    │   │   └── mod.rs
    │   └── open_cl
    │   │   ├── context.rs
    │   │   ├── device.rs
    │   │   ├── error.rs
    │   │   ├── framework.rs
    │   │   ├── memory.rs
    │   │   └── mod.rs
    ├── hardware.rs
    ├── lib.rs
    ├── memory.rs
    └── tensor
    │   ├── into_tensor.rs
    │   ├── mod.rs
    │   ├── tensor_map.rs
    │   ├── tensor_memories.rs
    │   ├── tensor_shape.rs
    │   ├── tensor_type.rs
    │   └── utility.rs
└── tests
    ├── backend_specs.rs
    ├── framework_native_specs.rs
    └── shared_memory_specs.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "parenchyma"
 3 | version = "0.0.4"
 4 | authors = ["Jony <jonysy@users.noreply.github.com>"]
 5 | keywords = ["backend", "computation", "opencl", "cuda", "hpc"]
 6 | categories = ["science"]
 7 | description = "A high-performance computing (HPC) framework"
 8 | documentation = "https://docs.rs/parenchyma"
 9 | repository = "https://github.com/lychee-eng/parenchyma"
10 | license = "MIT/Apache-2.0"
11 | 
12 | [dependencies]
13 | # enum_primitive = "0.1.1"
14 | # futures = "0.1.11"
15 | # libloading = "0.3.2"
16 | log = "0.4"
17 | ndarray = "0.10.0"
18 | num = "0.2"
19 | ocl = "0.16.0"
20 | 
21 | [dev-dependencies]
22 | # compiletest_rs = "0.2.5"
23 | lazy_static = "1.0.0"


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # parenchyma
 2 | 
 3 | [![Join the chat](https://badges.gitter.im/lychee-eng/parenchyma.svg)](https://gitter.im/lychee-eng/parenchyma)
 4 | ![Project Status](https://img.shields.io/badge/status-pre--alpha-green.svg)
 5 | [![](http://meritbadge.herokuapp.com/parenchyma)](https://crates.io/crates/parenchyma)
 6 | [![License](https://img.shields.io/crates/l/parenchyma.svg)](#license)
 7 | [![parenchyma](https://docs.rs/parenchyma/badge.svg)](https://docs.rs/parenchyma)
 8 | 
 9 | Parenchyma started off as a hard fork of [Collenchyma][collenchyma-repo] (hence the name), an 
10 | extensible HPC framework developed by the [Autumn team] as well as an amazing group 
11 | of [contributors][collenchyma-contributors]. Aside from the name and overall design, the two 
12 | libraries are quite dissimilar to each other (e.g., auto-sync (thanks 
13 | to [@alexandermorozov](/../../issues/2)), async transfers, the fallback mechanism, etc.). Therefore, before migrating 
14 | over, one should go through the documentation carefully as to not make the mistake of misusing 
15 | the framework. Not doing so may result in unintended behavior for which Parenchyma 
16 | developers/contributors are not responsible.
17 | 
18 | Many of the original comments used for documentation purposes remain in the code base along with 
19 | a few necessary additions/modifications.
20 | 
21 | > Disclaimer: Parenchyma is currently undergoing extensive refactoring and improvement. Therefore, 
22 | > it is likely that many of the features available in the original Collenchyma project may not yet 
23 | > be available in the Parenchyma project. It is also likely that certain features may never be 
24 | > available in the Parenchyma project, as the different approaches that are currently being 
25 | > considered may prove to be better than the original approach.
26 | 
27 | ### Tensor creation
28 | 
29 | The easiest way to create a tensor is to use the `array` macro: 
30 | 
31 | ```rust
32 | #[macro_use(array)]
33 | extern crate parenchyma;
34 | 
35 | use parenchyma::prelude::*;
36 | 
37 | let t: SharedTensor<i32> = array![
38 |     [
39 |         [1,2,3],
40 |         [4,5,6]
41 |     ],
42 |     [
43 |         [11,22,33],
44 |         [44,55,66]
45 |     ],
46 |     [
47 |         [111,222,333],
48 |         [444,555,666]
49 |     ],
50 |     [
51 |         [1111,2222,3333],
52 |         [4444,5555,6666]
53 |     ]
54 | ].into();
55 | 
56 | println!("{:?}", t);
57 | 
58 | // shape=[4, 2, 3], strides=[6, 3, 1], layout=C (0x1), type=i32
59 | //
60 | // [[[1, 2, 3],
61 | //   [4, 5, 6]],
62 | //  [[11, 22, 33],
63 | //   [44, 55, 66]],
64 | //  [[111, 222, 333],
65 | //   [444, 555, 666]],
66 | //  [[1111, 2222, 3333],
67 | //   [4444, 5555, 6666]]]
68 | ```
69 | 
70 | ### Synchronizing Data
71 | 
72 | Synchronizing data across multiple compute devices and backends is straightforward.
73 | 
74 | ```rust
75 | #[macro_use(array)]
76 | extern crate parenchyma;
77 | 
78 | use parenchyma::prelude::*;
79 | 
80 | let ref cuda: Backend = Backend::new::<Cuda>()?;
81 | 
82 | let t = array![[1.5, 2.3, 3.7], [4.8, 5.2, 6.9]].into();
83 | 
84 | t.synchronize(cuda)?;
85 | ```
86 | 
87 | ## License
88 | 
89 | Dual licensed under
90 |   * Apache License, Version 2.0 ([LICENSE-APACHE] or http://www.apache.org/licenses/LICENSE-2.0)
91 |   * MIT license ([LICENSE-MIT] or http://opensource.org/licenses/MIT)
92 | 
93 | [Autumn team]: https://github.com/autumnai
94 | [collenchyma-repo]: https://github.com/autumnai/collenchyma
95 | [collenchyma-contributors]: https://github.com/autumnai/collenchyma/graphs/contributors
96 | [LICENSE-APACHE]: ../../../license/blob/master/LICENSE-APACHE
97 | [LICENSE-MIT]: ../../../license/blob/master/LICENSE-MIT


--------------------------------------------------------------------------------
/benches/shared_tensor.rs:
--------------------------------------------------------------------------------
  1 | #![feature(test)]
  2 | 
  3 | extern crate parenchyma;
  4 | extern crate test;
  5 | 
  6 | use parenchyma::{Backend, Native, OpenCL, SharedTensor};
  7 | use test::Bencher;
  8 | 
  9 | fn native_backend() -> Backend {
 10 |     Backend::new::<Native>().unwrap()
 11 | }
 12 | 
 13 | fn opencl_backend() -> Backend {
 14 |     Backend::new::<OpenCL>().unwrap()
 15 | }
 16 | 
 17 | fn sync_back_and_forth(b: &mut Bencher, backend1: Backend, backend2: Backend, s: usize) {
 18 | 
 19 |     let mem = &mut SharedTensor::<u8>::new(s);
 20 | 
 21 |     // initialize and warm-up
 22 |     let _ = mem.write(&backend2).unwrap();
 23 |     let _ = mem.read_write(&backend1).unwrap();
 24 |     let _ = mem.read_write(&backend2).unwrap();
 25 | 
 26 |     b.bytes = s as u64 * 2; // we do two transfers per iteration
 27 | 
 28 |     b.iter(|| {
 29 |         let _ = mem.read_write(&backend1).unwrap();
 30 |         let _ = mem.read_write(&backend2).unwrap();
 31 |     });
 32 | }
 33 | 
 34 | fn unidirectional_sync(b: &mut Bencher, src: Backend, dst: Backend, size: usize) {
 35 | 
 36 |     let mem = &mut SharedTensor::<u8>::new(size);
 37 | 
 38 |     // initialize and warm-up
 39 |     let _ = mem.write(&src).unwrap();
 40 |     let _ = mem.read(&dst).unwrap();
 41 | 
 42 |     b.bytes = size as u64;
 43 | 
 44 |     b.iter(|| {
 45 |         let _ = mem.write(&src).unwrap();
 46 |         let _ = mem.read(&dst).unwrap();
 47 |     });
 48 | }
 49 | 
 50 | // #[inline(never)]
 51 | // fn bench_256_alloc_1mb_opencl_profile(b: &mut Bencher, device: &OpenCLDevice, size: usize) {
 52 | //     b.iter(|| 
 53 | //         for _ in 0..256 {
 54 | //             let _ = device.allocate_memory(size).unwrap(); });
 55 | // }
 56 | 
 57 | // // #[bench]
 58 | // // fn bench_256_alloc_1mb_opencl_cpu(b: &mut Bencher) {
 59 | // //     let opencl_backend = opencl_backend();
 60 | // //     let cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
 61 | 
 62 | // //     bench_256_alloc_1mb_opencl_profile(b, cpu, 1_048_576);
 63 | // // }
 64 | 
 65 | // // #[bench]
 66 | // // fn bench_256_alloc_1mb_opencl_gpu(b: &mut Bencher) {
 67 | // //     let opencl_backend = opencl_backend();
 68 | // //     let gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
 69 | 
 70 | // //     bench_256_alloc_1mb_opencl_profile(b, gpu, 1_048_576);
 71 | // // }
 72 | 
 73 | // #[bench]
 74 | // fn bench_256_alloc_1mb_opencl(b: &mut Bencher) {
 75 | //     let opencl_backend = opencl_backend();
 76 | //     let ref d = opencl_backend.devices()[0];
 77 | 
 78 | //     bench_256_alloc_1mb_opencl_profile(b, d, 1_048_576);
 79 | // }
 80 | 
 81 | #[bench]
 82 | fn bench_sync_1kb_native_opencl_back_and_forth(b: &mut Bencher) {
 83 |     sync_back_and_forth(b, opencl_backend(), native_backend(), 1024);
 84 | }
 85 | 
 86 | #[bench]
 87 | fn bench_sync_1kb_native_to_opencl(b: &mut Bencher) {
 88 |     unidirectional_sync(b, native_backend(), opencl_backend(), 1024);
 89 | }
 90 | 
 91 | #[bench]
 92 | fn bench_sync_1kb_opencl_to_native(b: &mut Bencher) {
 93 |     unidirectional_sync(b, opencl_backend(), native_backend(), 1024);
 94 | }
 95 | 
 96 | #[bench]
 97 | fn bench_sync_1mb_native_opencl_back_and_forth(b: &mut Bencher) {
 98 |     sync_back_and_forth(b, opencl_backend(), native_backend(), 1_048_576);
 99 | }
100 | 
101 | #[bench]
102 | fn bench_sync_1mb_native_to_opencl(b: &mut Bencher) {
103 |     unidirectional_sync(b, native_backend(), opencl_backend(), 1_048_576);
104 | }
105 | 
106 | #[bench]
107 | fn bench_sync_1mb_opencl_to_native(b: &mut Bencher) {
108 |     unidirectional_sync(b, opencl_backend(), native_backend(), 1_048_576);
109 | }
110 | 
111 | #[bench]
112 | fn bench_sync_128mb_native_opencl_back_and_forth(b: &mut Bencher) {
113 |     sync_back_and_forth(b, opencl_backend(), native_backend(), 128 * 1_048_576);
114 | }
115 | 
116 | #[bench]
117 | fn bench_sync_128mb_native_to_opencl(b: &mut Bencher) {
118 |     unidirectional_sync(b, native_backend(), opencl_backend(), 128 * 1_048_576);
119 | }
120 | 
121 | #[bench]
122 | fn bench_sync_128mb_opencl_to_native(b: &mut Bencher) {
123 |     unidirectional_sync(b, opencl_backend(), native_backend(), 128 * 1_048_576);
124 | }
125 | 
126 | // // fn bench_shared_tensor_access_time_first_(b: &mut Bencher, device: &OpenCLDevice) {
127 | 
128 | // //     let native_backend = native_backend();
129 | // //     let ref native_cpu = native_backend.devices()[0];
130 | 
131 | // //     let mut x = SharedTensor::<f32>::from(vec![128]);
132 | // //     x.write_only(native_cpu).unwrap();
133 | // //     x.write_only(device).unwrap();
134 | // //     x.read(native_cpu).unwrap();
135 | 
136 | // //     b.iter(|| {
137 | // //         let _ = x.read(native_cpu).unwrap();
138 | // //     })
139 | // // }
140 | 
141 | // // #[bench]
142 | // // fn bench_shared_tensor_access_time_first_cpu(b: &mut Bencher) {
143 | // //     let opencl_backend = opencl_backend();
144 | // //     let opencl_cpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Cpu).nth(0).unwrap();
145 | 
146 | // //     bench_shared_tensor_access_time_first_(b, opencl_cpu);
147 | // // }
148 | 
149 | // // #[bench]
150 | // // fn bench_shared_tensor_access_time_first_gpu(b: &mut Bencher) {
151 | // //     let opencl_backend = opencl_backend();
152 | // //     let opencl_gpu = opencl_backend.devices().iter().filter(|d| *d.kind() == Gpu).nth(0).unwrap();
153 | 
154 | // //     bench_shared_tensor_access_time_first_(b, opencl_gpu);
155 | // // }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | 


--------------------------------------------------------------------------------
/crates/parenchyma-blas/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "parenchyma-blas"
 3 | version = "0.0.1"
 4 | authors = ["Jony <jonysy@users.noreply.github.com>"]
 5 | license = "MIT/Apache-2.0"
 6 | 
 7 | [dependencies]
 8 | ocl = "0.16.0"
 9 | rblas = "0.0.13"
10 | 
11 | [dependencies.parenchyma]
12 | path = "../../"
13 | version = "0.0.4"
14 | 
15 | [dev-dependencies]
16 | lazy_static = "1.1.0"


--------------------------------------------------------------------------------
/crates/parenchyma-blas/README.md:
--------------------------------------------------------------------------------
 1 | # parenchyma-blas
 2 | 
 3 | This package provides full BLAS (Basic Linear Algebra Subprograms) support for Parenchyma, so you 
 4 | can use BLAS on servers, desktops or mobiles, GPUs, FPGAs or CPUS, without worrying about OpenCL or 
 5 | CUDA support on the machine.
 6 | 
 7 | ## Provided Operations
 8 | 
 9 | This package provides the following operations to Parenchyma backends:
10 | 
11 | |           | CUDA (cuBLAS) | OpenCL    | Native (rblas)    |
12 | |---        |---            |---        |---                |
13 | | Level 1   | (collenchyma) | ✓         | ✓                 |
14 | | Level 2   | -             | -         | -                 |
15 | | Level 3   | (collenchyma) | (some)    | (some)            | 


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/axpby.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma::error::Result;
 2 | use parenchyma::tensor::SharedTensor;
 3 | 
 4 | /// Extends IBlas with Axpby
 5 | pub trait Axpby: super::Vector {
 6 |     /// Performs the operation y := a*x + b*y .
 7 |     ///
 8 |     /// Consists of a scal(b, y) followed by a axpby(a,x,y).
 9 |     fn axpby(&self, a: &SharedTensor, x: &SharedTensor, b: &SharedTensor, y: &mut SharedTensor) -> Result {
10 |         self.scal(b, y)?;
11 |         self.axpy(a, x, y)?;
12 |         Ok(())
13 |     }
14 | }
15 | 
16 | impl<A> Axpby for A where A: super::Vector {
17 |     // ..
18 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/level1.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma::error::Result;
 2 | use parenchyma::tensor::SharedTensor;
 3 | 
 4 | /// `Vector` consists of level 1 BLAS routines - vector operations on strided arrays.
 5 | pub trait Vector {
 6 |     /// Provides the asum operation.
 7 |     ///
 8 |     /// Computes the sum of the absolute values of the elements of `x`, and the saves the `result`.
 9 |     fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
10 |         unimplemented!("asum")
11 |     }
12 |     /// Provides the axpy operation.
13 |     ///
14 |     /// Computes a vector `x` times a constant `a` plus a vector `y` (i.e., `a * x + y`), and then
15 |     /// saves the result to `y`.
16 |     fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result {
17 |         unimplemented!("axpy")
18 |     }
19 |     /// Provides the copy operation.
20 |     ///
21 |     /// Copies `from.len()` elements of vector `from` into vector `to`.
22 |     fn copy(&self, from: &SharedTensor, to: &mut SharedTensor) -> Result {
23 |         unimplemented!("copy")
24 |     }
25 |     /// Provides the dot operation.
26 |     ///
27 |     /// Computes the [dot product] over `x` and `y`, and then saves the `result`.
28 |     fn dot(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result {
29 |         unimplemented!("dot")
30 |     }
31 |     /// Provides the nrm2 operation.
32 |     ///
33 |     /// Computes the L2 norm (i.e., the euclidean length of vector `x`), and then saves the `result`.
34 |     fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
35 |         unimplemented!("nrm2")
36 |     }
37 |     /// Provides the scal operation.
38 |     ///
39 |     /// Scales a vector `x` by a constant `a` (i.e., `a * x`).
40 |     fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result {
41 |         unimplemented!("scal")
42 |     }
43 |     /// Provides the swap operation.
44 |     ///
45 |     /// Swaps the elements of vector `x` and vector `y`.
46 |     fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result {
47 |         unimplemented!("swap")
48 |     }
49 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/level2.rs:
--------------------------------------------------------------------------------
1 | /// `MatrixVector` consists of level 2 BLAS routines - a generalized matrix-vector multiplication 
2 | /// and more.
3 | pub trait MatrixVector { }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/level3.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma::error::Result;
 2 | use parenchyma::tensor::SharedTensor;
 3 | 
 4 | use super::Transposition;
 5 | 
 6 | // pub struct View<'a>(&'a Array<f32, IxDyn>);
 7 | // pub struct ViewMut<'a>(&'a mut Array<f32, IxDyn>);
 8 | // impl<'a> Matrix<f32> for View<'a> {
 9 | //     fn rows(&self) -> i32 {
10 | //         self.0.rows()
11 | //     }
12 | //     fn cols(&self) -> i32 {
13 | //         self.0.cols()
14 | //     }
15 | //     fn as_ptr(&self) -> *const f32 {
16 | //         unimplemented!()
17 | //     }
18 | //     fn as_mut_ptr(&self) -> *mut f32 {
19 | //         unimplemented!()
20 | //     }
21 | // }
22 | 
23 | pub struct GenericMatrix<'a> {
24 |     /// The factor of matrix A (scalar).
25 |     pub scalar: &'a SharedTensor,
26 |     /// Buffer object storing matrix A.
27 |     pub matrix: &'a SharedTensor,
28 |     /// How matrix A is to be transposed.
29 |     pub transposition: Transposition,
30 | }
31 | 
32 | /// The trait `Matrix` consists of level 3 BLAS routines - matrix-matrix operations, including a 
33 | /// general matrix multiplication.
34 | pub trait Matrix {
35 |     /// Computes a matrix-matrix product with general matrices.
36 |     ///
37 |     /// # Arguments
38 |     ///
39 |     /// * `alpha` - The factor of matrix A (scalar).
40 |     /// * `amatrix_transposition` - How matrix A is to be transposed.
41 |     /// * `amatrix` - The buffer object storing matrix A..
42 |     fn gemm(
43 |         self: &Self,
44 |         alpha: &SharedTensor,
45 |         amatrix_transposition: Transposition,
46 |         amatrix: &SharedTensor,
47 |         bmatrix_transposition: Transposition,
48 |         bmatrix: &SharedTensor,
49 |         beta: &SharedTensor,
50 |         cmatrix: &mut SharedTensor) -> Result {
51 |         unimplemented!()
52 |     }
53 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/mod.rs:
--------------------------------------------------------------------------------
 1 | pub use self::axpby::Axpby;
 2 | pub use self::level1::Vector;
 3 | pub use self::level2::MatrixVector;
 4 | pub use self::level3::{GenericMatrix, Matrix};
 5 | pub use self::transpose::Transposition;
 6 | 
 7 | mod axpby;
 8 | mod level1;
 9 | mod level2;
10 | mod level3;
11 | mod transpose;
12 | 
13 | use parenchyma::extension_package::ExtensionPackage;
14 | 
15 | /// The BLAS package.
16 | pub enum Package {
17 |     Native,
18 |     OpenCL(::frameworks::open_cl::OpenCLPackage),
19 | }
20 | 
21 | impl Package {
22 |     pub fn open_cl(&self) -> &::frameworks::open_cl::OpenCLPackage {
23 |         if let &Package::OpenCL(ref package) = self {
24 |             package
25 |         } else {
26 |             panic!("an Open CL package was expected, but another package was found.")
27 |         }
28 |     }
29 | }
30 | 
31 | /// Provides level 1, 2, and 3 BLAS operations.
32 | ///
33 | /// **note**: should be replaced with an actual trait alias ([RFC#1733]).
34 | ///
35 | /// [RFC#1733]: https://github.com/rust-lang/rfcs/pull/1733
36 | pub trait Extension: Axpby + Vector + MatrixVector + Matrix { }
37 | 
38 | impl ExtensionPackage for Package {
39 |     type Extension = Extension;
40 | 
41 |     fn package_name(&self) -> &'static str {
42 |         return "parenchyma/blas";
43 |     }
44 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/extension_package/transpose.rs:
--------------------------------------------------------------------------------
 1 | use rblas;
 2 | 
 3 | /// Possible transpose operations that can be applied in Level 2 and Level 3 BLAS operations.
 4 | #[derive(Clone, Copy, Debug, Eq, PartialEq)]
 5 | pub enum Transposition {
 6 |     /// Take the conjugate transpose of the matrix.
 7 |     ConjugateTranspose,
 8 |     /// Take the matrix as it is.
 9 |     NoTranspose,
10 |     /// Take the transpose of the matrix.
11 |     Transpose,
12 | }
13 | 
14 | impl Into<rblas::attribute::Transpose> for Transposition {
15 |     /// Converts a `Transposition` to an rblas `Transpose`.
16 |     fn into(self) -> rblas::attribute::Transpose {
17 |         match self {
18 |             Transposition::ConjugateTranspose => rblas::attribute::Transpose::ConjTrans,
19 |             Transposition::NoTranspose => rblas::attribute::Transpose::NoTrans,
20 |             Transposition::Transpose => rblas::attribute::Transpose::Trans,
21 |         }
22 |     }
23 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod native;
2 | pub mod open_cl;


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/native/mod.rs:
--------------------------------------------------------------------------------
  1 | use parenchyma::error::{Error, ErrorKind, Result};
  2 | use parenchyma::extension_package::Dependency;
  3 | use parenchyma::frameworks::NativeContext as Context;
  4 | use parenchyma::tensor::SharedTensor;
  5 | 
  6 | use rblas;
  7 | use rblas::math::mat::Mat;
  8 | use rblas::matrix::Matrix as IMatrix;
  9 | 
 10 | use super::super::{Extension, Package, Transposition};
 11 | use super::super::extension_package::{Matrix, MatrixVector, Vector};
 12 | 
 13 | impl<P> Extension for Context<P> where P: Dependency<Package> { }
 14 | 
 15 | impl<P> Vector for Context<P> where P: Dependency<Package> {
 16 |     fn asum(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 17 |         result.as_mut_slice_unsynched()?[0] = rblas::Asum::asum(x.as_slice()?);
 18 |         Ok(())
 19 |     }
 20 | 
 21 |     fn axpy(&self, a: &SharedTensor, x: &SharedTensor, y: &mut SharedTensor) -> Result {
 22 |         Ok(rblas::Axpy::axpy(
 23 |             a.as_slice()?.get(0)
 24 |                 .ok_or_else(|| Error::new(ErrorKind::Other, "Index out of bounds"))?, 
 25 |             x.as_slice()?, 
 26 |             y.as_mut_slice()?
 27 |         ))
 28 |     }
 29 | 
 30 |     fn copy(&self, from: &SharedTensor, to: &mut SharedTensor) -> Result {
 31 |         Ok(rblas::Copy::copy(
 32 |             from.as_slice()?, to.as_mut_slice_unsynched()?))
 33 |     }
 34 | 
 35 |     fn dot(&self, x: &SharedTensor, y: &SharedTensor, result: &mut SharedTensor) -> Result {
 36 |         result.as_mut_slice_unsynched()?[0] =  rblas::Dot::dot(x.as_slice()?, y.as_slice()?);
 37 |         Ok(())
 38 |     }
 39 | 
 40 |     fn nrm2(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 41 |         result.as_mut_slice_unsynched()?[0] =  rblas::Nrm2::nrm2(x.as_slice()?);
 42 |         Ok(())
 43 |     }
 44 | 
 45 |     fn scal(&self, a: &SharedTensor, x: &mut SharedTensor) -> Result {
 46 |         Ok(rblas::Scal::scal(
 47 |             a.as_slice()?.get(0)
 48 |                 .ok_or_else(|| Error::new(ErrorKind::Other, "Index out of bounds"))?, 
 49 |             x.as_mut_slice()?
 50 |         ))
 51 |     }
 52 | 
 53 |     fn swap(&self, x: &mut SharedTensor, y: &mut SharedTensor) -> Result {
 54 |         Ok(rblas::Swap::swap(x.as_mut_slice()?, y.as_mut_slice()?))
 55 |     }
 56 | }
 57 | 
 58 | impl<P> Matrix for Context<P> where P: Dependency<Package> {
 59 |     fn gemm(
 60 |         self: &Self,
 61 |         alpha: &SharedTensor,
 62 |         amatrix_transposition: Transposition,
 63 |         amatrix: &SharedTensor,
 64 |         bmatrix_transposition: Transposition,
 65 |         bmatrix: &SharedTensor,
 66 |         beta: &SharedTensor,
 67 |         cmatrix: &mut SharedTensor) -> Result {
 68 | 
 69 |         let a_0 = amatrix.shape().dimensions()[0] as i32;
 70 |         let a_1 = amatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
 71 | 
 72 |         let b_0 = bmatrix.shape().dimensions()[0] as i32;
 73 |         let b_1 = bmatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
 74 | 
 75 |         let c_0 = cmatrix.shape().dimensions()[0] as i32;
 76 |         let c_1 = cmatrix.shape().dimensions().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
 77 | 
 78 |         let input = as_matrix(amatrix.as_slice()?, a_0 as usize, a_1 as usize);
 79 |         let weights = as_matrix(bmatrix.as_slice()?, b_0 as usize, b_1 as usize);
 80 |         let mut output = as_matrix(cmatrix.as_slice()?, c_0 as usize, c_1 as usize);
 81 | 
 82 |         rblas::Gemm::gemm(
 83 |             &alpha.as_slice()?[0], 
 84 |             amatrix_transposition.into(), 
 85 |             &input, 
 86 | 
 87 |             bmatrix_transposition.into(),
 88 |             &weights,
 89 |             &beta.as_slice()?[0], 
 90 | 
 91 |             &mut output
 92 |         );
 93 | 
 94 |         read_from_matrix(&output, cmatrix.as_mut_slice()?);
 95 | 
 96 |         Ok(())
 97 |     }
 98 | }
 99 | 
100 | fn as_matrix(slice: &[f32], nrows: usize, ncols: usize) -> Mat<f32> {
101 |     let mut mat: Mat<f32> = Mat::new(nrows, ncols);
102 | 
103 |     for i in 0..nrows {
104 |         for j in 0..ncols {
105 |             let index = ncols * i + j;
106 |             unsafe {
107 |                 *mat.as_mut_ptr().offset(index as isize) = slice[index].clone();
108 |             }
109 |         }
110 |     }
111 | 
112 |     mat
113 | }
114 | 
115 | fn read_from_matrix(mat: &Mat<f32>, slice: &mut [f32]) {
116 |     let n = mat.rows();
117 |     let m = mat.cols();
118 |     for i in 0..n {
119 |         for j in 0..m {
120 |             let index = m * i + j;
121 |             slice[index] = mat[i][j].clone();
122 |         }
123 |     }
124 | }
125 | 
126 | impl<P> MatrixVector for Context<P> where P: Dependency<Package> { }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/implementation/level1.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma;
 2 | use parenchyma::open_cl::OpenCLContext;
 3 | use parenchyma::{Context, SharedTensor};
 4 | use parenchyma::error::Result;
 5 | 
 6 | use extension::Vector;
 7 | 
 8 | impl Vector for OpenCLContext {
 9 | 
10 |     fn asum(
11 |         &self, 
12 |         x: &SharedTensor<f32>, 
13 |         result: &mut SharedTensor<f32>) -> Result {
14 | 
15 |         unimplemented!()
16 |     }
17 | 
18 |     fn axpy(
19 |         &self, 
20 |         a: &SharedTensor<f32>, 
21 |         x: &SharedTensor<f32>, 
22 |         y: &mut SharedTensor<f32>) -> Result {
23 | 
24 |         let kernel: ::ocl::Kernel = unimplemented!();
25 | 
26 |         let n = x.shape().capacity;
27 | 
28 |         let alpha = parenchyma::tensor(self, alpha)?;
29 |         let x = parenchyma::tensor(self, x)?;
30 |         let y = parenchyma::tensor_mut(self, y)?;
31 | 
32 |         let offset = 0;
33 |         let inc = 1;
34 | 
35 |         kernel
36 |             .arg_scl(n)
37 |             .arg_buf(alpha)
38 |             .arg_buf(x).arg_scl(offset).arg_scl(inc)
39 |             .arg_buf(y).arg_scl(offset).arg_scl(inc)
40 |         //     //.gwo(..)
41 |         //     .gws([WGS, 1, 1])
42 |         //     .lws([WGS, 1, 1])
43 |         //     // todo The queue must be associated with a device associated with the kernel's program.
44 |             .queue(self.active_direct().queue().clone())
45 |             .enq()?;
46 | 
47 | 
48 |         Ok(())
49 |     }
50 | 
51 |     fn copy(
52 |         &self, 
53 |         from: &SharedTensor<f32>, 
54 |         to: &mut SharedTensor<f32>) -> Result {
55 | 
56 |         unimplemented!()
57 |     }
58 | 
59 |     fn dot(
60 |         &self, 
61 |         x: &SharedTensor<f32>, 
62 |         y: &SharedTensor<f32>, 
63 |         result: &mut SharedTensor<f32>) -> Result {
64 | 
65 |         unimplemented!()
66 |     }
67 | 
68 |     fn nrm2(
69 |         &self, 
70 |         x: &SharedTensor<f32>, 
71 |         result: &mut SharedTensor<f32>) -> Result {
72 | 
73 |         unimplemented!()
74 |     }
75 | 
76 |     fn scal(
77 |         &self, 
78 |         a: &SharedTensor<f32>, 
79 |         x: &mut SharedTensor<f32>) -> Result {
80 | 
81 |         unimplemented!()
82 |     }
83 | 
84 |     fn swap(
85 |         &self, 
86 |         x: &mut SharedTensor<f32>, 
87 |         y: &mut SharedTensor<f32>) -> Result {
88 | 
89 |         unimplemented!()
90 |     }
91 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/implementation/mod.rs:
--------------------------------------------------------------------------------
1 | mod level1;


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/package.rs:
--------------------------------------------------------------------------------
 1 | use ocl;
 2 | use std::ffi::CString;
 3 | use parenchyma::error::Result;
 4 | use parenchyma::frameworks::OpenCLContext;
 5 | 
 6 | // const WGS: usize = 64;
 7 | // const WGS1: usize = 64;
 8 | // const WGS2: usize = 64;
 9 | 
10 | // /// Caches instances of `Kernel`
11 | // #[derive(Debug)]
12 | // pub struct OpenCLPackage {
13 | //     pub(in super) program: ocl::Program,
14 | //     asum: [ocl::Kernel; 2],
15 | //     pub(in super) axpy: ocl::Kernel,
16 | //     copy: ocl::Kernel,
17 | //     dot: [ocl::Kernel; 2],
18 | //     nrm2: [ocl::Kernel; 2],
19 | //     scal: ocl::Kernel,
20 | //     swap: ocl::Kernel,
21 | 
22 | //     gemm_direct: Gemm,
23 | // }
24 | 
25 | // #[derive(Debug)]
26 | // pub struct Gemm {
27 | //     tt: ocl::Kernel,
28 | //     tn: ocl::Kernel,
29 | //     nt: ocl::Kernel,
30 | //     nn: ocl::Kernel,
31 | // }
32 | 
33 | /// Caches instances of `Kernel`
34 | #[derive(Debug)]
35 | pub struct OpenCLPackage {
36 |     pub(in frameworks::open_cl) program: ocl::Program,
37 | }
38 | 
39 | impl OpenCLPackage {
40 |     pub fn compile(cx: &mut OpenCLContext<()>) -> Result<OpenCLPackage> {
41 |         let program = cx.program(vec![
42 |             CString::new(include_str!("source/common.cl")).unwrap(),
43 | 
44 |             CString::new(include_str!("source/level1/level1.cl")).unwrap(),
45 |             CString::new(include_str!("source/level1/xasum.cl")).unwrap(),
46 |             CString::new(include_str!("source/level1/xaxpy.cl")).unwrap(),
47 |             CString::new(include_str!("source/level1/xcopy.cl")).unwrap(),
48 |             CString::new(include_str!("source/level1/xdot.cl")).unwrap(),
49 |             CString::new(include_str!("source/level1/xnrm2.cl")).unwrap(),
50 |             CString::new(include_str!("source/level1/xscal.cl")).unwrap(),
51 |             CString::new(include_str!("source/level1/xswap.cl")).unwrap(),
52 | 
53 |             CString::new(include_str!("source/level3/level3.cl")).unwrap(),
54 |             CString::new(include_str!("source/level3/xgemm_direct_part1.cl")).unwrap(),
55 |             CString::new(include_str!("source/level3/xgemm_direct_part2.cl")).unwrap(),
56 |             CString::new(include_str!("source/level3/xgemm_direct_part3.cl")).unwrap(),
57 |         ])?;
58 | 
59 |         // Ok(OpenCLPackage {
60 |         //     asum: [ocl::Kernel::new("Xasum", &program)?, ocl::Kernel::new("XasumEpilogue", &program)?],
61 |         //     axpy: ocl::Kernel::new("Xaxpy", &program)?, 
62 |         //     copy: ocl::Kernel::new("Xcopy", &program)?, 
63 |         //     dot: [ocl::Kernel::new("Xdot", &program)?, ocl::Kernel::new("XdotEpilogue", &program)?],
64 |         //     nrm2: [ocl::Kernel::new("Xnrm2", &program)?, ocl::Kernel::new("Xnrm2Epilogue", &program)?],
65 |         //     scal: ocl::Kernel::new("Xscal", &program)?,
66 |         //     swap: ocl::Kernel::new("Xswap", &program)?,
67 | 
68 |         //     gemm_direct: Gemm {
69 |         //         tt: ocl::Kernel::new("XgemmDirectTT", &program)?,
70 |         //         tn: ocl::Kernel::new("XgemmDirectTN", &program)?,
71 |         //         nt: ocl::Kernel::new("XgemmDirectNT", &program)?,
72 |         //         nn: ocl::Kernel::new("XgemmDirectNN", &program)?,
73 |         //     },
74 | 
75 |         //     program,
76 |         // })
77 | 
78 |         Ok(OpenCLPackage { program })
79 |     }
80 | }


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/common.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // =================================================================================================
  3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
  4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
  5 | // width of 100 characters per line.
  6 | //
  7 | // Author(s):
  8 | //   Cedric Nugteren <www.cedricnugteren.nl>
  9 | //
 10 | // This file contains the common defines and type-defs for the CLBlast OpenCL kernels.
 11 | //
 12 | // =================================================================================================
 13 | 
 14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 15 | // literal). Comment-out this line for syntax-highlighting when developing.
 16 | 
 17 | // =================================================================================================
 18 | 
 19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 20 | // this file is used outside of the CLBlast library.
 21 | #ifndef PRECISION
 22 |   #define PRECISION 32      // Data-types: half, single or double precision, complex or regular
 23 | #endif
 24 | 
 25 | // =================================================================================================
 26 | 
 27 | // Enable support for double-precision
 28 | #if PRECISION == 16
 29 |   #pragma OPENCL EXTENSION cl_khr_fp16: enable
 30 | #endif
 31 | 
 32 | // Enable support for double-precision
 33 | #if PRECISION == 64 || PRECISION == 6464
 34 |   #if __OPENCL_VERSION__ <= CL_VERSION_1_1
 35 |      #pragma OPENCL EXTENSION cl_khr_fp64: enable
 36 |   #endif
 37 | #endif
 38 | 
 39 | // Half-precision
 40 | #if PRECISION == 16
 41 |   typedef half real;
 42 |   typedef half2 real2;
 43 |   typedef half4 real4;
 44 |   typedef half8 real8;
 45 |   typedef half16 real16;
 46 |   #define ZERO 0
 47 |   #define ONE 1
 48 |   #define SMALLEST -1.0e14
 49 | 
 50 | // Single-precision
 51 | #elif PRECISION == 32
 52 |   typedef float real;
 53 |   typedef float2 real2;
 54 |   typedef float4 real4;
 55 |   typedef float8 real8;
 56 |   typedef float16 real16;
 57 |   #define ZERO 0.0f
 58 |   #define ONE 1.0f
 59 |   #define SMALLEST -1.0e37f
 60 | 
 61 | // Double-precision 
 62 | #elif PRECISION == 64
 63 |   typedef double real;
 64 |   typedef double2 real2;
 65 |   typedef double4 real4;
 66 |   typedef double8 real8;
 67 |   typedef double16 real16;
 68 |   #define ZERO 0.0
 69 |   #define ONE 1.0
 70 |   #define SMALLEST -1.0e37
 71 | 
 72 | // Complex single-precision
 73 | #elif PRECISION == 3232
 74 |   typedef struct cfloat {float x; float y;} real;
 75 |   typedef struct cfloat2 {real x; real y;} real2;
 76 |   typedef struct cfloat4 {real x; real y; real z; real w;} real4;
 77 |   typedef struct cfloat8 {real s0; real s1; real s2; real s3;
 78 |                           real s4; real s5; real s6; real s7;} real8;
 79 |   typedef struct cfloat16 {real s0; real s1; real s2; real s3;
 80 |                            real s4; real s5; real s6; real s7;
 81 |                            real s8; real s9; real sA; real sB;
 82 |                            real sC; real sD; real sE; real sF;} real16;
 83 |   #define ZERO 0.0f
 84 |   #define ONE 1.0f
 85 |   #define SMALLEST -1.0e37f
 86 | 
 87 | // Complex double-precision
 88 | #elif PRECISION == 6464
 89 |   typedef struct cdouble {double x; double y;} real;
 90 |   typedef struct cdouble2 {real x; real y;} real2;
 91 |   typedef struct cdouble4 {real x; real y; real z; real w;} real4;
 92 |   typedef struct cdouble8 {real s0; real s1; real s2; real s3;
 93 |                            real s4; real s5; real s6; real s7;} real8;
 94 |   typedef struct cdouble16 {real s0; real s1; real s2; real s3;
 95 |                             real s4; real s5; real s6; real s7;
 96 |                             real s8; real s9; real sA; real sB;
 97 |                             real sC; real sD; real sE; real sF;} real16;
 98 |   #define ZERO 0.0
 99 |   #define ONE 1.0
100 |   #define SMALLEST -1.0e37
101 | #endif
102 | 
103 | // Single-element version of a complex number
104 | #if PRECISION == 3232
105 |   typedef float singlereal;
106 | #elif PRECISION == 6464
107 |   typedef double singlereal;
108 | #else
109 |   typedef real singlereal;
110 | #endif
111 | 
112 | // Converts a 'real argument' value to a 'real' value as passed to the kernel. Normally there is no
113 | // conversion, but half-precision is not supported as kernel argument so it is converted from float.
114 | #if PRECISION == 16
115 |   typedef float real_arg;
116 |   #define GetRealArg(x) (half)x
117 | #else
118 |   typedef real real_arg;
119 |   #define GetRealArg(x) x
120 | #endif
121 | 
122 | // =================================================================================================
123 | 
124 | // Don't use the non-IEEE754 compliant OpenCL built-in mad() instruction per default. For specific
125 | // devices, this is enabled (see src/routine.cc).
126 | #ifndef USE_CL_MAD
127 |   #define USE_CL_MAD 0
128 | #endif
129 | 
130 | // Sets a variable to zero
131 | #if PRECISION == 3232 || PRECISION == 6464
132 |   #define SetToZero(a) a.x = ZERO; a.y = ZERO
133 | #else
134 |   #define SetToZero(a) a = ZERO
135 | #endif
136 | 
137 | // Sets a variable to zero (only the imaginary part)
138 | #if PRECISION == 3232 || PRECISION == 6464
139 |   #define ImagToZero(a) a.y = ZERO
140 | #else
141 |   #define ImagToZero(a) 
142 | #endif
143 | 
144 | // Sets a variable to one
145 | #if PRECISION == 3232 || PRECISION == 6464
146 |   #define SetToOne(a) a.x = ONE; a.y = ZERO
147 | #else
148 |   #define SetToOne(a) a = ONE
149 | #endif
150 | 
151 | // Determines whether a variable is zero
152 | #if PRECISION == 3232 || PRECISION == 6464
153 |   #define IsZero(a) ((a.x == ZERO) && (a.y == ZERO))
154 | #else
155 |   #define IsZero(a) (a == ZERO)
156 | #endif
157 | 
158 | // The absolute value (component-wise)
159 | #if PRECISION == 3232 || PRECISION == 6464
160 |   #define AbsoluteValue(value) value.x = fabs(value.x); value.y = fabs(value.y)
161 | #else
162 |   #define AbsoluteValue(value) value = fabs(value)
163 | #endif
164 | 
165 | // Adds two complex variables
166 | #if PRECISION == 3232 || PRECISION == 6464
167 |   #define Add(c, a, b) c.x = a.x + b.x; c.y = a.y + b.y
168 | #else
169 |   #define Add(c, a, b) c = a + b
170 | #endif
171 | 
172 | // Multiply two complex variables (used in the defines below)
173 | #if PRECISION == 3232 || PRECISION == 6464
174 |   #define MulReal(a, b) a.x*b.x - a.y*b.y
175 |   #define MulImag(a, b) a.x*b.y + a.y*b.x
176 | #endif
177 | 
178 | // The scalar multiply function
179 | #if PRECISION == 3232 || PRECISION == 6464
180 |   #define Multiply(c, a, b) c.x = MulReal(a,b); c.y = MulImag(a,b)
181 | #else
182 |   #define Multiply(c, a, b) c = a * b
183 | #endif
184 | 
185 | // The scalar multiply-add function
186 | #if PRECISION == 3232 || PRECISION == 6464
187 |   #define MultiplyAdd(c, a, b) c.x += MulReal(a,b); c.y += MulImag(a,b)
188 | #else
189 |   #if USE_CL_MAD == 1
190 |     #define MultiplyAdd(c, a, b) c = mad(a, b, c)
191 |   #else
192 |     #define MultiplyAdd(c, a, b) c += a * b
193 |   #endif
194 | #endif
195 | 
196 | // The scalar AXPBY function
197 | #if PRECISION == 3232 || PRECISION == 6464
198 |   #define AXPBY(e, a, b, c, d) e.x = MulReal(a,b) + MulReal(c,d); e.y = MulImag(a,b) + MulImag(c,d)
199 | #else
200 |   #define AXPBY(e, a, b, c, d) e = a*b + c*d
201 | #endif
202 | 
203 | // The complex conjugate operation for complex transforms
204 | #if PRECISION == 3232 || PRECISION == 6464
205 |   #define COMPLEX_CONJUGATE(value) value.x = value.x; value.y = -value.y
206 | #else
207 |   #define COMPLEX_CONJUGATE(value) 
208 | #endif
209 | 
210 | // =================================================================================================
211 | 
212 | // Shuffled workgroup indices to avoid partition camping, see below. For specific devices, this is
213 | // enabled (see src/routine.cc).
214 | #ifndef USE_STAGGERED_INDICES
215 |   #define USE_STAGGERED_INDICES 0
216 | #endif
217 | 
218 | // Staggered/shuffled group indices to avoid partition camping (AMD GPUs). Formula's are taken from:
219 | // http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf
220 | // More details: https://github.com/CNugteren/CLBlast/issues/53
221 | #if USE_STAGGERED_INDICES == 1
222 |   inline size_t GetGroupIDFlat() {
223 |     return get_group_id(0) + get_num_groups(0) * get_group_id(1);
224 |   }
225 |   inline size_t GetGroupID1() {
226 |     return (GetGroupIDFlat()) % get_num_groups(1);
227 |   }
228 |   inline size_t GetGroupID0() {
229 |     return ((GetGroupIDFlat() / get_num_groups(1)) + GetGroupID1()) % get_num_groups(0);
230 |   }
231 | #else
232 |   inline size_t GetGroupID1() { return get_group_id(1); }
233 |   inline size_t GetGroupID0() { return get_group_id(0); }
234 | #endif
235 | 
236 | // =================================================================================================
237 | 
238 | // End of the C++11 raw string literal
239 | 
240 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/level1.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // =================================================================================================
  3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
  4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
  5 | // width of 100 characters per line.
  6 | //
  7 | // Author(s):
  8 | //   Cedric Nugteren <www.cedricnugteren.nl>
  9 | //
 10 | // This file contains the common functions and parameters specific for level 1 BLAS kernels.
 11 | //
 12 | // =================================================================================================
 13 | 
 14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 15 | // literal). Comment-out this line for syntax-highlighting when developing.
 16 | 
 17 | // =================================================================================================
 18 | 
 19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 20 | // this kernel file is used outside of the CLBlast library.
 21 | #ifndef WGS
 22 |   #define WGS 64     // The local work-group size
 23 | #endif
 24 | #ifndef WPT
 25 |   #define WPT 1      // The amount of work-per-thread
 26 | #endif
 27 | #ifndef VW
 28 |   #define VW 1       // Vector width of vectors X and Y
 29 | #endif
 30 | 
 31 | // =================================================================================================
 32 | 
 33 | // Data-widths
 34 | #if VW == 1
 35 |   typedef real realV;
 36 | #elif VW == 2
 37 |   typedef real2 realV;
 38 | #elif VW == 4
 39 |   typedef real4 realV;
 40 | #elif VW == 8
 41 |   typedef real8 realV;
 42 | #elif VW == 16
 43 |   typedef real16 realV;
 44 | #endif
 45 | 
 46 | // =================================================================================================
 47 | 
 48 | // The vectorized multiply function
 49 | inline realV MultiplyVector(realV cvec, const real aval, const realV bvec) {
 50 |   #if VW == 1
 51 |     Multiply(cvec, aval, bvec);
 52 |   #elif VW == 2
 53 |     Multiply(cvec.x, aval, bvec.x);
 54 |     Multiply(cvec.y, aval, bvec.y);
 55 |   #elif VW == 4
 56 |     Multiply(cvec.x, aval, bvec.x);
 57 |     Multiply(cvec.y, aval, bvec.y);
 58 |     Multiply(cvec.z, aval, bvec.z);
 59 |     Multiply(cvec.w, aval, bvec.w);
 60 |   #elif VW == 8
 61 |     Multiply(cvec.s0, aval, bvec.s0);
 62 |     Multiply(cvec.s1, aval, bvec.s1);
 63 |     Multiply(cvec.s2, aval, bvec.s2);
 64 |     Multiply(cvec.s3, aval, bvec.s3);
 65 |     Multiply(cvec.s4, aval, bvec.s4);
 66 |     Multiply(cvec.s5, aval, bvec.s5);
 67 |     Multiply(cvec.s6, aval, bvec.s6);
 68 |     Multiply(cvec.s7, aval, bvec.s7);
 69 |   #elif VW == 16
 70 |     Multiply(cvec.s0, aval, bvec.s0);
 71 |     Multiply(cvec.s1, aval, bvec.s1);
 72 |     Multiply(cvec.s2, aval, bvec.s2);
 73 |     Multiply(cvec.s3, aval, bvec.s3);
 74 |     Multiply(cvec.s4, aval, bvec.s4);
 75 |     Multiply(cvec.s5, aval, bvec.s5);
 76 |     Multiply(cvec.s6, aval, bvec.s6);
 77 |     Multiply(cvec.s7, aval, bvec.s7);
 78 |     Multiply(cvec.s8, aval, bvec.s8);
 79 |     Multiply(cvec.s9, aval, bvec.s9);
 80 |     Multiply(cvec.sA, aval, bvec.sA);
 81 |     Multiply(cvec.sB, aval, bvec.sB);
 82 |     Multiply(cvec.sC, aval, bvec.sC);
 83 |     Multiply(cvec.sD, aval, bvec.sD);
 84 |     Multiply(cvec.sE, aval, bvec.sE);
 85 |     Multiply(cvec.sF, aval, bvec.sF);
 86 |   #endif
 87 |   return cvec;
 88 | }
 89 | 
 90 | // The vectorized multiply-add function
 91 | inline realV MultiplyAddVector(realV cvec, const real aval, const realV bvec) {
 92 |   #if VW == 1
 93 |     MultiplyAdd(cvec, aval, bvec);
 94 |   #elif VW == 2
 95 |     MultiplyAdd(cvec.x, aval, bvec.x);
 96 |     MultiplyAdd(cvec.y, aval, bvec.y);
 97 |   #elif VW == 4
 98 |     MultiplyAdd(cvec.x, aval, bvec.x);
 99 |     MultiplyAdd(cvec.y, aval, bvec.y);
100 |     MultiplyAdd(cvec.z, aval, bvec.z);
101 |     MultiplyAdd(cvec.w, aval, bvec.w);
102 |   #elif VW == 8
103 |     MultiplyAdd(cvec.s0, aval, bvec.s0);
104 |     MultiplyAdd(cvec.s1, aval, bvec.s1);
105 |     MultiplyAdd(cvec.s2, aval, bvec.s2);
106 |     MultiplyAdd(cvec.s3, aval, bvec.s3);
107 |     MultiplyAdd(cvec.s4, aval, bvec.s4);
108 |     MultiplyAdd(cvec.s5, aval, bvec.s5);
109 |     MultiplyAdd(cvec.s6, aval, bvec.s6);
110 |     MultiplyAdd(cvec.s7, aval, bvec.s7);
111 |   #elif VW == 16
112 |     MultiplyAdd(cvec.s0, aval, bvec.s0);
113 |     MultiplyAdd(cvec.s1, aval, bvec.s1);
114 |     MultiplyAdd(cvec.s2, aval, bvec.s2);
115 |     MultiplyAdd(cvec.s3, aval, bvec.s3);
116 |     MultiplyAdd(cvec.s4, aval, bvec.s4);
117 |     MultiplyAdd(cvec.s5, aval, bvec.s5);
118 |     MultiplyAdd(cvec.s6, aval, bvec.s6);
119 |     MultiplyAdd(cvec.s7, aval, bvec.s7);
120 |     MultiplyAdd(cvec.s8, aval, bvec.s8);
121 |     MultiplyAdd(cvec.s9, aval, bvec.s9);
122 |     MultiplyAdd(cvec.sA, aval, bvec.sA);
123 |     MultiplyAdd(cvec.sB, aval, bvec.sB);
124 |     MultiplyAdd(cvec.sC, aval, bvec.sC);
125 |     MultiplyAdd(cvec.sD, aval, bvec.sD);
126 |     MultiplyAdd(cvec.sE, aval, bvec.sE);
127 |     MultiplyAdd(cvec.sF, aval, bvec.sF);
128 |   #endif
129 |   return cvec;
130 | }
131 | 
132 | // =================================================================================================
133 | 
134 | // End of the C++11 raw string literal
135 | 
136 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xasum.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // =================================================================================================
  3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
  4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
  5 | // width of 100 characters per line.
  6 | //
  7 | // Author(s):
  8 | //   Cedric Nugteren <www.cedricnugteren.nl>
  9 | //
 10 | // This file contains the Xasum kernel. It implements a absolute sum computation using reduction
 11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is loaded,
 12 | // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
 13 | // is executed with a single workgroup only, computing the final result.
 14 | //
 15 | // =================================================================================================
 16 | 
 17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 18 | // literal). Comment-out this line for syntax-highlighting when developing.
 19 | 
 20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 21 | // this kernel file is used outside of the CLBlast library.
 22 | #ifndef WGS1
 23 |   #define WGS1 64     // The local work-group size of the main kernel
 24 | #endif
 25 | #ifndef WGS2
 26 |   #define WGS2 64     // The local work-group size of the epilogue kernel
 27 | #endif
 28 | 
 29 | // =================================================================================================
 30 | 
 31 | // The main reduction kernel, performing the loading and the majority of the operation
 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
 33 | void Xasum(const int n,
 34 |            const __global real* restrict xgm, const int x_offset, const int x_inc,
 35 |            __global real* output) {
 36 |   __local real lm[WGS1];
 37 |   const int lid = get_local_id(0);
 38 |   const int wgid = get_group_id(0);
 39 |   const int num_groups = get_num_groups(0);
 40 | 
 41 |   // Performs loading and the first steps of the reduction
 42 |   real acc;
 43 |   SetToZero(acc);
 44 |   int id = wgid*WGS1 + lid;
 45 |   while (id < n) {
 46 |     real x = xgm[id*x_inc + x_offset];
 47 |     #if defined(ROUTINE_SUM) // non-absolute version
 48 |     #else
 49 |       AbsoluteValue(x);
 50 |     #endif
 51 |     Add(acc, acc, x);
 52 |     id += WGS1*num_groups;
 53 |   }
 54 |   lm[lid] = acc;
 55 |   barrier(CLK_LOCAL_MEM_FENCE);
 56 | 
 57 |   // Performs reduction in local memory
 58 |   #pragma unroll
 59 |   for (int s=WGS1/2; s>0; s=s>>1) {
 60 |     if (lid < s) {
 61 |       Add(lm[lid], lm[lid], lm[lid + s]);
 62 |     }
 63 |     barrier(CLK_LOCAL_MEM_FENCE);
 64 |   }
 65 | 
 66 |   // Stores the per-workgroup result
 67 |   if (lid == 0) {
 68 |     output[wgid] = lm[0];
 69 |   }
 70 | }
 71 | 
 72 | // =================================================================================================
 73 | 
 74 | // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 75 | // be launched with a single workgroup only.
 76 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 77 | void XasumEpilogue(const __global real* restrict input,
 78 |                    __global real* asum, const int asum_offset) {
 79 |   __local real lm[WGS2];
 80 |   const int lid = get_local_id(0);
 81 | 
 82 |   // Performs the first step of the reduction while loading the data
 83 |   Add(lm[lid], input[lid], input[lid + WGS2]);
 84 |   barrier(CLK_LOCAL_MEM_FENCE);
 85 | 
 86 |   // Performs reduction in local memory
 87 |   #pragma unroll
 88 |   for (int s=WGS2/2; s>0; s=s>>1) {
 89 |     if (lid < s) {
 90 |       Add(lm[lid], lm[lid], lm[lid + s]);
 91 |     }
 92 |     barrier(CLK_LOCAL_MEM_FENCE);
 93 |   }
 94 | 
 95 |   // Computes the absolute value and stores the final result
 96 |   if (lid == 0) {
 97 |     #if PRECISION == 3232 || PRECISION == 6464
 98 |       asum[asum_offset].x = lm[0].x + lm[0].y; // the result is a non-complex number
 99 |     #else
100 |       asum[asum_offset] = lm[0];
101 |     #endif
102 |   }
103 | }
104 | 
105 | // =================================================================================================
106 | 
107 | // End of the C++11 raw string literal
108 | 
109 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xaxpy.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // =================================================================================================
 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 5 | // width of 100 characters per line.
 6 | //
 7 | // Author(s):
 8 | //   Cedric Nugteren <www.cedricnugteren.nl>
 9 | //
10 | // This file contains the Xaxpy kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
12 | // support vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xaxpy(const int n, 
26 |            const __global real* arg_alpha,
27 |            const __global real* restrict xgm, const int x_offset, const int x_inc,
28 |            __global real* ygm, const int y_offset, const int y_inc) {
29 |   const real alpha = GetRealArg(arg_alpha[0]);
30 | 
31 |   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
32 |   #pragma unroll
33 |   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
34 |     real xvalue = xgm[id*x_inc + x_offset];
35 |     MultiplyAdd(ygm[id*y_inc + y_offset], alpha, xvalue);
36 |   }
37 | }
38 | 
39 | // =================================================================================================
40 | 
41 | // End of the C++11 raw string literal
42 | 
43 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xcopy.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // =================================================================================================
 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 5 | // width of 100 characters per line.
 6 | //
 7 | // Author(s):
 8 | //   Cedric Nugteren <www.cedricnugteren.nl>
 9 | //
10 | // This file contains the Xcopy kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
12 | // support vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xcopy(const int n,
26 |            const __global real* restrict xgm, const int x_offset, const int x_inc,
27 |            __global real* ygm, const int y_offset, const int y_inc) {
28 | 
29 |   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
30 |   #pragma unroll
31 |   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
32 |     ygm[id*y_inc + y_offset] = xgm[id*x_inc + x_offset];
33 |   }
34 | }
35 | 
36 | // =================================================================================================
37 | 
38 | // End of the C++11 raw string literal
39 | 
40 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xdot.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // =================================================================================================
  3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
  4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
  5 | // width of 100 characters per line.
  6 | //
  7 | // Author(s):
  8 | //   Cedric Nugteren <www.cedricnugteren.nl>
  9 | //
 10 | // This file contains the Xdot kernel. It implements a dot-product computation using reduction
 11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X and Y vectors are
 12 | // multiplied, followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
 13 | // is executed with a single workgroup only, computing the final result.
 14 | //
 15 | // =================================================================================================
 16 | 
 17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 18 | // literal). Comment-out this line for syntax-highlighting when developing.
 19 | 
 20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 21 | // this kernel file is used outside of the CLBlast library.
 22 | #ifndef WGS1
 23 |   #define WGS1 64     // The local work-group size of the main kernel
 24 | #endif
 25 | #ifndef WGS2
 26 |   #define WGS2 64     // The local work-group size of the epilogue kernel
 27 | #endif
 28 | 
 29 | // =================================================================================================
 30 | 
 31 | // The main reduction kernel, performing the multiplication and the majority of the sum operation
 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
 33 | void Xdot(const int n,
 34 |           const __global real* restrict xgm, const int x_offset, const int x_inc,
 35 |           const __global real* restrict ygm, const int y_offset, const int y_inc,
 36 |           __global real* output, const int do_conjugate) {
 37 |   __local real lm[WGS1];
 38 |   const int lid = get_local_id(0);
 39 |   const int wgid = get_group_id(0);
 40 |   const int num_groups = get_num_groups(0);
 41 | 
 42 |   // Performs multiplication and the first steps of the reduction
 43 |   real acc;
 44 |   SetToZero(acc);
 45 |   int id = wgid*WGS1 + lid;
 46 |   while (id < n) {
 47 |     real x = xgm[id*x_inc + x_offset];
 48 |     real y = ygm[id*y_inc + y_offset];
 49 |     if (do_conjugate) { COMPLEX_CONJUGATE(x); }
 50 |     MultiplyAdd(acc, x, y);
 51 |     id += WGS1*num_groups;
 52 |   }
 53 |   lm[lid] = acc;
 54 |   barrier(CLK_LOCAL_MEM_FENCE);
 55 | 
 56 |   // Performs reduction in local memory
 57 |   #pragma unroll
 58 |   for (int s=WGS1/2; s>0; s=s>>1) {
 59 |     if (lid < s) {
 60 |       Add(lm[lid], lm[lid], lm[lid + s]);
 61 |     }
 62 |     barrier(CLK_LOCAL_MEM_FENCE);
 63 |   }
 64 | 
 65 |   // Stores the per-workgroup result
 66 |   if (lid == 0) {
 67 |     output[wgid] = lm[0];
 68 |   }
 69 | }
 70 | 
 71 | // =================================================================================================
 72 | 
 73 | // The epilogue reduction kernel, performing the final bit of the sum operation. This kernel has to
 74 | // be launched with a single workgroup only.
 75 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 76 | void XdotEpilogue(const __global real* restrict input,
 77 |                   __global real* dot, const int dot_offset) {
 78 |   __local real lm[WGS2];
 79 |   const int lid = get_local_id(0);
 80 | 
 81 |   // Performs the first step of the reduction while loading the data
 82 |   Add(lm[lid], input[lid], input[lid + WGS2]);
 83 |   barrier(CLK_LOCAL_MEM_FENCE);
 84 | 
 85 |   // Performs reduction in local memory
 86 |   #pragma unroll
 87 |   for (int s=WGS2/2; s>0; s=s>>1) {
 88 |     if (lid < s) {
 89 |       Add(lm[lid], lm[lid], lm[lid + s]);
 90 |     }
 91 |     barrier(CLK_LOCAL_MEM_FENCE);
 92 |   }
 93 | 
 94 |   // Stores the final result
 95 |   if (lid == 0) {
 96 |     dot[dot_offset] = lm[0];
 97 |   }
 98 | }
 99 | 
100 | // =================================================================================================
101 | 
102 | // End of the C++11 raw string literal
103 | 
104 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xnrm2.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // =================================================================================================
  3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
  4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
  5 | // width of 100 characters per line.
  6 | //
  7 | // Author(s):
  8 | //   Cedric Nugteren <www.cedricnugteren.nl>
  9 | //
 10 | // This file contains the Xnrm2 kernel. It implements a squared norm computation using reduction
 11 | // kernels. Reduction is split in two parts. In the first (main) kernel the X vector is squared,
 12 | // followed by a per-thread and a per-workgroup reduction. The second (epilogue) kernel
 13 | // is executed with a single workgroup only, computing the final result.
 14 | //
 15 | // =================================================================================================
 16 | 
 17 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
 18 | // literal). Comment-out this line for syntax-highlighting when developing.
 19 | 
 20 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
 21 | // this kernel file is used outside of the CLBlast library.
 22 | #ifndef WGS1
 23 |   #define WGS1 64     // The local work-group size of the main kernel
 24 | #endif
 25 | #ifndef WGS2
 26 |   #define WGS2 64     // The local work-group size of the epilogue kernel
 27 | #endif
 28 | 
 29 | // =================================================================================================
 30 | 
 31 | // The main reduction kernel, performing the multiplication and the majority of the operation
 32 | __kernel __attribute__((reqd_work_group_size(WGS1, 1, 1)))
 33 | void Xnrm2(const int n,
 34 |            const __global real* restrict xgm, const int x_offset, const int x_inc,
 35 |            __global real* output) {
 36 |   __local real lm[WGS1];
 37 |   const int lid = get_local_id(0);
 38 |   const int wgid = get_group_id(0);
 39 |   const int num_groups = get_num_groups(0);
 40 | 
 41 |   // Performs multiplication and the first steps of the reduction
 42 |   real acc;
 43 |   SetToZero(acc);
 44 |   int id = wgid*WGS1 + lid;
 45 |   while (id < n) {
 46 |     real x1 = xgm[id*x_inc + x_offset];
 47 |     real x2 = x1;
 48 |     COMPLEX_CONJUGATE(x2);
 49 |     MultiplyAdd(acc, x1, x2);
 50 |     id += WGS1*num_groups;
 51 |   }
 52 |   lm[lid] = acc;
 53 |   barrier(CLK_LOCAL_MEM_FENCE);
 54 | 
 55 |   // Performs reduction in local memory
 56 |   #pragma unroll
 57 |   for (int s=WGS1/2; s>0; s=s>>1) {
 58 |     if (lid < s) {
 59 |       Add(lm[lid], lm[lid], lm[lid + s]);
 60 |     }
 61 |     barrier(CLK_LOCAL_MEM_FENCE);
 62 |   }
 63 | 
 64 |   // Stores the per-workgroup result
 65 |   if (lid == 0) {
 66 |     output[wgid] = lm[0];
 67 |   }
 68 | }
 69 | 
 70 | // =================================================================================================
 71 | 
 72 | // The epilogue reduction kernel, performing the final bit of the operation. This kernel has to
 73 | // be launched with a single workgroup only.
 74 | __kernel __attribute__((reqd_work_group_size(WGS2, 1, 1)))
 75 | void Xnrm2Epilogue(const __global real* restrict input,
 76 |                    __global real* nrm2, const int nrm2_offset) {
 77 |   __local real lm[WGS2];
 78 |   const int lid = get_local_id(0);
 79 | 
 80 |   // Performs the first step of the reduction while loading the data
 81 |   Add(lm[lid], input[lid], input[lid + WGS2]);
 82 |   barrier(CLK_LOCAL_MEM_FENCE);
 83 | 
 84 |   // Performs reduction in local memory
 85 |   #pragma unroll
 86 |   for (int s=WGS2/2; s>0; s=s>>1) {
 87 |     if (lid < s) {
 88 |       Add(lm[lid], lm[lid], lm[lid + s]);
 89 |     }
 90 |     barrier(CLK_LOCAL_MEM_FENCE);
 91 |   }
 92 | 
 93 |   // Computes the square root and stores the final result
 94 |   if (lid == 0) {
 95 |     #if PRECISION == 3232 || PRECISION == 6464
 96 |       nrm2[nrm2_offset].x = sqrt(lm[0].x); // the result is a non-complex number
 97 |     #else
 98 |       nrm2[nrm2_offset] = sqrt(lm[0]);
 99 |     #endif
100 |   }
101 | }
102 | 
103 | // =================================================================================================
104 | 
105 | // End of the C++11 raw string literal
106 | 
107 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xscal.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // =================================================================================================
 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 5 | // width of 100 characters per line.
 6 | //
 7 | // Author(s):
 8 | //   Cedric Nugteren <www.cedricnugteren.nl>
 9 | //
10 | // This file contains the Xscal kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=1) and no offsets (offx=0). Another version is more general, but doesn't support
12 | // vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xscal(const int n, const __global real* arg_alpha,
26 |            __global real* xgm, const int x_offset, const int x_inc) {
27 |   const real alpha = GetRealArg(arg_alpha[0]);
28 | 
29 |   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
30 |   #pragma unroll
31 |   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
32 |     real xvalue = xgm[id*x_inc + x_offset];
33 |     real result;
34 |     Multiply(result, alpha, xvalue);
35 |     xgm[id*x_inc + x_offset] = result;
36 |   }
37 | }
38 | 
39 | // =================================================================================================
40 | 
41 | // End of the C++11 raw string literal
42 | 
43 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level1/xswap.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // =================================================================================================
 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 5 | // width of 100 characters per line.
 6 | //
 7 | // Author(s):
 8 | //   Cedric Nugteren <www.cedricnugteren.nl>
 9 | //
10 | // This file contains the Xswap kernel. It contains one fast vectorized version in case of unit
11 | // strides (incx=incy=1) and no offsets (offx=offy=0). Another version is more general, but doesn't
12 | // support vector data-types.
13 | //
14 | // This kernel uses the level-1 BLAS common tuning parameters.
15 | //
16 | // =================================================================================================
17 | 
18 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
19 | // literal). Comment-out this line for syntax-highlighting when developing.
20 | 
21 | // =================================================================================================
22 | 
23 | // Full version of the kernel with offsets and strided accesses
24 | __kernel __attribute__((reqd_work_group_size(WGS, 1, 1)))
25 | void Xswap(const int n,
26 |            __global real* xgm, const int x_offset, const int x_inc,
27 |            __global real* ygm, const int y_offset, const int y_inc) {
28 | 
29 |   // Loops over the work that needs to be done (allows for an arbitrary number of threads)
30 |   #pragma unroll
31 |   for (int id = get_global_id(0); id<n; id += get_global_size(0)) {
32 |     real temp = xgm[id*x_inc + x_offset];
33 |     xgm[id*x_inc + x_offset] = ygm[id*y_inc + y_offset];
34 |     ygm[id*y_inc + y_offset] = temp;
35 |   }
36 | }
37 | 
38 | // =================================================================================================
39 | 
40 | // End of the C++11 raw string literal
41 | 
42 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/frameworks/open_cl/source/level3/level3.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // =================================================================================================
 3 | // This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
 4 | // project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
 5 | // width of 100 characters per line.
 6 | //
 7 | // Author(s):
 8 | //   Cedric Nugteren <www.cedricnugteren.nl>
 9 | //
10 | // This file contains the common functions and parameters specific for level 3 BLAS kernels.
11 | //
12 | // =================================================================================================
13 | 
14 | // Enables loading of this file using the C++ pre-processor's #include (C++11 standard raw string
15 | // literal). Comment-out this line for syntax-highlighting when developing.
16 | 
17 | // =================================================================================================
18 | 
19 | // Parameters set by the tuner or by the database. Here they are given a basic default value in case
20 | // this kernel file is used outside of the CLBlast library.
21 | 
22 | // For the 'fast' copy kernel
23 | #ifndef COPY_DIMX
24 |   #define COPY_DIMX 8      // Local workgroup size in the first dimension (x)
25 | #endif
26 | #ifndef COPY_DIMY
27 |   #define COPY_DIMY 8      // Local workgroup size in the second dimension (y)
28 | #endif
29 | #ifndef COPY_WPT
30 |   #define COPY_WPT 1       // Work per thread in the first dimension (x)
31 | #endif
32 | #ifndef COPY_VW
33 |   #define COPY_VW 1        // Vector width in the second dimension (y)
34 | #endif
35 | 
36 | // For the padding/copy kernels and the conversion kernels
37 | #ifndef PAD_DIMX
38 |   #define PAD_DIMX 8      // Local workgroup size in the first dimension (x)
39 | #endif
40 | #ifndef PAD_DIMY
41 |   #define PAD_DIMY 8      // Local workgroup size in the second dimension (y)
42 | #endif
43 | #ifndef PAD_WPTX
44 |   #define PAD_WPTX 1      // Work per thread in the first dimension (x)
45 | #endif
46 | #ifndef PAD_WPTY
47 |   #define PAD_WPTY 1      // Work per thread in the second dimension (y)
48 | #endif
49 | 
50 | // For the 'fast' transpose kernel
51 | #ifndef TRA_DIM
52 |   #define TRA_DIM 8       // Number of local threads in the two dimensions (x,y)
53 | #endif
54 | #ifndef TRA_WPT
55 |   #define TRA_WPT 1       // Work per thread in one dimension and vector-width in the other
56 | #endif
57 | #ifndef TRA_PAD
58 |   #define TRA_PAD 0       // Padding of the local memory to avoid bank-conflicts
59 | #endif
60 | #ifndef TRA_SHUFFLE
61 |   #define TRA_SHUFFLE 0   // Shuffling of the global indices to avoid global memory bank-conflicts
62 | #endif
63 | 
64 | // For the padding/transpose kernels
65 | #ifndef PADTRA_TILE
66 |   #define PADTRA_TILE 8   // Number of local threads in the two dimensions (x,y)
67 | #endif
68 | #ifndef PADTRA_WPT
69 |   #define PADTRA_WPT 1    // Amount of work per thread
70 | #endif
71 | #ifndef PADTRA_PAD
72 |   #define PADTRA_PAD 0    // Padding of the local memory to avoid bank-conflicts
73 | #endif
74 | 
75 | // =================================================================================================
76 | 
77 | // End of the C++11 raw string literal
78 | 
79 | // =================================================================================================


--------------------------------------------------------------------------------
/crates/parenchyma-blas/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! Parenchyma extension package for backend-agnostic BLAS operations.
 2 | //!
 3 | //! Provides backend-agnostic [BLAS] operations for [Parenchyma].
 4 | //!
 5 | //! BLAS (Basic Linear Algebra Subprograms) is a specification that prescribes a set of low-level
 6 | //! routines for performing common linear algebra operations such as vector addition, scalar
 7 | //! multiplication, dot products, linear combinations, and matrix multiplication. They are the de
 8 | //! facto standard low-level routines for linear algebra libraries; the routines have bindings for
 9 | //! both C and Fortran. Although the BLAS specification is general, BLAS implementations are often
10 | //! optimized for speed on a particular machine, so using them can bring substantial performance
11 | //! benefits. BLAS implementations will take advantage of special floating point hardware such as
12 | //! vector registers or SIMD instructions.<br/>
13 | //!
14 | //! # Overview
15 | //!
16 | //! A Parenchyma extension package provides functionality through two types:
17 | //!
18 | //! * __Package__
19 | //! This enum provides the actual initialized functions.
20 | //!
21 | //! * __Extension__
22 | //! This trait provides methods that specify the exact backend-agnostic behavior of a collection of
23 | //! operations. Since a shared tensor completely manages memory, tensors can simply be passed in as
24 | //! arguments for the fastest possible execution.
25 | //!
26 | //! Aside from the generic functionality provided by the two traits, the extension can be further
27 | //! extended.
28 | //!
29 | //! For more information, read the documentation.
30 | //!
31 | //! # Example Usage
32 | //!
33 | //! ```ignore
34 | //! #[macro_use(array)]
35 | //! extern crate parenchyma;
36 | //! extern crate parenchyma_blas as blas;
37 | //!
38 | //! use parenchyma::frameworks::Native;
39 | //! use parenchyma::prelude::*;
40 | //!
41 | //! let backend: Backend<blas::Package> = Backend::new::<Native>()?;
42 | //! let ref x: SharedTensor = array![[1.5, 2.5, 3.5], [4.5, 5.5, 6.6]].into();
43 | //! let ref mut result: SharedTensor = array![0.0].into();
44 | //!
45 | //! backend.asum(x, result)?;
46 | //!
47 | //! println!("{:?}", result);
48 | //! ```
49 | //!
50 | //! [BLAS]: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms
51 | //! [Parenchyma]: https://github.com/lychee-eng/parenchyma
52 | #![allow(unused_variables)]
53 | #![feature(non_modrs_mods, type_ascription)]
54 | 
55 | extern crate ocl;
56 | extern crate parenchyma;
57 | extern crate rblas;
58 | 
59 | pub use self::extension_package::{Extension, GenericMatrix, Package, Transposition};
60 | pub mod frameworks;
61 | 
62 | mod extension_package;


--------------------------------------------------------------------------------
/crates/parenchyma-blas/tests/blas_specs.rs:
--------------------------------------------------------------------------------
  1 | #![feature(rustc_private)]
  2 | 
  3 | #[macro_use]
  4 | extern crate lazy_static;
  5 | #[macro_use(array)]
  6 | extern crate parenchyma;
  7 | extern crate parenchyma_blas;
  8 | 
  9 | #[cfg(test)]
 10 | mod blas_specification_native {
 11 |     use parenchyma::frameworks::Native;
 12 |     use parenchyma::prelude::*;
 13 |     use parenchyma_blas::*;
 14 | 
 15 |     struct TestBackend(Backend<Package>);
 16 | 
 17 |     impl ::std::ops::Deref for TestBackend {
 18 |         type Target = Backend<Package>;
 19 |         fn deref(&self) -> &Self::Target {
 20 |             &self.0
 21 |         }
 22 |     }
 23 |     unsafe impl Sync for TestBackend { }
 24 | 
 25 |     lazy_static! {
 26 |         static ref BACKEND: TestBackend = TestBackend(Backend::new::<Native<_>>().unwrap());
 27 |     }
 28 | 
 29 |     #[test]
 30 |     fn it_computes_correct_asum_on_native_for_f32() {
 31 |         let ref x = array![1., -2., 3.].into();
 32 |         let ref mut result = SharedTensor::scalar(0.0);
 33 |         BACKEND.asum(x, result).unwrap();
 34 |         assert_eq!(&[6.], result.as_slice().unwrap());
 35 |     }
 36 | 
 37 |     #[test]
 38 |     fn it_computes_correct_axpy_on_native_for_f32() {
 39 |         let ref a = SharedTensor::scalar(2.0);
 40 |         let ref x = array![1., 2., 3.].into();
 41 |         let ref mut y = array![1., 2., 3.].into();
 42 |         BACKEND.axpy(a, x, y).unwrap();
 43 |         assert_eq!(&[3., 6., 9.], y.as_slice().unwrap());
 44 |     }
 45 | 
 46 |     #[test]
 47 |     fn it_computes_correct_copy_on_native_for_f32() {
 48 |         let ref mut x = array![1., 2., 3.].into();
 49 |         let ref mut y = SharedTensor::from([3]);
 50 |         BACKEND.copy(x, y).unwrap();
 51 |         assert_eq!(&[1., 2., 3.], y.as_slice().unwrap());
 52 |     }
 53 | 
 54 |     #[test]
 55 |     fn it_computes_correct_dot_on_native_for_f32() {
 56 |         let ref x = array![1., 2., 3.].into();
 57 |         let ref y = array![1., 2., 3.].into();
 58 |         let ref mut result = SharedTensor::from([]);
 59 |         BACKEND.dot(x, y, result).unwrap();
 60 |         assert_eq!(&[14.], result.as_slice().unwrap());
 61 |     }
 62 | 
 63 |     #[test]
 64 |     fn it_computes_correct_nrm2_on_native_for_f32() {
 65 |         let ref x = array![1., 2., 2.].into();
 66 |         let ref mut result = SharedTensor::from([]);
 67 |         BACKEND.nrm2(x, result).unwrap();
 68 |         assert_eq!(&[3.], result.as_slice().unwrap());
 69 |     }
 70 | 
 71 |     #[test]
 72 |     fn it_computes_correct_scal_on_native_for_f32() {
 73 |         let ref a = array![2.].into();
 74 |         let ref mut x = array![1., 2., 3.].into();
 75 |         BACKEND.scal(a, x).unwrap();
 76 |         assert_eq!(&[2., 4., 6.], x.as_slice().unwrap());
 77 |     }
 78 | 
 79 |     #[test]
 80 |     fn it_computes_correct_swap_on_native_for_f32() {
 81 |         let ref mut x = array![1., 2., 3.].into();
 82 |         let ref mut y = array![3., 2., 1.].into();
 83 |         BACKEND.swap(x, y).unwrap();
 84 |         assert_eq!(&[3., 2., 1.], x.as_slice().unwrap());
 85 |         assert_eq!(&[1., 2., 3.], y.as_slice().unwrap());
 86 |     }
 87 | 
 88 |     #[test]
 89 |     fn it_computes_correct_gemm_on_native_for_f32() {
 90 | 
 91 |         let ref alpha = array![1.0].into();
 92 |         let ref amat = 
 93 |             array![
 94 |                 [2.0, 5.0], 
 95 |                 [2.0, 5.0], 
 96 |                 [2.0, 5.0]
 97 |             ].into();
 98 | 
 99 |         let ref beta = array![0.0].into();
100 |         let ref bmat =
101 |             array![
102 |                 [4.0, 1.0, 1.0],
103 |                 [4.0, 1.0, 1.0]
104 |             ].into();
105 | 
106 |         let ref mut cmat = SharedTensor::from([3, 3]);
107 |         let transposition = Transposition::NoTranspose;
108 | 
109 |         BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap();
110 | 
111 |         assert_eq!(&[28., 7., 7., 28., 7., 7., 28., 7., 7.], cmat.as_slice().unwrap());
112 |     }
113 | 
114 |     #[test]
115 |     fn it_computes_correct_transpose_gemm_on_native_for_f32() {
116 |         
117 |         let ref alpha = array![1.0].into();
118 |         let ref amat =
119 |             array![
120 |                 [2.0, 5.0], 
121 |                 [2.0, 5.0], 
122 |                 [2.0, 5.0]
123 |             ].into();
124 | 
125 |         let ref beta = array![0.0].into();
126 |         let ref bmat =
127 |             array![
128 |                 [4.0, 1.0, 1.0],
129 |                 [4.0, 1.0, 1.0]
130 |             ].into();
131 | 
132 |         let ref mut cmat = SharedTensor::from([2, 2]);
133 |         let transposition = Transposition::Transpose;
134 | 
135 |         BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap();
136 | 
137 |         assert_eq!(&[12., 12., 30., 30.], cmat.as_slice().unwrap());
138 |     }
139 | }
140 | 
141 | #[cfg(test)]
142 | mod blas_specification_opencl {
143 |     use parenchyma::frameworks::OpenCL;
144 |     use parenchyma::hardware::{Hardware, HardwareKind};
145 |     use parenchyma::prelude::*;
146 |     use parenchyma_blas::*;
147 | 
148 |     struct TestBackend(Backend<Package>);
149 | 
150 |     impl ::std::ops::Deref for TestBackend {
151 |         type Target = Backend<Package>;
152 |         fn deref(&self) -> &Self::Target {
153 |             &self.0
154 |         }
155 |     }
156 |     unsafe impl Sync for TestBackend { }
157 | 
158 |     lazy_static! {
159 |         static ref BACKEND: TestBackend = {
160 |             let mut backend: Backend<Package> = Backend::new::<OpenCL<_>>().unwrap();
161 |             // required here!
162 |             backend.select(&|hardware| hardware.kind == HardwareKind::GPU);
163 |             TestBackend(backend)
164 |         };
165 |     }
166 | 
167 |     #[test]
168 |     fn it_computes_correct_axpy_on_opencl_for_f32() {
169 |         let ref a = SharedTensor::scalar(2.0);
170 |         let ref x = array![1., 2., 3.].into();
171 |         let ref mut y = array![1., 2., 3.].into();
172 |         BACKEND.axpy(a, x, y).unwrap();
173 |         assert_eq!(&[3., 6., 9.], y.as_slice().unwrap());
174 |     }
175 | 
176 |     #[test]
177 |     fn it_computes_correct_copy_on_opencl_for_f32() {
178 |         let ref mut x = array![1., 2., 3.].into();
179 |         let ref mut y = SharedTensor::from([3]);
180 |         BACKEND.copy(x, y).unwrap();
181 |         assert_eq!(&[1., 2., 3.], y.as_slice().unwrap());
182 |     }
183 | 
184 |     #[test]
185 |     fn it_computes_correct_gemm_on_opencl_for_f32() {
186 | 
187 |         let ref alpha = array![1.0].into();
188 |         let ref amat = 
189 |             array![
190 |                 [2.0, 5.0], 
191 |                 [2.0, 5.0], 
192 |                 [2.0, 5.0]
193 |             ].into();
194 | 
195 |         let ref beta = array![0.0].into();
196 |         let ref bmat =
197 |             array![
198 |                 [4.0, 1.0, 1.0],
199 |                 [4.0, 1.0, 1.0]
200 |             ].into();
201 | 
202 |         let ref mut cmat = SharedTensor::from([3, 3]);
203 |         let transposition = Transposition::NoTranspose;
204 | 
205 |         BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap();
206 | 
207 |         assert_eq!(&[28., 7., 7., 28., 7., 7., 28., 7., 7.], cmat.as_slice().unwrap());
208 |     }
209 | 
210 |     #[test]
211 |     fn it_computes_correct_transpose_gemm_on_opencl_for_f32() {
212 |         
213 |         let ref alpha = array![1.0].into();
214 |         let ref amat =
215 |             array![
216 |                 [2.0, 5.0], 
217 |                 [2.0, 5.0], 
218 |                 [2.0, 5.0]
219 |             ].into();
220 | 
221 |         let ref beta = array![0.0].into();
222 |         let ref bmat =
223 |             array![
224 |                 [4.0, 1.0, 1.0],
225 |                 [4.0, 1.0, 1.0]
226 |             ].into();
227 | 
228 |         let ref mut cmat = SharedTensor::from([2, 2]);
229 |         let transposition = Transposition::Transpose;
230 | 
231 |         BACKEND.gemm(alpha, transposition, amat, transposition, bmat, beta, cmat).unwrap();
232 | 
233 |         assert_eq!(&[12., 12., 30., 30.], cmat.as_slice().unwrap());
234 |     }
235 | 
236 |     #[test]
237 |     fn it_computes_correct_scal_on_opencl_for_f32() {
238 |         let ref a = array![2.].into();
239 |         let ref mut x = array![1., 2., 3.].into();
240 |         BACKEND.scal(a, x).unwrap();
241 |         assert_eq!(&[2., 4., 6.], x.as_slice().unwrap());
242 |     }
243 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | 


--------------------------------------------------------------------------------
/crates/parenchyma-deep/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "parenchyma-deep"
 3 | version = "0.1.0"
 4 | authors = ["Jony <jonysy@users.noreply.github.com>"]
 5 | license = "MIT/Apache-2.0"
 6 | 
 7 | [dependencies]
 8 | ocl = "0.16.0"
 9 | 
10 | [dependencies.parenchyma]
11 | path = "../../"
12 | version = "0.0.4"
13 | 
14 | [dev-dependencies]
15 | lazy_static = "1.1.0"


--------------------------------------------------------------------------------
/crates/parenchyma-deep/README.md:
--------------------------------------------------------------------------------
 1 | # parenchyma-deep
 2 | 
 3 | This package provides full NN support for Parenchyma, so you can use NN on servers, desktops or 
 4 | mobiles, GPUs, FPGAs or CPUS, without worrying about OpenCL or CUDA support on the machine.
 5 | 
 6 | ## Provided Operations
 7 | 
 8 | This package provides the following operations to Parenchyma backends:
 9 | 
10 | |                       | CUDA (cuDNN)  | OpenCL    | Native (rust) |
11 | |---                    |---            |---        |---            |
12 | | Sigmoid               | (collenchyma) | -         | ✓             |
13 | | Sigmoid (pointwise)   | (collenchyma) | -         |               |
14 | | ReLU                  | (collenchyma) | -         | ✓             |
15 | | ReLU (pointwise)      | (collenchyma) | -         |               |
16 | | Tanh                  | (collenchyma) | -         | ✓             |
17 | | Tanh (pointwise)      | (collenchyma) | -         |               |
18 | |                       |               |           |               |
19 | | Normalization (LRN)   | (collenchyma) | -         | -             |
20 | |                       |               |           |               |
21 | | Convolution           | (collenchyma) | -         | -             |
22 | |                       |               |           |               |
23 | | Softmax               | (collenchyma) | -         | ✓             |
24 | | Log Softmax           | (collenchyma) | -         | ✓             |
25 | |                       |               |           |               |
26 | | Pooling Max           | (collenchyma) | -         | -             |
27 | | Pooling Avg           | (collenchyma) | -         | -             |


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/extension_package/backward.rs:
--------------------------------------------------------------------------------
  1 | use parenchyma::error::Result;
  2 | use parenchyma::prelude::SharedTensor;
  3 | use super::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration};
  4 | 
  5 | pub trait Backward {
  6 |     /// Computes the gradient of a [CNN convolution] over the input tensor `x` with respect 
  7 |     /// to the data.
  8 |     ///
  9 |     /// Saves the result to `result_diff`.
 10 |     ///
 11 |     /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network
 12 |     fn convolution_grad_data(
 13 |         self: &Self,
 14 |         filter: &SharedTensor, 
 15 |         x_diff: &SharedTensor,
 16 |         result_diff: &mut SharedTensor, 
 17 |         workspace: &mut SharedTensor<u8>, 
 18 |         configuration: &ConvolutionConfiguration) -> Result {
 19 |         unimplemented!()
 20 |     }
 21 |     /// Computes the gradient of a [CNN convolution][convolution] with respect to the filter.
 22 |     ///
 23 |     /// Saves the result to `filter_diff`.
 24 |     ///
 25 |     /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network
 26 |     fn convolution_grad_filter(
 27 |         self: &Self, 
 28 |         src_data: &SharedTensor, 
 29 |         dest_diff: &SharedTensor, 
 30 |         filter_diff: &mut SharedTensor, 
 31 |         workspace: &mut SharedTensor<u8>, 
 32 |         configuration: &ConvolutionConfiguration) -> Result {
 33 |         unimplemented!()
 34 |     }
 35 |     /// Computes the gradient of a logarithmic softmax over the input tensor `x`.
 36 |     ///
 37 |     /// Saves the result to `result_diff`.
 38 |     fn log_softmax_grad(
 39 |         self: &Self, 
 40 |         x: &SharedTensor, 
 41 |         x_diff: &SharedTensor, 
 42 |         result_diff: &mut SharedTensor) -> Result {
 43 |         unimplemented!()
 44 |     }
 45 |     /// Computes the gradient of a [LRN][lrn] over the input Tensor `x` with complete memory management.
 46 |     /// [lrn]: https://en.wikipedia.org/wiki/lrnal_neural_network
 47 |     ///
 48 |     /// Saves the result to `result_diff`.
 49 |     ///
 50 |     /// For a no-memory managed version see `lrn_grad_plain`.
 51 |     fn lrn_grad(
 52 |         self: &Self,
 53 |         x: &SharedTensor, 
 54 |         x_diff: &SharedTensor, 
 55 |         result: &SharedTensor, 
 56 |         result_diff: &mut SharedTensor, 
 57 |         configuration: &LrnConfiguration) -> Result {
 58 |         unimplemented!()
 59 |     }
 60 |     /// Computes the gradient of [max pooling] over the input Tensor `x`.
 61 |     ///
 62 |     /// Saves the result to `result_diff`.
 63 |     ///
 64 |     /// [max pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer
 65 |     fn pooling_max_grad(
 66 |         self: &Self, 
 67 |         x: &SharedTensor, 
 68 |         x_diff: &SharedTensor, 
 69 |         result: &SharedTensor, 
 70 |         result_diff: &mut SharedTensor, 
 71 |         configuration: &PoolingConfiguration) -> Result {
 72 |         unimplemented!()
 73 |     }
 74 |     /// Computes the gradient of [ReLU] over the input tensor `x`.
 75 |     ///
 76 |     /// Saves the result to `result_diff`.
 77 |     ///
 78 |     /// [ReLU]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 79 |     fn relu_grad(
 80 |         self: &Self, 
 81 |         x: &SharedTensor, 
 82 |         x_diff: &SharedTensor,
 83 |         result: &SharedTensor,
 84 |         result_diff: &mut SharedTensor) -> Result {
 85 |         unimplemented!()
 86 |     }
 87 |     /// Computes the gradient of [ReLU] over the input tensor `x`.
 88 |     ///
 89 |     /// Saves the result back to `x_diff`.
 90 |     ///
 91 |     /// [ReLU]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 92 |     fn relu_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result {
 93 |         unimplemented!()
 94 |     }
 95 |     /// Computes the gradient of a [sigmoid function] over the input tensor `x`.
 96 |     ///
 97 |     /// Saves the result to `result_diff`.
 98 |     ///
 99 |     /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function
100 |     fn sigmoid_grad(
101 |         self: &Self, 
102 |         x: &SharedTensor, 
103 |         x_diff: &SharedTensor,
104 |         result: &SharedTensor,
105 |         result_diff: &mut SharedTensor) -> Result {
106 |         unimplemented!()
107 |     }
108 |     /// Computes the gradient of a [sigmoid function] over the input tensor `x`.
109 |     ///
110 |     /// Saves the result back to `x_diff`.
111 |     ///
112 |     /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function
113 |     fn sigmoid_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result {
114 |         unimplemented!()
115 |     }
116 |     /// Computes the gradient of a [softmax] over the input tensor `x`.
117 |     ///
118 |     /// Saves the result to `result_diff`.
119 |     ///
120 |     /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function
121 |     fn softmax_grad(
122 |         self: &Self,
123 |         x: &SharedTensor, 
124 |         x_diff: &SharedTensor, 
125 |         result_diff: &mut SharedTensor) -> Result {
126 |         unimplemented!()
127 |     }
128 |     /// Computes the gradient of [tanh] over the input Tensor `x`.
129 |     ///
130 |     /// Saves the result to `result_diff`.
131 |     ///
132 |     /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function
133 |     fn tanh_grad(
134 |         self: &Self, 
135 |         x: &SharedTensor, 
136 |         x_diff: &SharedTensor, 
137 |         result: &SharedTensor,
138 |         result_diff: &mut SharedTensor) -> Result {
139 |         unimplemented!()
140 |     }
141 |     /// Computes the gradient of [tanh] over the input Tensor `x`.
142 |     ///
143 |     /// Saves the result back to `x_diff`.
144 |     ///
145 |     /// [tanh]: https://en.wikipedia.org/wiki/Hyperbolic_function
146 |     fn tanh_pointwise_grad(&self, x: &SharedTensor, x_diff: &mut SharedTensor) -> Result {
147 |         unimplemented!()
148 |     }
149 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/extension_package/configuration.rs:
--------------------------------------------------------------------------------
 1 | #[derive(Clone, Copy, Debug)]
 2 | pub struct ConvolutionConfiguration;
 3 | 
 4 | // impl ConvolutionConfiguration {
 5 | //     /// Creates a new convolution configuration, which needs to be passed to further 
 6 | //     /// convolution operations.
 7 | //     pub fn new<P>(
 8 | //         backend: &Backend<P>, 
 9 | //         src: &SharedTensor, 
10 | //         dest: &SharedTensor,
11 | //         filter: &mut SharedTensor,
12 | //         algo_forward: ConvForwardAlgo,
13 | //         algo_backward_filter: ConvBackwardDataAlgo,
14 | //         algo_backward_data: ConvBackwardDataAlgo,
15 | //         stride: &[i32],
16 | //         zero_padding: &[i32]) -> Result<ConvolutionConfiguration> {
17 | 
18 | //         unimplemented!()
19 | //     }
20 | // }
21 | 
22 | #[derive(Clone, Copy, Debug)]
23 | pub struct LrnConfiguration;
24 | 
25 | #[derive(Clone, Copy, Debug)]
26 | pub struct PoolingConfiguration;


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/extension_package/convolution.rs:
--------------------------------------------------------------------------------
 1 | /// Different algorithms to compute the gradient with respect to the filter.
 2 | #[derive(Clone, Copy, Debug)]
 3 | pub enum ConvBackwardDataAlgo {
 4 |     /// Attempt to automatically find the best algorithm of all the other available ones.
 5 |     Auto,
 6 |     /// Compute the convolution as matrix product without forming the matrix that holds the input data.
 7 |     ///
 8 |     /// Does not need any memory workspace.
 9 |     ///
10 |     /// The results are deterministic.
11 |     ImplicitGemm,
12 |     /// Compute the convolution as sum of matrix product without forming the matrix that holds the input data.
13 |     ///
14 |     /// Does not need any memory workspace.
15 |     ///
16 |     /// The results are non-deterministic.
17 |     ImplicitGemmSum,
18 |     /// Compute the convolution as Fast-Fourier Transform.
19 |     ///
20 |     /// Needs a significant memory workspace.
21 |     ///
22 |     /// The results are deterministic.
23 |     Fft,
24 |     /// Compute the convolution as Fast-Fourier Transform with 32x32 tiles.
25 |     ///
26 |     /// Needs a significant memory workspace.
27 |     ///
28 |     /// The results are deterministic.
29 |     FftTiling,
30 | }
31 | 
32 | /// Different algorithms to compute the gradient with respect to the filter.
33 | #[derive(Clone, Copy, Debug)]
34 | pub enum ConvBackwardFilterAlgo {
35 |     /// Attempt to automatically find the best algorithm of all the other available ones.
36 |     Auto,
37 |     /// Compute the convolution as matrix product without forming the matrix that holds the input data.
38 |     ///
39 |     /// Does not need any memory workspace.
40 |     ///
41 |     /// The results are deterministic.
42 |     ImplicitGemm,
43 |     /// Compute the convolution as sum of matrix product without forming the matrix that holds the input data.
44 |     ///
45 |     /// Does not need any memory workspace.
46 |     ///
47 |     /// The results are non-deterministic.
48 |     ImplicitGemmSum,
49 |     /// Similar to `ImplicitGEMMSum` but needs some workspace to precompile the implicit indices.
50 |     ///
51 |     /// The results are non-deterministic.
52 |     ImplicitPrecompiledGemmSum,
53 |     /// Compute the convolution as Fast-Fourier Transform.
54 |     ///
55 |     /// Needs a significant memory workspace.
56 |     ///
57 |     /// The results are deterministic.
58 |     Fft,
59 | }
60 | 
61 | /// Different algorithms to compute the convolution forward algorithm.
62 | #[derive(Clone, Copy, Debug)]
63 | pub enum ConvForwardAlgo {
64 |     /// Attempt to automatically find the best algorithm of all the other available ones.
65 |     Auto,
66 |     /// Compute the convolution as explicit matrix product.
67 |     ///
68 |     /// Needs a significant memory workspace.
69 |     Gemm,
70 |     /// Compute the convolution as matrix product without forming the matrix that holds the input data.
71 |     ///
72 |     /// Does not need any memory workspace.
73 |     ImplicitGemm,
74 |     /// Similar to `ImplicitGEMM` but needs some workspace to precompile the implicit indices.
75 |     ImplicitPrecompiledGemm,
76 |     /// Compute the convolution as Fast-Fourier Transform.
77 |     ///
78 |     /// Needs a significant memory workspace.
79 |     Fft,
80 |     /// Compute the convolution as Fast-Fourier Transform with 32x32 tiles.
81 |     ///
82 |     /// Needs a significant memory workspace.
83 |     FftTiling,
84 |     /// Compute the convolution without implicit or explicit matrix-multiplication. **Do not try to use this**.
85 |     ///
86 |     /// Listed in cuDNN docs but cuDNN does not provide a implementation.
87 |     Direct,
88 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/extension_package/forward.rs:
--------------------------------------------------------------------------------
  1 | use parenchyma::error::Result;
  2 | use parenchyma::prelude::SharedTensor;
  3 | use super::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration};
  4 | 
  5 | pub trait Forward {
  6 |     /// Computes a [CNN convolution] over the input tensor `x`, and then saves the `result`.
  7 |     ///
  8 |     /// [CNN convolution]: https://en.wikipedia.org/wiki/Convolutional_neural_network
  9 |     fn convolution(
 10 |         self: &Self, 
 11 |         filter: &SharedTensor, 
 12 |         x: &SharedTensor, 
 13 |         result: &mut SharedTensor,
 14 |         workspace: &mut SharedTensor<u8>,
 15 |         configuration: &ConvolutionConfiguration) -> Result {
 16 |         unimplemented!()
 17 |     }
 18 |     /// Computes the exponential linear unit [new] over tensor `x`.
 19 |     ///
 20 |     /// Saves the `result`.
 21 |     fn elu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 22 |         unimplemented!()
 23 |     }
 24 |     /// Computes a logarithmic softmax over the input tensor `x`, and then saves the `result`.
 25 |     fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 26 |         unimplemented!()
 27 |     }
 28 |     /// Computes a [local response normalization] over the input tensor `x`.
 29 |     ///
 30 |     /// Saves the result to `result`.
 31 |     ///
 32 |     /// [local response normalization]: https://en.wikipedia.org/wiki/lrnal_neural_network
 33 |     fn lrn(
 34 |         self: &Self, 
 35 |         x: &SharedTensor, 
 36 |         result: &mut SharedTensor, 
 37 |         configuration: &LrnConfiguration) -> Result {
 38 |         unimplemented!()
 39 |     }
 40 |     /// Computes non-linear down-sampling ([max pooling]) over the input tensor `x`.
 41 |     ///
 42 |     /// Saves the result to `result`.
 43 |     ///
 44 |     /// [max pooling]: https://en.wikipedia.org/wiki/Convolutional_neural_network#Pooling_layer
 45 |     fn pooling_max(
 46 |         self: &Self, 
 47 |         x: &SharedTensor, 
 48 |         result: &mut SharedTensor, 
 49 |         configuration: &PoolingConfiguration) -> Result {
 50 |         unimplemented!()
 51 |     }
 52 |     /// Computes the [rectified linear units] over tensor `x`.
 53 |     ///
 54 |     /// Saves the `result`.
 55 |     ///
 56 |     /// [rectified linear units]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 57 |     fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 58 |         unimplemented!()
 59 |     }
 60 |     /// Computes the [rectified linear units] over the input Tensor `x`.
 61 |     ///
 62 |     /// note: pointwise operations overwrite the input with the result of the operation.
 63 |     ///
 64 |     /// [rectified linear units]: https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
 65 |     fn relu_pointwise(&self, x: &mut SharedTensor) -> Result {
 66 |         unimplemented!()
 67 |     }
 68 |     /// Computes the [sigmoid function] over tensor `x`.
 69 |     ///
 70 |     /// Saves the `result`.
 71 |     ///
 72 |     /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function
 73 |     fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 74 |         unimplemented!()
 75 |     }
 76 |     /// Computes the [sigmoid function][sigmoid] over the input tensor `x`.
 77 |     ///
 78 |     /// note: pointwise operations overwrite the input with the result of the operation.
 79 |     ///
 80 |     /// [sigmoid function]: https://en.wikipedia.org/wiki/Sigmoid_function
 81 |     fn sigmoid_pointwise(&self, x: &mut SharedTensor) -> Result {
 82 |         unimplemented!()
 83 |     }
 84 |     /// Computes a [softmax] over the input tensor `x`.
 85 |     ///
 86 |     /// Saves the result to `result`.
 87 |     ///
 88 |     /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function
 89 |     fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 90 |         unimplemented!()
 91 |     }
 92 |     /// Computes the [hyperbolic tangent] over tensor `x`.
 93 |     ///
 94 |     /// Saves the `result`.
 95 |     ///
 96 |     /// [hyperbolic tangent]: https://en.wikipedia.org/wiki/Hyperbolic_function
 97 |     fn tanh(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 98 |         unimplemented!()
 99 |     }
100 |     /// Computes the [hyperbolic tangent][tanh] over the input Tensor `x`.
101 |     ///
102 |     /// note: pointwise operations overwrite the input with the result of the operation.
103 |     ///
104 |     /// [hyperbolic tangent]: https://en.wikipedia.org/wiki/Hyperbolic_function
105 |     fn tanh_pointwise(&self, x: &mut SharedTensor) -> Result {
106 |         unimplemented!()
107 |     }
108 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/extension_package/mod.rs:
--------------------------------------------------------------------------------
 1 | pub use self::backward::Backward;
 2 | pub use self::configuration::{ConvolutionConfiguration, LrnConfiguration, PoolingConfiguration};
 3 | pub use self::convolution::{ConvBackwardDataAlgo, ConvBackwardFilterAlgo, ConvForwardAlgo};
 4 | pub use self::forward::Forward;
 5 | 
 6 | mod backward;
 7 | mod configuration;
 8 | mod convolution;
 9 | mod forward;
10 | 
11 | use parenchyma::extension_package::ExtensionPackage;
12 | 
13 | /// The BLAS package.
14 | pub enum Package {
15 |     OpenCL(::frameworks::open_cl::OpenCLPackage),
16 | }
17 | 
18 | impl Package {
19 |     pub fn open_cl(&self) -> &::frameworks::open_cl::OpenCLPackage {
20 |         match self {
21 |             &Package::OpenCL(ref package) => package
22 |         }
23 |     }
24 | }
25 | 
26 | /// Provides the functionality for a backend to support DNN related operations.
27 | pub trait Extension: Backward + Forward {
28 |     // ..
29 | }
30 | 
31 | impl ExtensionPackage for Package {
32 |     type Extension = Extension;
33 | 
34 |     fn package_name(&self) -> &'static str {
35 |         return "parenchyma/deep";
36 |     }
37 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod native;
2 | pub mod open_cl;


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/native/mod.rs:
--------------------------------------------------------------------------------
  1 | use parenchyma::error::Result;
  2 | use parenchyma::extension_package::Dependency;
  3 | use parenchyma::frameworks::NativeContext as Context;
  4 | use parenchyma::tensor::SharedTensor;
  5 | use super::super::{Extension, Package};
  6 | use super::super::extension_package::{Backward, Forward};
  7 | 
  8 | impl<P> Backward for Context<P> where 
  9 |     P: Dependency<Package> {
 10 |     fn log_softmax_grad(
 11 |         self: &Self, 
 12 |         x: &SharedTensor, 
 13 |         x_diff: &SharedTensor, 
 14 |         result_diff: &mut SharedTensor) -> Result {
 15 |         let x_slice = x.as_slice().unwrap();
 16 |         let x_diff_slice = x_diff.as_slice().unwrap();
 17 |         let mut sum = 0.0;
 18 |         for &grad_val in x_diff_slice.iter() {
 19 |             sum += grad_val;
 20 |         }
 21 |         let res = x_slice.iter().zip(x_diff_slice.iter())
 22 |             .map(|(x_val, x_diff_val)| {
 23 |                 x_diff_val - x_val.exp() * sum
 24 |             });
 25 |         result_diff.write_iter(res)?;
 26 |         Ok(())
 27 |     }
 28 | 
 29 |     fn relu_grad(
 30 |         self: &Self, 
 31 |         x: &SharedTensor, 
 32 |         x_diff: &SharedTensor,
 33 |         result: &SharedTensor,
 34 |         result_diff: &mut SharedTensor) -> Result {
 35 |         let res = x.as_slice().unwrap().iter()
 36 |             .zip(x_diff.as_slice().unwrap().iter())
 37 |             .map(|(x, dx)| if *x > 0.0 { *dx } else { 0.0 });
 38 |         result_diff.write_iter(res)?;
 39 |         Ok(())
 40 |     }
 41 | 
 42 |     fn sigmoid_grad(
 43 |         self: &Self, 
 44 |         x: &SharedTensor, 
 45 |         x_diff: &SharedTensor,
 46 |         result: &SharedTensor,
 47 |         result_diff: &mut SharedTensor) -> Result {
 48 |         let res = x.as_slice().unwrap().iter().zip(x_diff.as_slice().unwrap().iter())
 49 |             .map(|(t, dt)| *t * (1.0 -*t) * *dt);
 50 |         result_diff.write_iter(res)?;
 51 |         Ok(())
 52 |     }
 53 | 
 54 |     fn softmax_grad(
 55 |         self: &Self,
 56 |         x: &SharedTensor, 
 57 |         x_diff: &SharedTensor, 
 58 |         result_diff: &mut SharedTensor) -> Result {
 59 |         let mut dot = 0.0;
 60 |         let sig_data_slice = x.as_slice().unwrap();
 61 |         let sig_dx_slice = x_diff.as_slice().unwrap();
 62 |         for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) {
 63 |             dot += t * dt;
 64 |         }
 65 |         let res = sig_data_slice.iter().zip(sig_dx_slice.iter()).map(|(t, dt)| t * (dt - dot));
 66 |         result_diff.write_iter(res)?;
 67 |         Ok(())
 68 |     }
 69 | 
 70 |     fn tanh_grad(
 71 |         self: &Self, 
 72 |         x: &SharedTensor, 
 73 |         x_diff: &SharedTensor, 
 74 |         result: &SharedTensor,
 75 |         result_diff: &mut SharedTensor) -> Result {
 76 |         let res = x.as_slice().unwrap().iter()
 77 |             .zip(x_diff.as_slice().unwrap().iter())
 78 |             .map(|(x, dx)| (1.0 - x.powi(2)) * *dx);
 79 |         result_diff.write_iter(res)?;
 80 |         Ok(())
 81 |     }
 82 | }
 83 | 
 84 | impl<P> Forward for Context<P> where 
 85 |     P: Dependency<Package> {
 86 |     fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
 87 |         let mut max_input = ::std::f32::NEG_INFINITY;
 88 |         for &input in x.as_slice().unwrap() {
 89 |             max_input = max_input.max(input);
 90 |         }
 91 |         let mut logsum = 0.;
 92 |         for exp in x.as_slice().unwrap().iter().map(|t| (-(max_input - t)).exp()) {
 93 |             logsum += exp;
 94 |         }
 95 |         logsum = max_input + logsum.ln();
 96 |         let res = x.as_slice().unwrap().iter().map(|t| t - logsum);
 97 |         result.write_iter(res)?;
 98 |         Ok(())
 99 |     }
100 | 
101 |     fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
102 |         let res = x.as_slice().unwrap().iter().map(|elem| elem.max(0.0));
103 |         result.write_iter(res)?;
104 |         Ok(())
105 |     }
106 | 
107 |     fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
108 |         let res = x.as_slice().unwrap().iter().map(|x| 1.0 / (1.0 + (-*x).exp()));
109 |         result.write_iter(res)?;
110 |         Ok(())
111 |     }
112 | 
113 |     fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
114 |         let mut exps = Vec::with_capacity(x.shape().capacity());
115 |         let mut sum = 0.0;
116 |         for exp in x.as_slice().unwrap().iter().map(|t| t.exp()) {
117 |             exps.push(exp);
118 |             sum += exp;
119 |         }
120 |         let res = exps.iter().map(|t| t / sum);
121 |         result.write_iter(res)?;
122 |         Ok(())
123 |     }
124 | 
125 |     fn tanh(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
126 |         let res = x.as_slice().unwrap().iter().map(|elem| elem.tanh());
127 |         result.write_iter(res)?;
128 |         Ok(())
129 |     }
130 | }
131 | 
132 | impl<P> Extension for Context<P> where 
133 |     P: Dependency<Package> {
134 |     // ..
135 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/_build.rs:
--------------------------------------------------------------------------------
 1 | use package::ParenchymaDeep;
 2 | use parenchyma::{Build, Result};
 3 | use parenchyma::opencl::OpenCLContext;
 4 | use parenchyma::utility::Uninitialized;
 5 | use super::Package;
 6 | 
 7 | impl Build<OpenCLContext<Uninitialized>> for ParenchymaDeep {
 8 | 
 9 |     fn build(cx: &mut OpenCLContext<Uninitialized>) -> Result<ParenchymaDeep> {
10 | 
11 |         let program = cx.create_program(&[
12 |             include_str!("source/activation.cl"),
13 |             include_str!("source/activationBackward.cl"),
14 |             include_str!("source/convolution.cl")
15 |         ])?;
16 | 
17 |         let cl_package = Package {
18 |             tanh: program.create_kernel("tanh_float")?,
19 |             sigmoid: program.create_kernel("sigmoid_float")?,
20 |             relu: program.create_kernel("relu_float")?,
21 |             elu: program.create_kernel("elu_float")?,
22 | 
23 |             tanh_backward: program.create_kernel("tanh_backward_float")?,
24 |             sigmoid_backward: program.create_kernel("sigmoid_backward_float")?,
25 |             relu_backward: program.create_kernel("relu_backward_float")?,
26 |             elu_backward: program.create_kernel("elu_backward_float")?,
27 | 
28 |             convolution: program.create_kernel("convolve_ints")?,
29 | 
30 |             program,
31 |         };
32 | 
33 |         Ok(ParenchymaDeep { cl: cl_package })
34 |     }
35 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/_mod.rs:
--------------------------------------------------------------------------------
  1 | mod build;
  2 | 
  3 | use extension::{ActivationMode, Backward, Deep, Forward};
  4 | use package::ParenchymaDeep;
  5 | 
  6 | use parenchyma::{Result, SharedTensor};
  7 | use parenchyma::opencl::OpenCLContext;
  8 | use parenchyma::opencl::high;
  9 | use parenchyma::utility::Uninitialized;
 10 | 
 11 | #[derive(Debug)]
 12 | pub struct Package {
 13 |     program: high::Program,
 14 | 
 15 |     // === activation
 16 | 
 17 |     tanh: high::Kernel,
 18 |     sigmoid: high::Kernel,
 19 |     relu: high::Kernel,
 20 |     elu: high::Kernel,
 21 | 
 22 |     // === activation backward
 23 | 
 24 |     tanh_backward: high::Kernel,
 25 |     sigmoid_backward: high::Kernel,
 26 |     relu_backward: high::Kernel,
 27 |     elu_backward: high::Kernel,
 28 | 
 29 |     // == conv
 30 |     convolution: high::Kernel,
 31 | }
 32 | 
 33 | impl Deep for OpenCLContext<ParenchymaDeep> { }
 34 | 
 35 | impl Forward for OpenCLContext<ParenchymaDeep> {
 36 | 
 37 |     fn activation(
 38 |         &self, 
 39 |         mode: ActivationMode, 
 40 |         input: &SharedTensor, 
 41 |         output: &mut SharedTensor) -> Result {
 42 | 
 43 |         use extension::ActivationMode::*;
 44 | 
 45 |         let kernel = match mode {
 46 |             Tanh => unsafe { &self.package().cl.tanh },
 47 |             Sigmoid => unsafe { &self.package().cl.sigmoid },
 48 |             ReLu => unsafe { &self.package().cl.relu },
 49 |             Elu => unsafe { &self.package().cl.elu },
 50 |         };
 51 | 
 52 |         let length = input.shape.capacity();
 53 | 
 54 |         kernel.set_arg(0, input.read(self)?)?;
 55 |         kernel.set_arg(1, output.write(self)?)?;
 56 |         kernel.set_arg(2, &length)?;
 57 | 
 58 |         let global_work = &[length];
 59 |         let local_work = &[];
 60 | 
 61 |         // TODO event_wait_list
 62 |         let events = &[];
 63 | 
 64 |         // TODO
 65 |         let event = self.device().queue()
 66 |             .enqueue_nd_range_kernel(kernel, global_work, local_work, events)?;
 67 | 
 68 |         Ok(())
 69 |     }
 70 | }
 71 | 
 72 | impl Backward for OpenCLContext<ParenchymaDeep> {
 73 | 
 74 |     fn activation_backward(
 75 |         &self, 
 76 |         mode: ActivationMode, 
 77 |         input: &SharedTensor, 
 78 |         input_diff: &SharedTensor, 
 79 |         output_diff: &mut SharedTensor) -> Result {
 80 | 
 81 |         use extension::ActivationMode::*;
 82 | 
 83 |         let kernel = match mode {
 84 |             Tanh => unsafe { &self.package().cl.tanh_backward },
 85 |             Sigmoid => unsafe { &self.package().cl.sigmoid_backward },
 86 |             ReLu => unsafe { &self.package().cl.relu_backward },
 87 |             Elu => unsafe { &self.package().cl.elu_backward },
 88 |         };
 89 | 
 90 |         let length = input.shape.capacity();
 91 | 
 92 |         kernel.set_arg(0, input.read(self)?)?;
 93 |         kernel.set_arg(1, input_diff.read(self)?)?;
 94 |         kernel.set_arg(2, output_diff.write(self)?)?;
 95 |         kernel.set_arg(3, &length)?;
 96 | 
 97 |         let global_work = &[length];
 98 |         let local_work = &[];
 99 | 
100 |         // TODO event_wait_list
101 |         let events = &[];
102 | 
103 | 
104 |         // TODO
105 |         let event = self.device().queue()
106 |             .enqueue_nd_range_kernel(kernel, global_work, local_work, events)?;
107 | 
108 |         Ok(())
109 |     }
110 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/mod.rs:
--------------------------------------------------------------------------------
  1 | pub use self::package::OpenCLPackage;
  2 | 
  3 | mod package;
  4 | 
  5 | use super::super::{Extension, Package};
  6 | use super::super::extension_package::{Backward, Forward};
  7 | 
  8 | use ocl;
  9 | use parenchyma::error::Result;
 10 | use parenchyma::extension_package::{Dependency, ExtensionPackageCtor};
 11 | use parenchyma::frameworks::{OpenCLContext as Context, OpenCLMemory as Memory};
 12 | use parenchyma::tensor::{self, SharedTensor};
 13 | 
 14 | impl ExtensionPackageCtor<Context<()>> for super::super::Package {
 15 |     fn package(target: &mut Context<()>) -> Result<Self> {
 16 |         OpenCLPackage::compile(target).map(Package::OpenCL)
 17 |     }
 18 | }
 19 | 
 20 | impl<P> Backward for Context<P> where 
 21 |     P: Dependency<Package> {
 22 |     fn log_softmax_grad(
 23 |         &self, 
 24 |         x: &SharedTensor, 
 25 |         x_diff: &SharedTensor, 
 26 |         result: &mut SharedTensor) -> Result {
 27 | 
 28 |         let n = x.shape().capacity;
 29 |         let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
 30 |         let x_diff: &Memory<_> = tensor::reference(x_diff, /*on:*/ self.device())?;
 31 |         let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
 32 | 
 33 |         unsafe {
 34 |             ocl::Kernel::new("log_softmax_backward_float", &self.extension_package().dependency().open_cl().program)?
 35 |                 .arg_buf(x)
 36 |                 .arg_buf(x_diff)
 37 |                 .arg_buf(result)
 38 |                 .arg_scl(n as i32)
 39 | 
 40 |                 .gws([1, 1, 1])
 41 |                 .lws([1, 1, 1])
 42 |                 .queue(self.device().queue().clone())
 43 |                 .enq()?;
 44 |         }
 45 | 
 46 |         Ok(())
 47 |     }
 48 | 
 49 |     // fn relu_grad(
 50 |     //     self: &Self, 
 51 |     //     x: &SharedTensor, 
 52 |     //     x_diff: &SharedTensor,
 53 |     //     result: &SharedTensor,
 54 |     //     result_diff: &mut SharedTensor) -> Result {
 55 |     //     let res = x.as_slice().unwrap().iter()
 56 |     //         .zip(x_diff.as_slice().unwrap().iter())
 57 |     //         .map(|(x, dx)| if *x > 0.0 { *dx } else { 0.0 });
 58 |     //     result_diff.write_iter(res)?;
 59 |     //     Ok(())
 60 |     // }
 61 | 
 62 |     fn sigmoid_grad(
 63 |         self: &Self, 
 64 |         x: &SharedTensor, 
 65 |         x_diff: &SharedTensor,
 66 |         _: &SharedTensor,
 67 |         result_diff: &mut SharedTensor) -> Result {
 68 | 
 69 |         let n = x.shape().capacity;
 70 |         let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
 71 |         let x_diff: &Memory<_> = tensor::reference(x_diff, /*on:*/ self.device())?;
 72 |         // let result: &Memory<_> = tensor::reference(result, /*on:*/ self.device())?;
 73 |         let result_diff: &mut Memory<_> = tensor::mut_reference(result_diff, /*on:*/ self.device())?;
 74 | 
 75 |         unsafe {
 76 |             ocl::Kernel::new("sigmoid_backward_float", &self.extension_package().dependency().open_cl().program)?
 77 |                 .arg_buf(x)
 78 |                 .arg_buf(x_diff)
 79 |                 .arg_buf(result_diff)
 80 |                 .arg_scl(n as i32)
 81 | 
 82 |                 .gws([n])
 83 |                 .queue(self.device().queue().clone())
 84 |                 .enq()?;
 85 |         }
 86 | 
 87 |         Ok(())
 88 |     }
 89 | 
 90 |     // fn softmax_grad(
 91 |     //     self: &Self,
 92 |     //     x: &SharedTensor, 
 93 |     //     x_diff: &SharedTensor, 
 94 |     //     result_diff: &mut SharedTensor) -> Result {
 95 |     //     let mut dot = 0.0;
 96 |     //     let sig_data_slice = x.as_slice().unwrap();
 97 |     //     let sig_dx_slice = x_diff.as_slice().unwrap();
 98 |     //     for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) {
 99 |     //         dot += t * dt;
100 |     //     }
101 |     //     let res = sig_data_slice.iter().zip(sig_dx_slice.iter()).map(|(t, dt)| t * (dt - dot));
102 |     //     result_diff.write_iter(res)?;
103 |     //     Ok(())
104 |     // }
105 | 
106 |     // fn tanh_grad(
107 |     //     self: &Self, 
108 |     //     x: &SharedTensor, 
109 |     //     x_diff: &SharedTensor, 
110 |     //     result: &SharedTensor,
111 |     //     result_diff: &mut SharedTensor) -> Result {
112 |     //     let res = x.as_slice().unwrap().iter()
113 |     //         .zip(x_diff.as_slice().unwrap().iter())
114 |     //         .map(|(x, dx)| (1.0 - x.powi(2)) * *dx);
115 |     //     result_diff.write_iter(res)?;
116 |     //     Ok(())
117 |     // }
118 | }
119 | 
120 | impl<P> Forward for Context<P> where 
121 |     P: Dependency<Package> {
122 |     // fn elu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
123 |     //     let n = x.shape().capacity;
124 |     //     let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
125 |     //     let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
126 | 
127 |     //     unsafe {
128 |     //         ocl::Kernel::new("elu_float", &self.extension_package().dependency().open_cl().program)?
129 |     //             .arg_buf(x)
130 |     //             .arg_buf(result)
131 |     //             .arg_scl(n as i32)
132 | 
133 |     //             .gws([n])
134 |     //             .queue(self.device().queue().clone())
135 |     //             .enq()?;
136 |     //     }
137 | 
138 |     //     Ok(())
139 |     // }
140 | 
141 |     fn log_softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
142 |         let n = x.shape().capacity;
143 |         let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
144 |         let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
145 | 
146 |         unsafe {
147 |             ocl::Kernel::new("log_softmax_float", &self.extension_package().dependency().open_cl().program)?
148 |                 .arg_buf(x)
149 |                 .arg_buf(result)
150 |                 .arg_scl(n as i32)
151 | 
152 |                 .gws([1, 1, 1])
153 |                 .lws([1, 1, 1])
154 |                 .queue(self.device().queue().clone())
155 |                 .enq()?;
156 |         }
157 | 
158 |         Ok(())
159 |     }
160 | 
161 |     // fn relu(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
162 |     //     let n = x.shape().capacity;
163 |     //     let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
164 |     //     let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
165 | 
166 |     //     unsafe {
167 |     //         ocl::Kernel::new("relu_float", &self.extension_package().dependency().open_cl().program)?
168 |     //             .arg_buf(x)
169 |     //             .arg_buf(result)
170 |     //             .arg_scl(n as i32)
171 | 
172 |     //             .gws([n])
173 |     //             .queue(self.device().queue().clone())
174 |     //             .enq()?;
175 |     //     }
176 | 
177 |     //     Ok(())
178 |     // }
179 | 
180 |     fn sigmoid(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
181 |         let n = x.shape().capacity;
182 |         let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
183 |         let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
184 | 
185 |         unsafe {
186 |             ocl::Kernel::new("sigmoid_float", &self.extension_package().dependency().open_cl().program)?
187 |                 .arg_buf(x)
188 |                 .arg_buf(result)
189 |                 .arg_scl(n as i32)
190 | 
191 |                 .gws([n])
192 |                 .queue(self.device().queue().clone())
193 |                 .enq()?;
194 |         }
195 | 
196 |         Ok(())
197 |     }
198 | 
199 |     // fn softmax(&self, x: &SharedTensor, result: &mut SharedTensor) -> Result {
200 |     //     let n = x.shape().capacity;
201 |     //     let x: &Memory<_> = tensor::reference(x, /*on:*/ self.device())?;
202 |     //     let result: &mut Memory<_> = tensor::mut_reference(result, /*on:*/ self.device())?;
203 | 
204 |     //     unsafe {
205 |     //         ocl::Kernel::new("softmax_float", &self.extension_package().dependency().open_cl().program)?
206 |     //             .arg_buf(x)
207 |     //             .arg_buf(result)
208 |     //             .arg_scl(n as i32)
209 | 
210 |     //             .gws([1, 1, 1])
211 |     //             .lws([1, 1, 1])
212 |     //             .queue(self.device().queue().clone())
213 |     //             .enq()?;
214 |     //     }
215 | 
216 |     //     Ok(())
217 |     // }
218 | }
219 | 
220 | impl<P> Extension for Context<P> where 
221 |     P: Dependency<Package> {
222 |     // ..
223 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/package.rs:
--------------------------------------------------------------------------------
 1 | use ocl;
 2 | use std::ffi::CString;
 3 | use parenchyma::error::Result;
 4 | use parenchyma::frameworks::OpenCLContext;
 5 | 
 6 | /// Caches instances of `Kernel`
 7 | #[derive(Debug)]
 8 | pub struct OpenCLPackage {
 9 |     pub(in frameworks::open_cl) program: ocl::Program,
10 | }
11 | 
12 | impl OpenCLPackage {
13 |     pub fn compile(cx: &mut OpenCLContext<()>) -> Result<OpenCLPackage> {
14 |         let program = cx.program(vec![
15 |             CString::new(include_str!("source/activation.cl")).unwrap(),
16 |             CString::new(include_str!("source/activationBackward.cl")).unwrap(),
17 |             CString::new(include_str!("source/convolution.cl")).unwrap(),
18 |             CString::new(include_str!("source/softmax.cl")).unwrap()
19 |         ])?;
20 | 
21 |         // let cl_package = Package {
22 |         //     tanh: program.create_kernel("tanh_float")?,
23 |         //     sigmoid: program.create_kernel("sigmoid_float")?,
24 |         //     relu: program.create_kernel("relu_float")?,
25 |         //     elu: program.create_kernel("elu_float")?,
26 | 
27 |         //     tanh_backward: program.create_kernel("tanh_backward_float")?,
28 |         //     sigmoid_backward: program.create_kernel("sigmoid_backward_float")?,
29 |         //     relu_backward: program.create_kernel("relu_backward_float")?,
30 |         //     elu_backward: program.create_kernel("elu_backward_float")?,
31 | 
32 |         //     convolution: program.create_kernel("convolve_ints")?,
33 | 
34 |         //     program,
35 |         // };
36 | 
37 |         Ok(OpenCLPackage { program })
38 |     }
39 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/source/activation.cl:
--------------------------------------------------------------------------------
 1 | #define ACTIVATION_TYPE(function, type) \
 2 | kernel void function##_##type(global const type* in, global type* out, const uintptr_t len) \
 3 | { \
 4 |     const uintptr_t current = get_global_id(0); \
 5 |     if(current >= len) { \
 6 |         return void(); \
 7 |     } \
 8 |     out[current] = function(in[current]); \
 9 | } \
10 | 
11 | #define ACTIVATION(function) ACTIVATION_TYPE(function, float) ACTIVATION_TYPE(function, double) \
12 | 
13 | // =================================================================================================
14 | 
15 | ACTIVATION(tanh)
16 | 
17 | #define sigmoid(x) (1 / (1 + exp(-x)))
18 | ACTIVATION(sigmoid)
19 | 
20 | #define relu(x) (x > 0 ? x : 0)
21 | ACTIVATION(relu)
22 | 
23 | #define elu(x) (x > 0 ? x : exp(x) - 1)
24 | ACTIVATION(elu)


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/source/activationBackward.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // TODO newline required for some reason..
 3 | #define BACKWARD_WITH_TYPE(name, type, activationDeriv) \
 4 | kernel void name##_backward_##type(global const type* in, global const type* inDiff, global type* outDiff, const uintptr_t len) \
 5 | { \
 6 |     const uintptr_t current = get_global_id(0); \
 7 |     if(current >= len) { \
 8 |         return void(); \
 9 |     } \
10 |     outDiff[current] = activationDeriv(in[current]) * inDiff[current]; \
11 | } \
12 | 
13 | #define BACKWARD(name, deriv) \ 
14 | BACKWARD_WITH_TYPE(name, float, deriv) BACKWARD_WITH_TYPE(name, double, deriv) \
15 | 
16 | // =================================================================================================
17 | 
18 | #define tanhDeriv(x) (1 - x * x)
19 | BACKWARD(tanh, tanhDeriv)
20 | 
21 | #define sigmoidDeriv(x) (x * (1 - x))
22 | BACKWARD(sigmoid, sigmoidDeriv)
23 | 
24 | #define reluDeriv(x) (x > 0 ? 1 : 0)
25 | BACKWARD(relu, reluDeriv)
26 | 
27 | #define eluDeriv(x) (x > 0 ? 1 : x + 1)
28 | BACKWARD(elu, eluDeriv)


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/source/convolution.cl:
--------------------------------------------------------------------------------
  1 | 
  2 | // TODO newline required for some reason..
  3 | 
  4 | // Copyright Hugh Perkins 2014, 2015 hughperkins at gmail
  5 | //
  6 | // This Source Code Form is subject to the terms of the Mozilla Public License, 
  7 | // v. 2.0. If a copy of the MPL was not distributed with this file, You can 
  8 | // obtain one at http://mozilla.org/MPL/2.0/.
  9 | 
 10 | // expected defines:
 11 | // one of: [ TANH | RELU | LINEAR ]
 12 | // BIASED (or not)
 13 | 
 14 | void kernel convolve_ints(global const int *p_imageSize, global const int *p_filterSize,
 15 |       global const int *image, global const int *filter, global int *result) {
 16 |     int id = get_global_id(0);
 17 |     int imageSize = p_imageSize[0];
 18 |     int filterSize = p_filterSize[0];
 19 |     int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize);
 20 |     int localid = id % (imageSize * imageSize);
 21 |     int row = localid / imageSize;
 22 |     int col = localid % imageSize;
 23 |     int halfFilterSize = filterSize >> 1;
 24 |     int sum = 0;
 25 |     int minm = max(-halfFilterSize, -row);
 26 |     int maxm = min(halfFilterSize, imageSize - 1 - row);
 27 |     int minn = max(-halfFilterSize, -col);
 28 |     int maxn = min(halfFilterSize, imageSize - 1 - col);
 29 |     int m = minm;
 30 |     while(m <= maxm) {
 31 |         int x = (row + m);
 32 |         int ximage = imageOffset + x * imageSize;
 33 |         int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize;
 34 |         int n = minn;
 35 |         while(n <= maxn) {
 36 |             int y = col + n;
 37 |             sum += image[ ximage + y] * filter[ filterrowoffset + n ];
 38 |             n++;
 39 |         }
 40 |         m++;
 41 |     }
 42 |     result[id] = sum;
 43 | }
 44 | 
 45 | void kernel convolve_floats(global const int *p_imageSize, global const int *p_filterSize,
 46 |       global const float *image, global const float *filter, global float *result) {
 47 |     int id = get_global_id(0);
 48 |     int imageSize = p_imageSize[0];
 49 |     int filterSize = p_filterSize[0];
 50 |     int imageOffset = id / (imageSize * imageSize) * (imageSize * imageSize);
 51 |     int localid = id % (imageSize * imageSize);
 52 |     int row = localid / imageSize;
 53 |     int col = localid % imageSize;
 54 |     int halfFilterSize = filterSize >> 1;
 55 |     float sum = 0;
 56 |     int minm = max(-halfFilterSize, -row);
 57 |     int maxm = min(halfFilterSize, imageSize - 1 - row);
 58 |     int minn = max(-halfFilterSize, -col);
 59 |     int maxn = min(halfFilterSize, imageSize - 1 - col);
 60 |     int m = minm;
 61 |     while(m <= maxm) {
 62 |         int x = (row + m);
 63 |         int ximage = imageOffset + x * imageSize;
 64 |         int filterrowoffset = (m+halfFilterSize) * filterSize + halfFilterSize;
 65 |         int n = minn;
 66 |         while(n <= maxn) {
 67 |             int y = col + n;
 68 |             sum += image[ ximage + y] * filter[ filterrowoffset + n ];
 69 |             n++;
 70 |         }
 71 |         m++;
 72 |     }
 73 |     result[id] = sum;
 74 | }
 75 | 
 76 | void kernel convolve_imagecubes_int(global const int *p_numInputPlanes, global const int *p_numFilters, 
 77 |       global const int *p_imageSize, global const int *p_filterSize,
 78 |       global const int *images, global const int *filters, global int *output) {
 79 |     int globalId = get_global_id(0);
 80 | 
 81 |     int numInputPlanes = p_numInputPlanes[0];
 82 |     int numFilters = p_numFilters[0];
 83 |     int imageSize = p_imageSize[0];
 84 |     int filterSize = p_filterSize[0];
 85 |     int imageSizeSquared = imageSize * imageSize;
 86 | 
 87 |     int outputImage2Id = globalId / imageSizeSquared;
 88 |     int filterId = outputImage2Id % numFilters;
 89 |     int inputImage3Id = outputImage2Id / numFilters;
 90 | 
 91 |     int filterOffset = filterId * filterSize * filterSize;
 92 |     int inputImage3Offset = inputImage3Id * numInputPlanes * imageSizeSquared;
 93 | 
 94 |     // intraimage coords
 95 |     int localid = globalId % imageSizeSquared;
 96 |     int row = localid / imageSize;
 97 |     int col = localid % imageSize;
 98 | 
 99 |     int halfFilterSize = filterSize >> 1;
100 |     int sum = 0;
101 |     int minm = max(-halfFilterSize, -row);
102 |     int maxm = min(halfFilterSize, imageSize - 1 - row);
103 |     int minn = max(-halfFilterSize, -col);
104 |     int maxn = min(halfFilterSize, imageSize - 1 - col);
105 |     int plane = 0;
106 |     while(plane < numInputPlanes) {
107 |         int inputImageOffset = inputImage3Offset + plane * imageSizeSquared;
108 |         int filterPlaneOffset = filterOffset + plane * filterSize * filterSize;
109 |         int m = minm;
110 |         while(m <= maxm) {
111 |             int y = row + m;
112 |             int inputimagerowoffset = inputImageOffset + y * imageSize;
113 |             int filterrowoffset = filterPlaneOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
114 |             int n = minn;
115 |             while(n <= maxn) {
116 |                 int x = col + n;
117 |                 sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ];
118 |                 n++;
119 |             }
120 |             m++;
121 |         }
122 |         plane++;
123 |     }
124 |     output[globalId] = sum;
125 | }
126 | 
127 | // receive images as a stack of images
128 | // globalid = n * numfilters * imagesize * imagesize + filter * imagesize * imagesize + imagerow * imagesize + imagecol
129 | //                                 globalid              globalid
130 | //  inputimage3 1 inputimage2 1----filter 1             -> outputimage2 1   outputimage3 1
131 | //                inputimage2 2_/\_filter 2             -> outputimage2 2
132 | //  inputimage3 2 inputimage2 3    filter 1             -> outputimage2 3   outputimage3 2
133 | //                inputimage2 4    filter 2             -> outputimage2 4
134 | //
135 | // each outputimage is only written once, by a combination of:
136 | // - one inputimage3
137 | // - one filter
138 | // each inputimage3 is mapped to each filter once, each time writing to one outputimage
139 | //
140 | // images is:
141 | //       numimages * numinputplanes * imagesizesquared
142 | // filters is:
143 | //       numfilters * numinputplanes * filtersizesquared
144 | // outputs is:
145 | //       numimages * numfilters * outputimagesizesquared
146 | 
147 | // images are organized like [imageId][plane][row][col]
148 | // filters are organized like [filterid][plane][filterrow][filtercol]
149 | // output are organized like [imageid][filterid][row][col]
150 | void kernel convolve_imagecubes_float( 
151 |       const int numInputPlanes, const int numFilters, 
152 |       const int imageSize, const int filterSize,
153 |       global const float *images, global const float *filters, global float *output) {
154 |     int globalId = get_global_id(0);
155 | 
156 |     int imageSizeSquared = imageSize * imageSize;
157 | 
158 |     int outputImage2Id = globalId / imageSizeSquared;
159 |     int filterId = outputImage2Id % numFilters;
160 |     int inputImage3Id = outputImage2Id / numFilters;
161 | 
162 |     int filterOffset = filterId * filterSize * filterSize;
163 |     int inputImage3Offset = inputImage3Id * numInputPlanes * imageSizeSquared;
164 | 
165 |     // intraimage coords
166 |     int localid = globalId % imageSizeSquared;
167 |     int row = localid / imageSize;
168 |     int col = localid % imageSize;
169 | 
170 |     int halfFilterSize = filterSize >> 1;
171 |     float sum = 0;
172 |     // m should vary from -halfFilterSize through 0 to halfFilterSize 
173 |     // n too...
174 |     int minm = max(-halfFilterSize, -row);
175 |     int maxm = min(halfFilterSize, imageSize - 1 - row);
176 |     int minn = max(-halfFilterSize, -col);
177 |     int maxn = min(halfFilterSize, imageSize - 1 - col);
178 |     int inputPlane = 0;
179 |     while(inputPlane < numInputPlanes) {
180 |         int inputImageOffset = inputImage3Offset + inputPlane * imageSizeSquared;
181 |         int m = minm;
182 |         while(m <= maxm) {
183 |             int y = row + m;
184 |             int inputimagerowoffset = inputImageOffset + y * imageSize;
185 |             int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
186 |             int n = minn;
187 |             while(n <= maxn) {
188 |                 int x = col + n;
189 |                 sum += images[ inputimagerowoffset + x] * filters[ filterrowoffset + n ];
190 |                 n++;
191 |             }
192 |             m++;
193 |         }
194 |         inputPlane++;
195 |     }
196 | 
197 |     output[globalId] = sum;
198 | }
199 | 
200 | void kernel convolve_imagecubes_float_nopadzeros( 
201 |       const int numInputPlanes, const int numFilters, 
202 |       const int inputSize, const int filterSize,
203 |       global const float *images, global const float *filters, global float *output) {
204 |     int globalId = get_global_id(0);
205 | 
206 |     int inputSizeSquared = inputSize * inputSize;
207 |     int outputSize = inputSize - filterSize + 1;
208 |     int outputSizeSquared = outputSize * outputSize;
209 | 
210 |     int outputImage2Id = globalId / outputSizeSquared;
211 |     int filterId = outputImage2Id % numFilters;
212 |     int inputImage3Id = outputImage2Id / numFilters;
213 | 
214 |     int filterOffset = filterId * filterSize * filterSize;
215 |     int inputImage3Offset = inputImage3Id * numInputPlanes * inputSizeSquared;
216 | 
217 |     // intraimage coords
218 |     int localid = globalId % outputSizeSquared;
219 |     int outputRow = localid / outputSize;
220 |     int outputCol = localid % outputSize;
221 | 
222 |     int halfFilterSize = filterSize >> 1;
223 |     float sum = 0;
224 |     int minm = -halfFilterSize;
225 |     int maxm = halfFilterSize;
226 |     int minn = -halfFilterSize;
227 |     int maxn = halfFilterSize;
228 |     int inputPlane = 0;
229 |     while(inputPlane < numInputPlanes) {
230 |         int inputImageOffset = inputImage3Offset + inputPlane * inputSizeSquared;
231 |         int m = minm;
232 |         while(m <= maxm) {
233 |             int inputRow = outputRow + m + halfFilterSize;
234 |             int inputimagerowoffset = inputImageOffset + inputRow * inputSize;
235 |             int filterrowoffset = filterOffset + (m+halfFilterSize) * filterSize + halfFilterSize;
236 |             int n = minn;
237 |             while(n <= maxn) {
238 |                 int inputCol = outputCol + n + halfFilterSize;
239 |                 sum += images[ inputimagerowoffset + inputCol] * filters[ filterrowoffset + n ];
240 |                 n++;
241 |             }
242 |             m++;
243 |         }
244 |         inputPlane++;
245 |     }
246 |     output[globalId] = sum;
247 | }
248 | 
249 | 


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/frameworks/open_cl/source/softmax.cl:
--------------------------------------------------------------------------------
 1 | 
 2 | // TODO write a parallel reduction: 
 3 | //
 4 | // * find the max element in the array
 5 | // * ..
 6 | 
 7 | #define SOFTMAX(interfn) \
 8 | __kernel __attribute((reqd_work_group_size(1, 1, 1))) \
 9 | void interfn##_float(__global float* x, __global float* result, const uintptr_t len) { \
10 |     float in_max = -MAXFLOAT; \
11 |     float sum = 0.0; \
12 |     uintptr_t i; \
13 |     for(i = 0; i < len; i++) { \
14 |         float current = x[i]; \
15 |         in_max = (in_max > current) ? in_max : current; \
16 |     } \
17 |     for(i = 0; i < len; i++) { \
18 |         float current = exp(x[i] - in_max); \
19 |         sum += current; \
20 |         result[i] = current; \
21 |     } \
22 |     for(i = 0; i < len; i++) { \
23 |         result[i] = interfn(result[i] / sum); \
24 |     } \
25 | } \
26 | 
27 | #define softmax(x) (x)
28 | SOFTMAX(softmax)
29 | 
30 | #define log_softmax(x) (log(x))
31 | SOFTMAX(log_softmax)
32 | 
33 | __kernel void log_softmax_backward_float(
34 |     __global float* x, 
35 |     __global float* x_diff, 
36 |     __global float* result,
37 |     const uintptr_t len)
38 | {
39 |     float sum = 0.0;
40 |     uintptr_t i;
41 |     for(i = 0; i < len; i++) {
42 |         sum += x_diff[i];
43 |     }
44 |     for(i = 0; i < len; i++) {
45 |         result[i] = x_diff[i] - exp(x[i]) * sum;
46 |     }
47 | }


--------------------------------------------------------------------------------
/crates/parenchyma-deep/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! Parenchyma extension package for backend-agnostic deep neural network (NN) operations.
 2 | 
 3 | #![allow(unused_variables)]
 4 | #![feature(non_modrs_mods)]
 5 | 
 6 | extern crate ocl;
 7 | extern crate parenchyma;
 8 | 
 9 | pub use self::extension_package::{Extension, Package};
10 | pub mod frameworks;
11 | 
12 | mod extension_package;


--------------------------------------------------------------------------------
/crates/parenchyma-deep/tests/deep_specs.rs:
--------------------------------------------------------------------------------
  1 | #![feature(rustc_private)]
  2 | 
  3 | #[macro_use]
  4 | extern crate lazy_static;
  5 | extern crate parenchyma;
  6 | extern crate parenchyma_deep;
  7 | 
  8 | #[cfg(test)]
  9 | mod deep_specification_native {
 10 |     use parenchyma::frameworks::Native;
 11 |     use parenchyma::prelude::*;
 12 |     use parenchyma_deep::*;
 13 | 
 14 |     struct TestBackend(Backend<Package>);
 15 |     impl ::std::ops::Deref for TestBackend {
 16 |         type Target = Backend<Package>;
 17 |         fn deref(&self) -> &Self::Target { &self.0 }
 18 |     }
 19 |     unsafe impl Sync for TestBackend { }
 20 | 
 21 |     lazy_static! {
 22 |         static ref BACKEND: TestBackend = TestBackend(Backend::new::<Native<_>>().unwrap());
 23 |     }
 24 | 
 25 |     fn get_memory() -> (SharedTensor, SharedTensor) {
 26 |         let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap();
 27 |         let result: SharedTensor = SharedTensor::from([1, 1, 3]);
 28 |         (x, result)
 29 |     }
 30 | 
 31 |     fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){
 32 |         let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
 33 |         let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
 34 |         let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
 35 |         let result_diff = SharedTensor::from([1, 1, 3]);
 36 |         (x, x_diff, result, result_diff)
 37 |     }
 38 | 
 39 |     fn get_memory_softmax() -> (SharedTensor, SharedTensor) {
 40 |         let x = SharedTensor::with([1, 1, 4], vec![1.0; 4]).unwrap();
 41 |         let result: SharedTensor = SharedTensor::from([1, 1, 4]);
 42 |         (x, result)
 43 |     }
 44 | 
 45 |     #[test]
 46 |     fn it_computes_correct_log_softmax_on_for_f32() {
 47 |         let (mut x, mut result) = get_memory_softmax();
 48 |         BACKEND.log_softmax(&mut x, &mut result).unwrap();
 49 |         assert_eq!(&[-1.3862944, -1.3862944, -1.3862944, -1.3862944], result.as_slice().unwrap());
 50 |     }
 51 | 
 52 |     #[test]
 53 |     fn it_computes_correct_log_softmax_grad_on_for_f32() {
 54 |         let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory();
 55 |         BACKEND.log_softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap();
 56 |         assert_eq!(&[-9.873127, -9.873127, -27.556225], result_diff.as_slice().unwrap());
 57 |     }
 58 | 
 59 |     #[test]
 60 |     fn it_computes_correct_relu_on_for_f32() {
 61 |         let (mut x, mut result) = get_memory();
 62 |         BACKEND.relu(&mut x, &mut result).unwrap();
 63 |         assert_eq!(&[1., 1., 2.], result.as_slice().unwrap());
 64 |     }
 65 | 
 66 |     #[test]
 67 |     fn it_computes_correct_relu_grad_on_for_f32() {
 68 |         let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory();
 69 |         BACKEND.relu_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap();
 70 |         assert_eq!(&[1., 1., 2.], result_diff.as_slice().unwrap());
 71 |     }
 72 | 
 73 |     #[test]
 74 |     fn it_computes_correct_sigmoid_on_for_f32() {
 75 |         let (mut x, mut result) = get_memory();
 76 |         BACKEND.sigmoid(&mut x, &mut result).unwrap();
 77 |         assert_eq!(&[0.7310585786, 0.7310586, 0.880797], result.as_slice().unwrap());
 78 |     }
 79 | 
 80 |     #[test]
 81 |     fn it_computes_correct_sigmoid_grad_on_for_f32() {
 82 |         let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory();
 83 |         BACKEND.sigmoid_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap();
 84 |         assert_eq!(&[0., 0., -4.], result_diff.as_slice().unwrap());
 85 |     }
 86 | 
 87 |     #[test]
 88 |     fn it_computes_correct_softmax_on_for_f32() {
 89 |         let (mut x, mut result) = get_memory_softmax();
 90 |         BACKEND.softmax(&mut x, &mut result).unwrap();
 91 |         assert_eq!(&[0.25, 0.25, 0.25, 0.25], result.as_slice().unwrap());
 92 |     }
 93 | 
 94 |     #[test]
 95 |     fn it_computes_correct_softmax_grad_on_for_f32() {
 96 |         let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory();
 97 |         BACKEND.softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap();
 98 |         assert_eq!(&[-5., -5., -8.], result_diff.as_slice().unwrap());
 99 |     }
100 | }
101 | 
102 | #[cfg(test)]
103 | mod deep_specification_opencl {
104 |     use parenchyma::frameworks::OpenCL;
105 |     use parenchyma::hardware::{Hardware, HardwareKind};
106 |     use parenchyma::prelude::*;
107 |     use parenchyma_deep::*;
108 | 
109 |     struct TestBackend(Backend<Package>);
110 |     impl ::std::ops::Deref for TestBackend {
111 |         type Target = Backend<Package>;
112 |         fn deref(&self) -> &Self::Target { &self.0 }
113 |     }
114 |     unsafe impl Sync for TestBackend { }
115 | 
116 |     lazy_static! {
117 |         static ref BACKEND: TestBackend = {
118 |             let mut backend: Backend<Package> = Backend::new::<OpenCL<_>>().unwrap();
119 |             // required here!
120 |             backend.select(&|hardware| hardware.kind == HardwareKind::GPU);
121 |             TestBackend(backend)
122 |         };
123 |     }
124 | 
125 |     fn get_memory() -> (SharedTensor, SharedTensor) {
126 |         let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap();
127 |         let result: SharedTensor = SharedTensor::from([1, 1, 3]);
128 |         (x, result)
129 |     }
130 | 
131 |     fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){
132 |         let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
133 |         let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
134 |         let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
135 |         let result_diff = SharedTensor::from([1, 1, 3]);
136 |         (x, x_diff, result, result_diff)
137 |     }
138 | 
139 |     fn get_memory_softmax() -> (SharedTensor, SharedTensor) {
140 |         let x = SharedTensor::with([1, 1, 4], vec![1.0; 4]).unwrap();
141 |         let result: SharedTensor = SharedTensor::from([1, 1, 4]);
142 |         (x, result)
143 |     }
144 | 
145 |     #[test]
146 |     fn it_computes_correct_log_softmax_on_for_f32() {
147 |         let (mut x, mut result) = get_memory_softmax();
148 |         BACKEND.log_softmax(&mut x, &mut result).unwrap();
149 |         assert_eq!(&[-1.3862944, -1.3862944, -1.3862944, -1.3862944], result.as_slice().unwrap());
150 |     }
151 | 
152 |     #[test]
153 |     fn it_computes_correct_relu_on_for_f32() {
154 |         let (mut x, mut result) = get_memory();
155 |         BACKEND.relu(&mut x, &mut result).unwrap();
156 |         assert_eq!(&[1., 1., 2.], result.as_slice().unwrap());
157 |     }
158 | 
159 |     #[test]
160 |     fn it_computes_correct_sigmoid_on_for_f32() {
161 |         let (mut x, mut result) = get_memory();
162 |         BACKEND.sigmoid(&mut x, &mut result).unwrap();
163 |         assert_eq!(&[0.7310585786, 0.7310586, 0.880797], result.as_slice().unwrap());
164 |     }
165 | 
166 |     #[test]
167 |     fn it_computes_correct_softmax_on_for_f32() {
168 |         let (mut x, mut result) = get_memory_softmax();
169 |         BACKEND.softmax(&mut x, &mut result).unwrap();
170 |         assert_eq!(&[0.25, 0.25, 0.25, 0.25], result.as_slice().unwrap());
171 |     }
172 | }
173 | 
174 | #[cfg(test)]
175 | mod deep_specification_backward_opencl {
176 |     use parenchyma::frameworks::OpenCL;
177 |     use parenchyma::hardware::{Hardware, HardwareKind};
178 |     use parenchyma::prelude::*;
179 |     use parenchyma_deep::*;
180 | 
181 |     struct TestBackend(Backend<Package>);
182 |     impl ::std::ops::Deref for TestBackend {
183 |         type Target = Backend<Package>;
184 |         fn deref(&self) -> &Self::Target { &self.0 }
185 |     }
186 |     unsafe impl Sync for TestBackend { }
187 | 
188 |     lazy_static! {
189 |         static ref BACKEND: TestBackend = {
190 |             let mut backend: Backend<Package> = Backend::new::<OpenCL<_>>().unwrap();
191 |             // required here!
192 |             backend.select(&|hardware| hardware.kind == HardwareKind::GPU);
193 |             TestBackend(backend)
194 |         };
195 |     }
196 | 
197 |     fn get_memory() -> (SharedTensor, SharedTensor) {
198 |         let x = SharedTensor::with([1, 1, 3], &[1., 1., 2.][..]).unwrap();
199 |         let result: SharedTensor = SharedTensor::from([1, 1, 3]);
200 |         (x, result)
201 |     }
202 | 
203 |     fn get_grad_memory() -> (SharedTensor, SharedTensor, SharedTensor, SharedTensor){
204 |         let x = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
205 |         let x_diff = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
206 |         let result = SharedTensor::with([1, 1, 3], &[1.0, 1.0, 2.0][..]).unwrap();
207 |         let result_diff = SharedTensor::from([1, 1, 3]);
208 |         (x, x_diff, result, result_diff)
209 |     }
210 | 
211 |     #[test]
212 |     fn it_computes_correct_log_softmax_grad_on_for_f32() {
213 |         let (mut x, mut x_diff, _, mut result_diff) = get_grad_memory();
214 |         BACKEND.log_softmax_grad(&mut x, &mut x_diff, &mut result_diff).unwrap();
215 |         assert_eq!(&[-9.873127, -9.873127, -27.556223], result_diff.as_slice().unwrap());
216 |     }
217 | 
218 |     #[test]
219 |     fn it_computes_correct_sigmoid_grad_on_for_f32() {
220 |         let (mut x, mut x_diff, mut result, mut result_diff) = get_grad_memory();
221 |         BACKEND.sigmoid_grad(&mut x, &mut x_diff, &mut result, &mut result_diff).unwrap();
222 |         assert_eq!(&[0., 0., -4.], result_diff.as_slice().unwrap());
223 |     }
224 | }


--------------------------------------------------------------------------------
/crates/parenchyma-ml/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | /target
3 | **/*.rs.bk
4 | Cargo.lock
5 | 


--------------------------------------------------------------------------------
/crates/parenchyma-ml/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "parenchyma-ml"
 3 | version = "0.1.0"
 4 | authors = ["Jony <jonysy@users.noreply.github.com>"]
 5 | license = "MIT/Apache-2.0"
 6 | 
 7 | [dependencies.parenchyma]
 8 | path = "../../"
 9 | version = "0.0.4"
10 | 
11 | [dependencies.parenchyma-blas]
12 | path = "../parenchyma-blas"
13 | 
14 | [dependencies.parenchyma-deep]
15 | path = "../parenchyma-deep"


--------------------------------------------------------------------------------
/crates/parenchyma-ml/src/extension_package.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma::extension_package::{Dependency, ExtensionPackage};
 2 | use super::{parenchyma_blas, parenchyma_deep};
 3 | 
 4 | /// The machine learning package.
 5 | pub struct Package {
 6 |     /// The BLAS package.
 7 |     pub(crate) blas: parenchyma_blas::Package,
 8 |     /// The Deep NN package.
 9 |     pub(crate) deep: parenchyma_deep::Package,
10 | }
11 | 
12 | impl Dependency<parenchyma_blas::Package> for Package {
13 |     fn dependency(&self) -> &parenchyma_blas::Package {
14 |         &self.blas
15 |     }
16 | }
17 | 
18 | impl Dependency<parenchyma_deep::Package> for Package {
19 |     fn dependency(&self) -> &parenchyma_deep::Package {
20 |         &self.deep
21 |     }
22 | }
23 | 
24 | /// **note**: should be replaced with an actual trait alias ([RFC#1733]).
25 | ///
26 | /// [RFC#1733]: https://github.com/rust-lang/rfcs/pull/1733
27 | pub trait Dependencies: 
28 |     Dependency<parenchyma_blas::Package> + 
29 |     Dependency<parenchyma_deep::Package> {
30 |     //..
31 | }
32 | 
33 | impl<D> Dependencies for D
34 |     where D:
35 |     Dependency<parenchyma_blas::Package> + 
36 |     Dependency<parenchyma_deep::Package> {
37 |     // ..
38 | }
39 | 
40 | pub trait Extension 
41 |     where Self: 
42 |     parenchyma_blas::Extension + 
43 |     parenchyma_deep::Extension {
44 |     // ..
45 | }
46 | 
47 | impl ExtensionPackage for Package {
48 |     type Extension = Extension;
49 |     fn package_name(&self) -> &'static str {
50 |         return "parenchyma/ml";
51 |     }
52 | }


--------------------------------------------------------------------------------
/crates/parenchyma-ml/src/frameworks/mod.rs:
--------------------------------------------------------------------------------
1 | mod native;
2 | mod open_cl;


--------------------------------------------------------------------------------
/crates/parenchyma-ml/src/frameworks/native.rs:
--------------------------------------------------------------------------------
1 | use parenchyma::frameworks::NativeContext as Context;
2 | use super::super::{Dependencies, Extension};
3 | 
4 | impl<P> Extension for Context<P> where P: Dependencies { }


--------------------------------------------------------------------------------
/crates/parenchyma-ml/src/frameworks/open_cl.rs:
--------------------------------------------------------------------------------
 1 | use super::super::{Dependencies, Extension, Package};
 2 | 
 3 | use parenchyma::error::Result;
 4 | use parenchyma::extension_package::ExtensionPackageCtor;
 5 | use parenchyma::frameworks::OpenCLContext as Context;
 6 | use parenchyma_blas::Package as BLASPackage;
 7 | use parenchyma_blas::frameworks::open_cl::OpenCLPackage as OpenCLBLASPackage;
 8 | use parenchyma_deep::Package as DeepPackage;
 9 | use parenchyma_deep::frameworks::open_cl::OpenCLPackage as OpenCLDeepPackage;
10 | 
11 | impl<P> Extension for Context<P> where P: Dependencies { }
12 | 
13 | impl ExtensionPackageCtor<Context<()>> for Package {
14 |     fn package(target: &mut Context<()>) -> Result<Self> {
15 |         let blas = OpenCLBLASPackage::compile(target).map(BLASPackage::OpenCL)?;
16 |         let deep = OpenCLDeepPackage::compile(target).map(DeepPackage::OpenCL)?;
17 | 
18 |         Ok(Package { blas, deep })
19 |     }
20 | }


--------------------------------------------------------------------------------
/crates/parenchyma-ml/src/lib.rs:
--------------------------------------------------------------------------------
 1 | //! Parenchyma extension bundle for backend-agnostic operations related to machine learning (ML).
 2 | //!
 3 | //! A Parenchyma package that bundles the BLAS and Deep NN packages together to make one convenient
 4 | //! ML package.
 5 | //!
 6 | //! # Example Usage
 7 | //!
 8 | //! ```ignore
 9 | //! extern crate parenchyma;
10 | //! extern crate parenchyma_ml;
11 | //! 
12 | //! #[macro_use]
13 | //! use parenchyma::prelude::*;
14 | //! use extension_package::package::Package as MachLrnPackage;
15 | //! 
16 | //! // Initialize an OpenCL or CUDA backend packaged with the NN extension.
17 | //! let backend = BackendConfig::<MachLrnPackage>::new::<OpenCL>()?;
18 | //! 
19 | //! // Initialize two tensors.
20 | //! let ref x: SharedTensor = array![3.5, 12.4, 0.5, 6.5].into();
21 | //! let ref mut result: SharedTensor = data.shape().into();
22 | //! 
23 | //! // Run the sigmoid operation, provided by the NN extension, on your OpenCL/CUDA enabled 
24 | //! // GPU (or CPU, which is possible through OpenCL)
25 | //! backend.sigmoid(x, result)?;
26 | //! 
27 | //! // Print the result: `[0.97068775, 0.9999959, 0.62245935, 0.9984988] shape=[4], strides=[1]`
28 | //! println!("{:?}", result);
29 | //! ```
30 | 
31 | extern crate parenchyma;
32 | extern crate parenchyma_blas;
33 | extern crate parenchyma_deep;
34 | 
35 | pub use self::extension_package::{Dependencies, Extension, Package};
36 | 
37 | mod extension_package;
38 | mod frameworks;


--------------------------------------------------------------------------------
/crates/parenchyma-tr/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock


--------------------------------------------------------------------------------
/crates/parenchyma-tr/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "parenchyma-tr"
 3 | version = "0.1.0"
 4 | authors = ["Jony <jonysy@users.noreply.github.com>"]
 5 | license = "MIT/Apache-2.0"
 6 | 
 7 | [dependencies]
 8 | image = "0.17.0"
 9 | 
10 | [dependencies.parenchyma]
11 | path = "../parenchyma"


--------------------------------------------------------------------------------
/crates/parenchyma-tr/README.md:
--------------------------------------------------------------------------------
1 | # parenchyma-tr
2 | 
3 | **parenchyma-tr** _ processing data for machine learning tasks and for making the output data 
4 | easier to work with.


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! A framework for pre and post processing machine intelligence based data
2 | 
3 | extern crate image;
4 | extern crate parenchyma;
5 | 
6 | pub use self::transformer::Transformer;
7 | 
8 | mod transformer;
9 | mod transformers;


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/transformer.rs:
--------------------------------------------------------------------------------
 1 | use parenchyma::error::Result;
 2 | use parenchyma::prelude::SharedTensor;
 3 | 
 4 | /// An trait for dealing with transformers so that any transformable data type can be 
 5 | /// transformed into a `SharedTensor`.
 6 | pub trait Transformer {
 7 |     /// Returns the non-numeric data as a vector.
 8 |     fn as_vector(&self) -> Vec<f32>;
 9 |     /// Transforms (possibly non-numeric) data into a numeric `SharedTensor` with the provided
10 |     /// `shape`.
11 |     ///
12 |     /// # Returns
13 |     ///
14 |     /// An `Error` is returned if the expected capacity (defined by the `shape`) differs from the
15 |     /// observed one.
16 |     fn transform(&self, shape: &[usize]) -> Result<SharedTensor<f32>> {
17 |         SharedTensor::with(shape, self.as_vector())
18 |     }
19 | }


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/transformers/audio.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonysy/parenchyma/d6043971a0b4cdea0430b4d0face7be9cf2ccde9/crates/parenchyma-tr/src/transformers/audio.rs


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/transformers/image.rs:
--------------------------------------------------------------------------------
1 | use image::DynamicImage;
2 | 
3 | use super::super::Transformer;
4 | 
5 | impl Transformer for DynamicImage {
6 |     fn as_vector(&self) -> Vec<f32> {
7 |         self.raw_pixels().iter().map(|&elem| elem as f32).collect()
8 |     }
9 | }


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/transformers/mod.rs:
--------------------------------------------------------------------------------
1 | mod audio;
2 | mod image;
3 | mod word;


--------------------------------------------------------------------------------
/crates/parenchyma-tr/src/transformers/word.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonysy/parenchyma/d6043971a0b4cdea0430b4d0face7be9cf2ccde9/crates/parenchyma-tr/src/transformers/word.rs


--------------------------------------------------------------------------------
/license/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 Project Developers & Contributors
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/license/README.md:
--------------------------------------------------------------------------------
1 | Licensed under either
2 | 
3 |  * Apache license, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
4 |  * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)


--------------------------------------------------------------------------------
/src/backend.rs:
--------------------------------------------------------------------------------
  1 | //! The `Backend` is the heart of Parenchyma. It provides an interface for running parallel 
  2 | //! computations on one or more devices. It is the main and highest struct of Parenchyma.
  3 | //!
  4 | //! The `Backend` type is an abstraction over a [framework](./trait.Framework.html) and is used as 
  5 | //! a way to interact with your devices. You can create a backend for computation by first choosing 
  6 | //! a specific [framework](./trait.Framework.html) such as Open CL and afterwards selecting one or 
  7 | //! many available hardware to create a backend. A backend provides you with the functionality of 
  8 | //! managing the memory of the devices and copying memory objects to/from the host. Additionally, 
  9 | //! backends allow you to execute operations in parallel through kernel functions on the device(s) 
 10 | //! of the backend.
 11 | //!
 12 | //! # Architecture
 13 | //!
 14 | //! Backends are initialized by providing a framework and a selection of devices compatible with 
 15 | //! the framework to the [`Backend::new`](#method.new) associated function, or by simply 
 16 | //! calling [`Backend::default`](#method.default). The framework determines which devices are 
 17 | //! available and how parallel kernel functions can be executed.
 18 | //!
 19 | //! # Example
 20 | //!
 21 | //! ```
 22 | //! extern crate parenchyma;
 23 | //!
 24 | //! use parenchyma::frameworks::Native;
 25 | //! use parenchyma::prelude::*;
 26 | //!
 27 | //! // The `new` function initializes the framework on which it's called.
 28 | //! let framework: Native = Native::new().unwrap();
 29 | //! // The available frameworks can be obtained through the chosen `framework`.
 30 | //! let hardware = framework.hardware().to_vec();
 31 | //! // A ready to go backend can be created from the framework and hardware. It's worth noting that
 32 | //! // configuration options will be available in future versions.
 33 | //! let backend: Backend = Backend::with(framework, hardware).unwrap();
 34 | //! ```
 35 | 
 36 | use std::fmt;
 37 | use std::ops::Deref;
 38 | 
 39 | use super::compute_device::ComputeDevice;
 40 | use super::context::{Context, ContextCtor};
 41 | use super::error::{Error, ErrorKind, Result};
 42 | use super::extension_package::ExtensionPackage;
 43 | use super::framework::{Framework, FrameworkCtor};
 44 | use super::hardware::Hardware;
 45 | 
 46 | /// The representation of the backend.
 47 | pub struct Backend<Package = ()> {
 48 |     /// Provides the Framework.
 49 |     ///
 50 |     /// The Framework implementation such as OpenCL, CUDA, etc. defines, which should be 
 51 |     /// used and determines which hardwares will be available and how parallel kernel 
 52 |     /// functions can be executed.
 53 |     framework: Box<Framework>,
 54 |     /// The context associated with the `framework`.
 55 |     ///
 56 |     /// Contexts are the heart of both OpenCL and CUDA applications. Contexts are created from one 
 57 |     /// or more devices that are capable of executing methods and synchronizing memory. See 
 58 |     /// the `Context` trait for more information.
 59 |     context: Box<Context<Package=Package>>,
 60 |     /// All _activatable_ hardware provided to the context.
 61 |     ///
 62 |     /// A cache of the hardware selection which is used as a representation of each framework's 
 63 |     /// list of available devices when selecting a new active device.
 64 |     selection: Vec<Hardware>
 65 | }
 66 | 
 67 | impl<P> Backend<P> where P: ExtensionPackage {
 68 |     /// Constructs a backend of the provided type with its default configurations. 
 69 |     ///
 70 |     /// # Return value
 71 |     ///
 72 |     /// The return value is a backend if the process goes well; otherwise, it returns 
 73 |     /// a simple error.
 74 |     pub fn new<F>() -> Result<Self>
 75 |         where F: FrameworkCtor,
 76 |               F::Context: ContextCtor<P,F=F> {
 77 | 
 78 |         let framework = F::new()?;
 79 |         let hardware = framework.hardware().to_vec();
 80 |         Self::with(framework, hardware)
 81 |     }
 82 | 
 83 |     /// Constructs a backend from the specified `framework` and `selection`.
 84 |     ///
 85 |     /// # Arguments
 86 |     ///
 87 |     /// * `framework` - One of the available frameworks.
 88 |     /// * `selection` - A selection of hardware provided by the specified `framework`.
 89 |     ///
 90 |     /// # Return value
 91 |     ///
 92 |     /// The return value is a backend if the process goes well; otherwise, it returns 
 93 |     /// a simple error.
 94 |     pub fn with<F>(framework: F, selection: Vec<Hardware>) -> Result<Self> 
 95 |         where F: FrameworkCtor, 
 96 |               F::Context: ContextCtor<P,F=F>, {
 97 | 
 98 |         info!("[PARENCHYMA] Constructing a backend using the {} framework", framework.name());
 99 |         let context = box F::Context::new(&framework, &selection)? as Box<Context<Package=P>>;
100 |         let framework = box framework as Box<Framework>;
101 |         Ok(Self { framework, context, selection })
102 |     }
103 | }
104 | 
105 | impl<P> Backend<P> where P: ExtensionPackage {
106 |     /// Returns the active framework's active context's active device.
107 |     pub fn active_device(&self) -> &dyn ComputeDevice {
108 |         self.context.active_codev()
109 |     }
110 | 
111 |     /// Simply returns the selected hardware.
112 |     pub fn selection(&self) -> &[Hardware] {
113 |         &self.selection
114 |     }
115 | 
116 |     /// Select the first device that meets the specified requirements.
117 |     ///
118 |     /// # Example
119 |     ///
120 |     /// ```rust
121 |     /// use parenchyma::{Backend, HardwareKind, Native};
122 |     ///
123 |     /// let mut native: Backend = Backend::new::<Native>().unwrap();
124 |     /// assert!(native.select(|hardware| hardware.kind == HardwareKind::CPU).is_ok());
125 |     /// ```
126 |     pub fn select(&mut self, pred: &Fn(&Hardware) -> bool) -> Result {
127 | 
128 |         let nth = {
129 |             self.selection().iter().enumerate()
130 |                 .filter(|&(_, h)| pred(h)).map(|(i, _)| i).nth(0)
131 |         };
132 | 
133 |         match nth {
134 |             Some(n) => self.context.activate(n),
135 |             _ => {
136 |                 let message = "There are no devices matching the specified criteria.";
137 |                 Err(Error::new(ErrorKind::Other, message))
138 |             }
139 |         }
140 |     }
141 |     
142 |     /// Synchronizes backend.
143 |     pub fn synchronize(&self) -> Result {
144 |         Ok(())
145 |     }
146 | }
147 | 
148 | impl<P> Deref for Backend<P> where P: ExtensionPackage {
149 |     type Target = P::Extension;
150 |     
151 |     fn deref<'a>(&'a self) -> &'a Self::Target {
152 |         self.context.extension()
153 |     }
154 | }
155 | 
156 | impl<E> fmt::Debug for Backend<E> {
157 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
158 |         write!(f, "A backend provided by the {} framework", self.framework.name())
159 |     }
160 | }


--------------------------------------------------------------------------------
/src/changelog.rs:
--------------------------------------------------------------------------------
 1 | //! Project changelog (YEAR-MONTH-DAY)
 2 | 
 3 | /// Release 0.0.4 (2017-11-08)
 4 | ///
 5 | /// * Partially sketched out a transfer matrix addressing issue#23
 6 | /// * Simplified the complicated extension/build system resolving issue#25
 7 | ///     * The new extension/build system allows for framework specific backends.
 8 | /// * Worked on a OpenCL solution to issue#16
 9 | /// * Removed ndarray as it's not needed, which closes issue#20
10 | ///     * Mapped memory process doesn't work well with ndarray + reshaping a tensor means reshaping
11 | /// the native array
12 | /// * Lazy synchronization via auto-sync has been fully integrated
13 | /// * Implemented logic around pinned memory with unpinned memory fallback
14 | pub mod r0_0_4 {}
15 | 
16 | /// Release 0.0.3 (2017-03-04)
17 | ///
18 | /// * Implemented an OpenCL API wrapper
19 | /// * Partially implemented a CUDA API wrapper
20 | /// * Partially implemented native support
21 | /// * Worked on a fallback mechanism (see issue#15)
22 | ///     * No longer requires framework related feature flags (from the original Collenchyma project)
23 | ///     * No longer requires backends parameterized by a framework
24 | /// * New memory access API
25 | ///     * Implemented auto-sync
26 | ///     * Use a tensor lib (ndarray) as the underlying native memory representation
27 | /// * Add `Bundle` logic
28 | /// * Removed `IBinary`/`HashMap` technique. Use structs instead
29 | pub mod r0_0_3 {}


--------------------------------------------------------------------------------
/src/compute_device.rs:
--------------------------------------------------------------------------------
 1 | //! Provides a representation for one or many ready to use compute devices.
 2 | 
 3 | use std::any::{Any, TypeId};
 4 | 
 5 | use super::error::Result;
 6 | use super::memory::Memory;
 7 | use super::tensor::TensorShape;
 8 | 
 9 | /// An device capable of processing data.
10 | ///
11 | /// A compute device can be a single device, or multiple devices treated as a single device.
12 | ///
13 | /// ## Load Balancing Multiple Devices
14 | ///
15 | /// todo..
16 | pub trait ComputeDevice: Any + Allocate<f64> + Allocate<f32> { }
17 | 
18 | /// Implemented by allocators.
19 | pub trait Allocate<T> {
20 |     /// Allocates memory on the device.
21 |     fn allocate(&self, shape: &TensorShape) -> Result<Box<Memory<T>>>;
22 | }
23 | 
24 | impl ComputeDevice {
25 |     /// Returns `true` if the boxed type is the same as `T`.
26 |     #[inline]
27 |     pub fn is<T>(&self) -> bool where T: ComputeDevice {
28 |         // Get TypeId of the type this function is instantiated with
29 |         let t = TypeId::of::<T>();
30 |         // Get TypeId of the type in the trait object
31 |         let boxed = self.get_type_id();
32 |         // Compare both TypeIds on equality
33 |         t == boxed
34 |     }
35 | 
36 |     /// Returns some reference to the boxed value if it is of type `T`, or
37 |     /// `None` if it isn't.
38 |     #[inline]
39 |     pub fn downcast_ref<T>(&self) -> Option<&T> where T: ComputeDevice {
40 |         if self.is::<T>() {
41 |             unsafe {
42 |                 Some(&*(self as *const ComputeDevice as *const T))
43 |             }
44 |         } else {
45 |             None
46 |         }
47 |     }
48 | }


--------------------------------------------------------------------------------
/src/context.rs:
--------------------------------------------------------------------------------
 1 | //! Contexts are the heart of both OpenCL and CUDA applications. Contexts provide a container for
 2 | //! objects such as memory, command-queues, programs/modules and kernels.
 3 | //!
 4 | //! You can create a context encapsulating a selection of hardware via a [`Backend`].
 5 | //!
 6 | //! [`Backend`]: ./struct.Backend.html
 7 | 
 8 | use super::compute_device::ComputeDevice;
 9 | use super::error::Result;
10 | use super::extension_package::ExtensionPackage;
11 | use super::hardware::Hardware;
12 | 
13 | /// A trait implemented by all contexts.
14 | pub trait Context: 'static {
15 |     /// The extension package built for the framework's context.
16 |     type Package: ExtensionPackage;
17 |     /// Returns the active device.
18 |     fn active_codev(&self) -> &ComputeDevice;
19 |     /// Returns the package extension.
20 |     fn extension(&self) -> &<Self::Package as ExtensionPackage>::Extension;
21 |     /// Set the device at the specified `index` as the active device.
22 |     ///
23 |     /// Only one device can be the _active_ device - the device in which operations are executed -
24 |     /// if used through the context.
25 |     fn activate(&mut self, index: usize) -> Result;
26 | }
27 | 
28 | /// The non-object-safe part of the `Context`.
29 | ///
30 | /// todo: generic associated types may help here..
31 | pub trait ContextCtor<Package>
32 |     where Self: Context<Package=Package> + Sized, 
33 |           Package: ExtensionPackage {
34 |     /// The framework representation for the context.
35 |     type F;
36 |     /// Constructs a new context from the `framework` and the `selection` of hardware.
37 |     fn new(framework: &Self::F, selection: &[Hardware]) -> Result<Self>;
38 | }


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
  1 | //! Types for working with errors.
  2 | 
  3 | use std::{error, fmt};
  4 | use std::ops::Deref;
  5 | 
  6 | /// A specialized `Result` typedef.
  7 | pub type Result<T = (), E = Error> = ::std::result::Result<T, E>;
  8 | 
  9 | /// The error structure used by the Parenchyma crate.
 10 | #[derive(Debug)]
 11 | pub struct Error {
 12 |     kind: ErrorKind,
 13 |     /// A boxed sendable, syncable `Error`.
 14 |     inner: Option<Box<::std::error::Error + Send + Sync>>,
 15 | }
 16 | 
 17 | /// A set of general categories.
 18 | #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 19 | pub enum ErrorKind {
 20 |     /// A framework-specific error.
 21 |     ///
 22 |     /// Consider creating an framework-specific error by calling the `Error::from_framework` 
 23 |     /// function, rather than constructing an `Error` using this variant.
 24 |     Framework(&'static str),
 25 |     /// An error returned when attempting to access uninitialized memory.
 26 |     UninitializedMemory,
 27 |     /// Unable to drop the provided device because a memory allocation was not found for it.
 28 |     AllocatedMemoryNotFoundForDevice,
 29 |     /// An error occurred while attempting to synchronize memory.
 30 |     MemorySynchronizationFailed,
 31 |     /// A memory synchronization route was requested, but no available synchronization route was found.
 32 |     NoAvailableSynchronizationRouteFound,
 33 |     /// An error occurred while attempting to allocate memory.
 34 |     MemoryAllocationFailed,
 35 |     /// An error occurred while downcasting
 36 |     MemoryDowncasting,
 37 | 
 38 |     // MARK: - A set of tensor error categories
 39 | 
 40 |     /// Maximum number of backing memories has been reached (`BitMap` - type alias for `u64`).
 41 |     CapacityExceeded,
 42 |     /// The tensor shape is incompatible with the shape of some data.
 43 |     IncompatibleShape,
 44 |     /// Invalid reshaped tensor size.
 45 |     InvalidReshapedTensorSize,
 46 | 
 47 |     /// Any error not part of this list.
 48 |     Other,
 49 |     /// A marker variant that tells the compiler that users of this enum cannot match 
 50 |     /// it exhaustively ([related RFC](https://github.com/rust-lang/rust/issues/32770)).
 51 |     #[doc(hidden)]
 52 |     _NonExhaustive,
 53 | }
 54 | 
 55 | impl ErrorKind {
 56 |     fn as_str(&self) -> &'static str {
 57 |         use self::ErrorKind::*;
 58 | 
 59 |         match *self {
 60 |             Framework(name) => name,
 61 |             CapacityExceeded => "the maximum number of backing memories has been reached",
 62 |             IncompatibleShape => "the tensor shape is incompatible with the shape of the data",
 63 |             InvalidReshapedTensorSize => "size of the provided shape is not equal to the size of the current shape",
 64 |             UninitializedMemory => "uninitialized memory",
 65 |             AllocatedMemoryNotFoundForDevice => "memory allocation was not found for the provided device",
 66 |             MemorySynchronizationFailed => "memory synchronization failed",
 67 |             NoAvailableSynchronizationRouteFound => "no available memory synchronization route",
 68 |             MemoryAllocationFailed => "memory allocation failed",
 69 |             MemoryDowncasting => "something went wrong while downcasting",
 70 |             Other => "other error",
 71 |             _ => unreachable!(),
 72 |         }
 73 |     }
 74 | }
 75 | 
 76 | impl Error {
 77 |     /// Creates a new error from a known kind of error as well as an arbitrary error error.
 78 |     pub fn new<K, E>(kind: K, error: E) -> Error 
 79 |         where K: Into<ErrorKind>, 
 80 |               E: Into<Box<error::Error + Send + Sync>> {
 81 |         Self::_new(kind.into(), Some(error.into()))
 82 |     }
 83 | 
 84 |     /// Returns a reference to the inner error wrapped by this error (if any).
 85 |     pub fn get_ref(&self) -> Option<&(error::Error + Send + Sync + 'static)> {
 86 |         match self.inner {
 87 |             Some(ref error) => Some(error.deref()),
 88 |             _ => None
 89 |         }
 90 |     }
 91 |     /// Returns the corresponding `ErrorKind` for this error.
 92 |     pub fn kind(&self) -> ErrorKind {
 93 |         self.kind
 94 |     }
 95 | }
 96 | 
 97 | impl Error {
 98 |     // "De-generization" technique..
 99 |     fn _new(kind: ErrorKind, error: Option<Box<error::Error + Send + Sync>>) -> Error {
100 |         Error { kind, inner: error }
101 |     }
102 | }
103 | 
104 | impl fmt::Display for Error {
105 |     fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
106 |         write!(fmt, "{}", self.kind.as_str())
107 |     }
108 | }
109 | 
110 | impl error::Error for Error {
111 |     fn description(&self) -> &str {
112 |         if let Some(ref error) = self.inner {
113 |             error.description()
114 |         } else {
115 |             self.kind.as_str()
116 |         }
117 |     }
118 | 
119 |     fn cause(&self) -> Option<&error::Error> {
120 |         match self.inner {
121 |             Some(ref error) => error.cause(),
122 |             _ => None,
123 |         }
124 |     }
125 | }
126 | 
127 | impl From<ErrorKind> for Error {
128 |     /// Creates a new error from a known kind of error
129 |     fn from(kind: ErrorKind) -> Error {
130 |         Error::_new(kind, None)
131 |     }
132 | }
133 | 
134 | #[cfg(test)]
135 | mod test {
136 |     use super::{Error, ErrorKind};
137 |     use std::{error, fmt};
138 | 
139 |     #[test]
140 |     fn test_downcasting() {
141 |         #[derive(Debug)]
142 |         struct TestError;
143 |         impl fmt::Display for TestError { fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result {
144 |             Ok(())
145 |         }}
146 |         impl error::Error for TestError { fn description(&self) -> &str { "abc" } }
147 |         let err = Error::new(ErrorKind::Other, TestError);
148 |         assert!(err.get_ref().unwrap().is::<TestError>());
149 |         assert_eq!("abc", err.get_ref().unwrap().description());
150 |     }
151 | }


--------------------------------------------------------------------------------
/src/extension_package.rs:
--------------------------------------------------------------------------------
 1 | //! A package can be a binary, a source file, c code, a single kernel, etc., or a collective which
 2 | //! share related functionalities. A package is provided by a specific library such as BLAS. Notice
 3 | //! that packages are analogous to those of Rust (i.e., crates):
 4 | //!
 5 | //! compiled crate <-> package
 6 | //! library (one or more modules) <-> bundle
 7 | //!
 8 | //! A package needs to be _built_, which is handled by the specific implementation of a binary
 9 | //! representation, and returns initialized operations based on the library. Interacting directly
10 | //! with the package itself is possible, but it should be used to construct the backend-agnostic
11 | //! operations, which can then be executed and parallelized via a unified interface.
12 | //!
13 | //! ## Extensions
14 | //!
15 | //! A library can be a binary, a source file, c code, a single kernel, etc., or a collective.
16 | //!
17 | //! A backend is a Rust struct like any other, therefore you probably would like to implement
18 | //! certain methods for the Backend. As the whole purpose of a Backend is to provide an
19 | //! abstraction over various computation devices and computation languages, these implemented
20 | //! methods will than be able to execute on different devices and use the full power of 
21 | //! the machine's underlying hardware.
22 | //!
23 | //! Extending the backend with operations is easy. In Parenchyma we call crates, which provide
24 | //! operations for the backend, _extensions_. Extensions are usually a group of related 
25 | //! operations of a common field. Two examples for Parenchyma extensions 
26 | //! are [BLAS][parenchyma-blas] and [NN][parenchyma-nn].
27 | //!
28 | //! An extension provides generic traits and the explicit implementation of these traits for 
29 | //! one or (even better) all available Parenchyma frameworks - common host CPU, OpenCL, CUDA.
30 | //!
31 | //! The structure of an extension is pretty simple with as little overhead as possible. Macros 
32 | //! and build-scripts make implementations even easier. If you would like to use specific 
33 | //! extension for you backend, all you need to do is set them as dependencies in your Cargo 
34 | //! file in addition to the Parenchyma crate. The extension then automatically extends the 
35 | //! backend provided by Parenchyma.
36 | //!
37 | //! Extending the backend with your own extension is a straightforward process. For now we 
38 | //! recommend that you take a look at the general code structure 
39 | //! of [Parenchyma-BLAS][parenchyma-blas] or its documentation. Let us now about your extension 
40 | //! on the Gitter chat, we are happy to feature your Parenchyma Extension on the README.
41 | 
42 | use super::context::Context;
43 | use super::error::Result;
44 | 
45 | /// Represents a package dependency.
46 | pub trait Dependency<P>: ExtensionPackage {
47 |     /// Returns the dependency.
48 |     fn dependency(&self) -> &P;
49 | }
50 | 
51 | impl<P> Dependency<P> for P where P: ExtensionPackage {
52 |     fn dependency(&self) -> &P {
53 |         &self
54 |     }
55 | }
56 | 
57 | /// Provides the generic functionality for a backend-specific implementation of a library.
58 | pub trait ExtensionPackage: 'static {
59 |     type Extension: ?Sized;
60 | 
61 |     /// The name of the package.
62 |     ///
63 |     /// This associated constant is primarily used for logging/debugging purposes. The naming 
64 |     /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn").
65 |     fn package_name(&self) -> &'static str;
66 | }
67 | 
68 | /// Builds a package and provides the functionality for turning a library into backend-specific, 
69 | /// executable operations, and tailored for the target framework.
70 | ///
71 | /// note: the `Context` trait is used here simply as a marker trait.
72 | pub trait ExtensionPackageCtor<TargetContext>: Sized
73 |     /*where Self: ExtensionPackage + Sized, 
74 |           TargetContext: Context*/ {
75 |     /// Compiles the library into a package after initializing and configuring the library.
76 |     ///
77 |     /// This associated constant is primarily used for logging/debugging purposes. The naming 
78 |     /// convention is as follows: "[organization]/[package-name]" (e.g., "parenchyma/nn").
79 |     fn package(target: &mut TargetContext) -> Result<Self>;
80 | }
81 | 
82 | impl ExtensionPackage for () {
83 |     type Extension = ::std::any::Any;
84 |     /// The default package.
85 |     fn package_name(&self) -> &'static str {
86 |         return "parenchyma/default";
87 |     }
88 | }
89 | 
90 | impl<T> ExtensionPackageCtor<T> for () where T: Context {
91 |     fn package(_target: &mut T) -> Result<Self> {
92 |         return Ok(());
93 |     }
94 | }


--------------------------------------------------------------------------------
/src/framework.rs:
--------------------------------------------------------------------------------
 1 | //! Provides the generic functionality of a hardware supporting frameworks such 
 2 | //! as native CPU, Open CL, CUDA, etc..
 3 | //!
 4 | //! The default framework is simply the host CPU for common computation. To make use 
 5 | //! of other devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to 
 6 | //! access the processing capabilities of the device(s). To start backend-agnostic and highly 
 7 | //! parallel computation, you start by initializing one of the `Framework` implementations, 
 8 | //! resulting in an initialized Framework, that contains, among other things, a list of all 
 9 | //! available hardwares through that framework.
10 | //!
11 | //! # Example
12 | //!
13 | //! ```
14 | //! extern crate parenchyma;
15 | //! 
16 | //! use parenchyma::frameworks::Native;
17 | //! use parenchyma::prelude::*;
18 | //!
19 | //! // A ready to go backend can be created by simply providing the framework type.
20 | //! let backend: Backend = Backend::new::<Native>().unwrap();
21 | //! ```
22 | 
23 | use super::error::Result;
24 | use super::hardware::Hardware;
25 | 
26 | /// A trait implemented for all frameworks. `Framework`s contain a list of all available 
27 | /// devices as well as other objects specific to the implementor.
28 | pub trait Framework: 'static {
29 |     /// Returns the name of the framework, which is mainly used for the purposes of debugging 
30 |     /// and reporting errors.
31 |     fn name(&self) -> &'static str;
32 |     /// Returns the cached and available hardware.
33 |     ///
34 |     /// note: this method will likely be replaced 
35 |     /// with a [field](https://github.com/rust-lang/rfcs/pull/1546).
36 |     fn hardware(&self) -> &[Hardware];
37 | }
38 | 
39 | /// The non-object-safe part of the framework trait.
40 | ///
41 | /// A separate trait is used because it violates object-safety rules, i.e., `Framework` is the 
42 | /// object-safe version of `FrameworkCtor` (or `FrameworkCtor` is the non-object-safe 
43 | /// version of `Framework`). `FrameworkCtor` is simply a constructor (hence the name `*Ctor`). In 
44 | /// other words, this trait is split into object-safe and non-object-safe parts.
45 | ///
46 | /// todo: generic associated types may help here..
47 | pub trait FrameworkCtor: Framework + Sized {
48 |     /// The context representation for the framework.
49 |     type Context;
50 |     /// Initializes a `Framework`.
51 |     fn new() -> Result<Self>;
52 | }


--------------------------------------------------------------------------------
/src/frameworks/mod.rs:
--------------------------------------------------------------------------------
1 | //! Exposes the specific framework implementations.
2 | 
3 | pub use self::native::{HOST, Native, NativeContext, NativeDevice, NativeMemory};
4 | pub use self::open_cl::{OpenCL, OpenCLBuf, OpenCLContext, OpenCLDevice, OpenCLMemory};
5 | 
6 | mod native;
7 | mod open_cl;


--------------------------------------------------------------------------------
/src/frameworks/native/context.rs:
--------------------------------------------------------------------------------
 1 | use std::marker::PhantomData;
 2 | 
 3 | use std::marker::Unsize;
 4 | use super::Native;
 5 | use super::super::super::compute_device::ComputeDevice;
 6 | use super::super::super::context::{Context, ContextCtor};
 7 | use super::super::super::error::Result;
 8 | use super::super::super::extension_package::ExtensionPackage;
 9 | use super::super::super::hardware::Hardware;
10 | 
11 | /// Defines a Native context.
12 | pub struct NativeContext<P>(PhantomData<P>);
13 | 
14 | impl<Package> Context for NativeContext<Package> 
15 |     where Package: ExtensionPackage, 
16 |           NativeContext<Package>: Unsize<Package::Extension> {
17 | 
18 |     type Package = Package;
19 | 
20 |     fn active_codev(&self) -> &ComputeDevice {
21 |         &super::HOST
22 |     }
23 | 
24 |     fn activate(&mut self, _: usize) -> Result {
25 |         Ok(())
26 |     }
27 | 
28 |     fn extension(&self) -> &<Package as ExtensionPackage>::Extension {
29 |         self
30 |     }
31 | }
32 | 
33 | impl<P> ContextCtor<P> for NativeContext<P>
34 |     where P: 'static + ExtensionPackage, 
35 |           NativeContext<P>: Unsize<P::Extension> {
36 |             
37 |     type F = Native<P>;
38 | 
39 |     fn new(_: &Self::F, _: &[Hardware]) -> Result<Self> {
40 |         Ok(NativeContext(PhantomData))
41 |     }
42 | }


--------------------------------------------------------------------------------
/src/frameworks/native/device.rs:
--------------------------------------------------------------------------------
 1 | use ndarray::Array;
 2 | 
 3 | use super::NativeMemory;
 4 | use super::super::super::compute_device::{Allocate, ComputeDevice};
 5 | use super::super::super::error::Result;
 6 | use super::super::super::memory::Memory;
 7 | use super::super::super::tensor::TensorShape;
 8 | 
 9 | /// The native device.
10 | #[derive(Debug)]
11 | pub struct NativeDevice;
12 | 
13 | impl ComputeDevice for NativeDevice { }
14 | 
15 | impl<T: 'static> Allocate<T> for NativeDevice {
16 |     fn allocate(&self, shape: &TensorShape) -> Result<Box<Memory<T>>> {
17 |         let mut v = Vec::with_capacity(shape.capacity());
18 | 
19 |         unsafe {
20 |             v.set_len(shape.capacity());
21 |         }
22 | 
23 |         let array = Array::from_shape_vec(shape.dimensions(), v).unwrap();
24 |         let memory = NativeMemory(array);
25 | 
26 |         return Ok(Box::new(memory));
27 |     }
28 | }


--------------------------------------------------------------------------------
/src/frameworks/native/framework.rs:
--------------------------------------------------------------------------------
 1 | use super::NativeContext;
 2 | use super::super::super::error::Result;
 3 | use super::super::super::framework::{Framework, FrameworkCtor};
 4 | use super::super::super::hardware::{Hardware, HardwareKind};
 5 | 
 6 | use std::marker::PhantomData;
 7 | 
 8 | /// The native framework
 9 | #[derive(Debug)]
10 | pub struct Native<P> {
11 |     hardware: [Hardware; 1],
12 |     package: PhantomData<P>,
13 | }
14 | 
15 | impl<P> Native<P> {
16 |     const ID: &'static str = "native/host";
17 | }
18 | 
19 | impl<P> Framework for Native<P> where P: 'static {
20 |     fn name(&self) -> &'static str {
21 |         return Native::<P>::ID;
22 |     }
23 | 
24 |     fn hardware(&self) -> &[Hardware] {
25 |         &self.hardware
26 |     }
27 | }
28 | 
29 | impl<P> FrameworkCtor for Native<P> where P: 'static {
30 |     type Context = NativeContext<P>;
31 | 
32 |     fn new() -> Result<Self> {
33 |         Ok(Native {
34 |             hardware: [Hardware {
35 |                 id: 0usize,
36 |                 framework: Native::<P>::ID,
37 |                 kind: HardwareKind::CPU,
38 |                 name: String::from("Host CPU"),
39 |                 compute_units: 1,
40 |             }],
41 |             package: PhantomData,
42 |         })
43 |     }
44 | }


--------------------------------------------------------------------------------
/src/frameworks/native/memory.rs:
--------------------------------------------------------------------------------
 1 | use ndarray::{Array, IxDyn};
 2 | use std::ops::{Deref, DerefMut};
 3 | 
 4 | // use super::super::super::{Device, Memory, TransferDirection};
 5 | // use super::super::super::error::Result;
 6 | 
 7 | use super::NativeDevice;
 8 | use super::super::super::compute_device::ComputeDevice;
 9 | use super::super::super::memory::Memory;
10 | 
11 | /// A newtype (with an internal type of an n-dimensional array) representing a native memory buffer.
12 | ///
13 | /// note: named `Memory` for consistency across frameworks.
14 | pub struct NativeMemory<T>(pub(in crate) Array<T, IxDyn>);
15 | 
16 | impl<T: 'static> Memory<T> for NativeMemory<T> {
17 |     fn synchronized(&self, compute_device: &ComputeDevice) -> bool {
18 |         compute_device.is::<NativeDevice>()
19 |     }
20 | }
21 | 
22 | impl<T> Deref for NativeMemory<T> {
23 |     type Target = Array<T, IxDyn>;
24 |     fn deref(&self) -> &Self::Target {
25 |         &self.0
26 |     }
27 | }
28 | 
29 | impl<T> DerefMut for NativeMemory<T> {
30 |     fn deref_mut(&mut self) -> &mut Self::Target {
31 |         &mut self.0
32 |     }
33 | }


--------------------------------------------------------------------------------
/src/frameworks/native/mod.rs:
--------------------------------------------------------------------------------
 1 | pub use self::context::NativeContext;
 2 | pub use self::device::NativeDevice;
 3 | pub use self::framework::Native;
 4 | pub use self::memory::NativeMemory;
 5 | 
 6 | mod context;
 7 | mod device;
 8 | mod framework;
 9 | mod memory;
10 | 
11 | pub const HOST: NativeDevice = NativeDevice;


--------------------------------------------------------------------------------
/src/frameworks/open_cl/context.rs:
--------------------------------------------------------------------------------
  1 | use ocl;
  2 | use std::ffi::CString;
  3 | use std::marker::Unsize;
  4 | use super::{OpenCL, OpenCLDevice};
  5 | use super::super::super::compute_device::ComputeDevice;
  6 | use super::super::super::context::{Context, ContextCtor};
  7 | use super::super::super::error::{Error, ErrorKind, Result};
  8 | use super::super::super::extension_package::{ExtensionPackage, ExtensionPackageCtor};
  9 | use super::super::super::hardware::Hardware;
 10 | 
 11 | /// Defines a Open CL context.
 12 | ///
 13 | /// A context is responsible for managing OpenCL objects and resources (command-queues, program 
 14 | /// objects, kernel objects, executing kernels, etc.). The usual configuration is a single context 
 15 | /// encapsulating multiple devices. The resources, such as [buffers][buffer] and [events][event], 
 16 | /// can be shared across multiple devices in a single context. Other possible setups include:
 17 | ///
 18 | /// * a single context for multiple devices
 19 | /// * a single context for a single device
 20 | /// * a context for each device
 21 | ///
 22 | /// note: multi-platform contexts are not supported in OpenCL.
 23 | ///
 24 | /// ## Programs
 25 | ///
 26 | /// An OpenCL context can have multiple programs associated with it. Programs can be compiled
 27 | /// individually to avoid possible name clashes due to using packages from multiple package 
 28 | /// authors.
 29 | ///
 30 | /// [buffer]: ./frameworks/opencl/struct.Memory.html
 31 | /// [event]: ./frameworks/opencl/struct.Event.html
 32 | pub struct OpenCLContext<P> {
 33 |     /// The context.
 34 |     context: ocl::Context,
 35 |     /// The index of the _active_ device.
 36 |     active: usize,
 37 |     /// A list of devices associated with the context.
 38 |     selected_devices: Vec<OpenCLDevice>,
 39 |     /// The `Device`s' corresponding `Hardware`.
 40 |     selected_hardware: Vec<Hardware>,
 41 |     // todo document this:
 42 |     // package is stored here because
 43 |     // a) the program depends on the selected devices
 44 |     // b) the lazy static would new the context
 45 |     //   1) mutating would be possible but wouldn't be worth the cost and trouble
 46 |     extension_package: P,
 47 | }
 48 | 
 49 | impl<P> OpenCLContext<P> {
 50 |     pub fn device(&self) -> &OpenCLDevice {
 51 |         &self.selected_devices[self.active]
 52 |     }
 53 | 
 54 |     pub fn extension_package(&self) -> &P {
 55 |         &self.extension_package
 56 |     }
 57 |     
 58 |     /// Builds and returns a program.
 59 |     pub fn program(&self, src_strings: Vec<CString>) -> Result<ocl::Program> {
 60 |         let cmplr_opts = CString::new("").unwrap();
 61 |         let device_ids: Vec<_> = self.selected_devices.iter().map(|d| d.device.clone()).collect();
 62 | 
 63 |         Ok(ocl::Program::new(
 64 |             self.context.core(), 
 65 |             src_strings, 
 66 |             Some(&device_ids), 
 67 |             cmplr_opts
 68 |         )?)
 69 |     }
 70 | }
 71 | 
 72 | impl<Package> Context for OpenCLContext<Package> 
 73 |     where Package: ExtensionPackage, 
 74 |           OpenCLContext<Package>: Unsize<Package::Extension> {
 75 | 
 76 |     type Package = Package;
 77 | 
 78 |     fn active_codev(&self) -> &ComputeDevice {
 79 |         &self.selected_devices[self.active]
 80 |     }
 81 | 
 82 |     fn extension(&self) -> &<Package as ExtensionPackage>::Extension {
 83 |         self
 84 |     }
 85 | 
 86 |     fn activate(&mut self, index: usize) -> Result {
 87 |         if index >= self.selected_devices.len() {
 88 |             return Err(Error::new(ErrorKind::Other, "device index out of range"));
 89 |         }
 90 | 
 91 |         self.active = index;
 92 | 
 93 |         Ok(())
 94 |     }
 95 | }
 96 | 
 97 | impl<P> ContextCtor<P> for OpenCLContext<P>
 98 |     where P: 'static + ExtensionPackage + ExtensionPackageCtor<OpenCLContext<()>>, 
 99 |           OpenCLContext<P>: Unsize<P::Extension> {
100 |             
101 |     type F = OpenCL<P>;
102 | 
103 |     fn new(framework: &Self::F, selection: &[Hardware]) -> Result<Self> {
104 | 
105 |         let props = ocl::builders::ContextProperties::new().platform(framework.implementation);
106 |         let s = ocl::builders::DeviceSpecifier::Indices(selection.iter().map(|h| h.id).collect());
107 |         let ctx = ocl::Context::new(Some(props), Some(s), None, None)?;
108 | 
109 |         let mut devices = vec![];
110 | 
111 |         for hardware in selection.iter() {
112 |             let d = ocl::Device::by_idx_wrap(framework.implementation, hardware.id);
113 |             let queue = ocl::Queue::new(&ctx, d, Some(ocl::flags::QUEUE_PROFILING_ENABLE))?;
114 | 
115 |             devices.push(OpenCLDevice {
116 |                 device: d,
117 |                 context: ctx.clone(),
118 |                 queue,
119 |             });
120 |         }
121 | 
122 |         let mut unpackaged = OpenCLContext { 
123 |             context: ctx, 
124 |             active: 0, 
125 |             selected_devices: devices, 
126 |             selected_hardware: selection.to_vec(),
127 |             extension_package: (),
128 |         };
129 | 
130 |         let package = P::package(&mut unpackaged)?;
131 | 
132 |         Ok(OpenCLContext {
133 |             context: unpackaged.context,
134 |             active: unpackaged.active,
135 |             selected_devices: unpackaged.selected_devices,
136 |             selected_hardware: unpackaged.selected_hardware,
137 |             extension_package: package,
138 |         })
139 |     }
140 | }


--------------------------------------------------------------------------------
/src/frameworks/open_cl/device.rs:
--------------------------------------------------------------------------------
 1 | use ocl;
 2 | 
 3 | use super::{OpenCLBuf, OpenCLMemory};
 4 | use super::super::super::compute_device::{Allocate, ComputeDevice};
 5 | use super::super::super::error::Result;
 6 | use super::super::super::memory::Memory;
 7 | use super::super::super::tensor::{TensorShape, TensorType};
 8 | 
 9 | /// Represents an Open CL device.
10 | #[derive(Clone, Debug)]
11 | pub struct OpenCLDevice {
12 |     pub(in frameworks::open_cl) device: ocl::Device,
13 |     pub(in frameworks::open_cl) context: ocl::Context,
14 |     /// A command queue
15 |     ///
16 |     /// A command queue is the mechanism for interaction with the device. The queue is used for 
17 |     /// operations such as kernel launches and memory copies. At least one command queue per device
18 |     /// is required. Queues are used by the host application to submit work to devices and 
19 |     /// associated with devices within a context.
20 |     ///
21 |     /// __commands__:
22 |     ///
23 |     /// - memory copy or mapping
24 |     /// - device code execution
25 |     /// - synchronization point
26 |     ///
27 |     /// __modes__:
28 |     ///
29 |     /// - in-order
30 |     /// - out-of-order
31 |     ///
32 |     /// ## TODO
33 |     ///
34 |     /// * Use events to synchronize
35 |     pub(in frameworks::open_cl) queue: ocl::Queue,
36 | }
37 | 
38 | impl OpenCLDevice {
39 |     pub fn queue(&self) -> &ocl::Queue {
40 |         &self.queue
41 |     }
42 | }
43 | 
44 | impl ComputeDevice for OpenCLDevice { }
45 | 
46 | impl<T> Allocate<T> for OpenCLDevice  where T: TensorType + 'static {
47 |     fn allocate(&self, shape: &TensorShape) -> Result<Box<Memory<T>>> {
48 |         let ctx = &self.context;
49 |         let flags_opt = Some(ocl::flags::MEM_READ_WRITE);
50 |         let dims = ocl::SpatialDims::One(shape.capacity);
51 |         let host_data = None;
52 |         let buf: OpenCLBuf<T> = OpenCLBuf { 
53 |             buf: ocl::Buffer::new(ctx, flags_opt, dims, host_data)? 
54 |         };
55 |         let device = self.clone();
56 |         let memory = Box::new(OpenCLMemory {
57 |             buf,
58 |             device,
59 |         });
60 | 
61 |         return Ok(memory);
62 |     }
63 | }


--------------------------------------------------------------------------------
/src/frameworks/open_cl/error.rs:
--------------------------------------------------------------------------------
1 | use ocl::Error as OpenCLError;
2 | use error::{Error, ErrorKind};
3 | 
4 | impl From<OpenCLError> for Error {
5 |     /// Creates a new error from a known kind of error
6 |     fn from(e: OpenCLError) -> Error {
7 |         Error::new(ErrorKind::Framework(super::OpenCL::<()>::ID), ::std::error::Error::description(&e)) 
8 |     }
9 | }


--------------------------------------------------------------------------------
/src/frameworks/open_cl/framework.rs:
--------------------------------------------------------------------------------
  1 | use ocl;
  2 | use ocl::Platform as Implementation;
  3 | use ocl::enums::{DeviceInfo, DeviceInfoResult};
  4 | use ocl::flags::{DEVICE_TYPE_ACCELERATOR, DEVICE_TYPE_CPU, DEVICE_TYPE_GPU};
  5 | use std::marker::PhantomData;
  6 | 
  7 | use super::OpenCLContext;
  8 | use super::super::super::error::Result;
  9 | use super::super::super::framework::{Framework, FrameworkCtor};
 10 | use super::super::super::hardware::{Hardware, HardwareKind};
 11 | 
 12 | /// Provides the Open CL framework.
 13 | ///
 14 | /// # Flow
 15 | ///
 16 | /// Since multiple platforms can exist, the first available platform is selected during 
 17 | /// the initialization. A list of available devices are then provided for your choosing. Then,
 18 | /// the provided selection of devices are used to create a context, with a command queue for each
 19 | /// device. At this stage, a program(s) is compiled. A (host) program is essentially a collection 
 20 | /// of kernels. A kernel is the smallest unit of execution.
 21 | ///
 22 | /// In OpenCL, the host code can read in a kernel binary (i.e., compiled off-line) or a kernel 
 23 | /// source file (i.e., compile on-line). More information on on-line/off-line compilation can be
 24 | /// found [here][1]. Kernels are expensive to start, so they're typically used to do a large amount
 25 | /// of work. Memory allocated on an OpenCL device can be used when executing kernels, and then 
 26 | /// transfered back.
 27 | ///
 28 | /// Work-groups, a collection of work-items, are assigned to execute on compute-units. A work-item
 29 | /// is an instance of a kernel as runtime. That kernel instance is at a point in an index, which 
 30 | /// can be thought of as a grid and the work-groups which contain the work-items can be thought of 
 31 | /// as sub-grids within the grid. The work-groups can be defined explicitly or implicitly by 
 32 | /// simply specifying the number of work-items, both dealing with data parallelism. In terms of task
 33 | /// parallelism, kernels are executed independent of an index space.
 34 | /// It should also be noted that there are [built-in scalar data types][2] along with
 35 | /// [built-in functions][3].
 36 | ///
 37 | /// [1]: https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/online-offline-compilation/
 38 | /// [2]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/scalarDataTypes.html
 39 | /// [3]: https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/mathFunctions.html
 40 | #[derive(Debug)]
 41 | pub struct OpenCL<P> {
 42 |     /// A list of available devices for the first platform found.
 43 |     available_hardware: Vec<Hardware>,
 44 |     /// The specific Open CL implementation (e.g., AMD APP, NVIDIA or Intel Open CL)
 45 |     ///
 46 |     /// Platforms are defined by the implementation. Platforms enables the host to interact with 
 47 |     /// OpenCL-capable devices.
 48 |     pub(in frameworks::open_cl) implementation: Implementation,
 49 |     package: PhantomData<P>,
 50 | }
 51 | 
 52 | impl<P> OpenCL<P> {
 53 |     pub(in frameworks::open_cl) const ID: &'static str = "Open CL";
 54 | }
 55 | 
 56 | impl<P> Framework for OpenCL<P> where P: 'static {
 57 |     fn name(&self) -> &'static str {
 58 |         return OpenCL::<P>::ID;
 59 |     }
 60 | 
 61 |     fn hardware(&self) -> &[Hardware] {
 62 |         &self.available_hardware
 63 |     }
 64 | }
 65 | 
 66 | impl<P> FrameworkCtor for OpenCL<P> where P: 'static {
 67 |     type Context = OpenCLContext<P>;
 68 | 
 69 |     fn new() -> Result<Self> {
 70 |         let ignore_env_var = false;
 71 |         let implementation = Implementation::first(ignore_env_var)?;
 72 |         let devices = ocl::Device::list_all(implementation)?;
 73 | 
 74 |         let available_hardware = {
 75 | 
 76 |             devices.iter().enumerate()
 77 | 
 78 |             .filter(|&(_, d)| {
 79 |                 use ocl::enums::DeviceInfo::{MaxComputeUnits, Type};
 80 |                 use ocl::enums::DeviceInfoResult::Error;
 81 | 
 82 |                 let _1 = d.is_available().unwrap_or(false);
 83 |                 // let _2 = match d.info(Type) { Error(_) => false, _ => true };
 84 |                 // let _3 = match d.info(MaxComputeUnits) { Error(_) => false, _ => true };
 85 |                 // TODO
 86 | 
 87 |                 _1
 88 |             })
 89 | 
 90 |             .map(|(i, d)| {
 91 |                 
 92 |                 let kind = {
 93 |                     match d.info(DeviceInfo::Type) {
 94 |                         DeviceInfoResult::Type(t) => match t {
 95 |                             DEVICE_TYPE_ACCELERATOR => HardwareKind::Accelerator,
 96 |                             DEVICE_TYPE_CPU => HardwareKind::CPU,
 97 |                             DEVICE_TYPE_GPU => HardwareKind::GPU,
 98 |                             _ => HardwareKind::Unknown,
 99 |                         },
100 |                         _ => unreachable!(),
101 |                     }
102 |                 };
103 | 
104 |                 let compute_units = {
105 |                     match d.info(DeviceInfo::MaxComputeUnits) {
106 |                         DeviceInfoResult::MaxComputeUnits(n) => n as usize,
107 |                         _ => unreachable!(),
108 |                     }
109 |                 };
110 | 
111 |                 Hardware {
112 |                     id: i,
113 |                     framework: OpenCL::<P>::ID,
114 |                     kind,
115 |                     name: d.name(),
116 |                     compute_units,
117 |                 }
118 |             })
119 | 
120 |             .collect::<Vec<Hardware>>()
121 |         };
122 | 
123 |         Ok(OpenCL {  available_hardware, implementation, package: PhantomData })
124 |     }
125 | }


--------------------------------------------------------------------------------
/src/frameworks/open_cl/memory.rs:
--------------------------------------------------------------------------------
 1 | use ocl;
 2 | use super::OpenCLDevice;
 3 | use super::super::NativeMemory;
 4 | use super::super::super::compute_device::ComputeDevice;
 5 | use super::super::super::error::{ErrorKind, Result};
 6 | use super::super::super::memory::{Memory, TransferDirection};
 7 | use super::super::super::tensor::TensorType;
 8 | 
 9 | /// A `Memory` wraps around an OpenCL buffer id that manages its deallocation, named 
10 | /// as such for consistency's sake.
11 | ///
12 | /// Memory objects can be copied to host memory, from host memory, or to other memory objects.
13 | /// Copying from the host to a device is considered _writing_. Copying from a device to the host is
14 | /// considered _reading_.
15 | ///
16 | /// Unlike CUDA, OpenCL [buffers][1] are only context specific, not device specific. Also note: 
17 | /// currently, lazy allocation is used on the NVIDIA driver. That is, the buffer object, in a sense,
18 | /// is located _nowhere_ when allocated. It only exists when needed.
19 | ///
20 | /// [1]: https://goo.gl/S9B3TL
21 | #[derive(Clone, Debug)]
22 | pub struct OpenCLBuf<T> where T: TensorType {
23 |     pub(in super) buf: ocl::Buffer<T>,
24 | }
25 | 
26 | /// Memory representation for Open CL 
27 | pub struct OpenCLMemory<T> where T: TensorType {
28 |     pub(in super) buf: OpenCLBuf<T>,
29 |     pub(in super) device: OpenCLDevice,
30 | }
31 | 
32 | impl<T> Memory<T> for OpenCLMemory<T> where T: TensorType + 'static {
33 |     fn synchronized(&self, device: &ComputeDevice) -> bool {
34 |         if let Some(op) = device.downcast_ref::<OpenCLDevice>() {
35 |             (self.device.device == op.device) && (self.device.context.core() == op.context.core())
36 |         } else {
37 |             false
38 |         }
39 |     }
40 | 
41 |     fn transfer(&mut self, dir: TransferDirection, m: &mut Memory<T>) -> Result {
42 |         match dir {
43 |             TransferDirection::TransferIn => {
44 |                 if let Some(na) = m.downcast_ref::<NativeMemory<T>>() {
45 |                     let buffer_write_cmd = unsafe {
46 |                         self.buf.buf.write(
47 |                             na.0.as_slice_memory_order()
48 |                                 .expect("the array's data is not contiguous") // TODO
49 |                         )
50 |                         .queue(&self.device.queue)
51 |                         .block(true) // TODO
52 |                         .len(na.0.len())
53 |                     };
54 | 
55 |                     Ok(buffer_write_cmd.enq()?)
56 |                 } else {
57 |                     Err(ErrorKind::NoAvailableSynchronizationRouteFound.into())
58 |                 }
59 |             },
60 | 
61 |             TransferDirection::TransferOut => {
62 |                 if let Some(na) = m.downcast_mut::<NativeMemory<T>>() {
63 |                     let length = na.0.len();
64 | 
65 |                     let buffer_read_cmd = unsafe {
66 |                         self.buf.buf.read(
67 |                             na.0.as_slice_memory_order_mut()
68 |                                 .expect("the array's data is not contiguous") // TODO
69 |                         )
70 |                         .queue(&self.device.queue)
71 |                         .block(true) // TODO
72 |                         .len(length)
73 |                     };
74 | 
75 |                     Ok(buffer_read_cmd.enq()?)
76 |                 } else {
77 |                     Err(ErrorKind::NoAvailableSynchronizationRouteFound.into())
78 |                 }
79 |             }
80 |         }
81 |     }
82 | }
83 | 
84 | impl<T: TensorType> ::ocl::core::AsMem<T> for OpenCLMemory<T> {
85 |     fn as_mem(&self) -> &::ocl::core::Mem {
86 |         self.buf.buf.as_mem()
87 |     }
88 | }
89 | 
90 | unsafe impl<T: TensorType> ::ocl::core::MemCmdAll for OpenCLMemory<T> { }
91 | unsafe impl<'a, T: TensorType> ::ocl::core::MemCmdAll for &'a OpenCLMemory<T> { }
92 | unsafe impl<'a, T: TensorType> ::ocl::core::MemCmdAll for &'a mut OpenCLMemory<T> { }


--------------------------------------------------------------------------------
/src/frameworks/open_cl/mod.rs:
--------------------------------------------------------------------------------
 1 | pub use self::context::OpenCLContext;
 2 | pub use self::device::OpenCLDevice;
 3 | pub use self::framework::OpenCL;
 4 | pub use self::memory::{OpenCLBuf, OpenCLMemory};
 5 | 
 6 | mod context;
 7 | mod device;
 8 | mod error;
 9 | mod framework;
10 | mod memory;


--------------------------------------------------------------------------------
/src/hardware.rs:
--------------------------------------------------------------------------------
 1 | //! Hardware can be GPUs, multi-core CPUs or DSPs, Cell/B.E. processor or whatever else
 2 | //! is supported by the provided framework. The struct holds all important information about 
 3 | //! the hardware. To execute code on hardware, turn hardware into a [`ComputeDevice`].
 4 | //!
 5 | //! [`Device`]: [device]: ./compute_device/struct.Device.html
 6 | 
 7 | /// Representation for hardware across frameworks.
 8 | #[derive(Clone, Debug)]
 9 | pub struct Hardware {
10 |     /// The unique ID of the hardware.
11 |     pub id: usize,
12 |     /// Framework marker
13 |     pub framework: &'static str,
14 |     /// The type of compute device, such as a CPU or a GPU.
15 |     pub kind: HardwareKind,
16 |     /// The name.
17 |     pub name: String,
18 |     /// The number of compute units.
19 |     ///
20 |     /// A compute unit is the fundamental unit of computation. A compute device usually has 
21 |     /// multiple compute units.
22 |     pub compute_units: usize,
23 | }
24 | 
25 | /// General classes for devices, used to identify the type of a device.
26 | #[derive(Clone, Debug, Eq, Ord, PartialEq, PartialOrd)]
27 | pub enum HardwareKind {
28 |     /// Used for accelerators. Accelerators can communicate with host processor using a peripheral
29 |     /// interconnect such as PCIe.
30 |     Accelerator,
31 |     /// Used for cells.
32 |     Cell,
33 |     /// Used for devices that are host processors. The host processor runs the implementations
34 |     /// and is a single or multi-core CPU.
35 |     CPU,
36 |     /// Used for digital signal processors.
37 |     DSP,
38 |     /// Used for GPU devices.
39 |     GPU,
40 |     /// Used for anything else.
41 |     Unknown,
42 | }


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | //! Provides a simple, unified API for running highly parallel computations on different
  2 | //! devices across different GPGPU frameworks, allowing you to swap your backend at runtime.
  3 | //!
  4 | //! Parenchyma began as a hard fork of [Collenchyma], a now-defunct project started at [Autumn].
  5 | //!
  6 | //! ## Abstract
  7 | //!
  8 | //! Code is often executed on the CPU, but can be executed on other devices, such as GPUs
  9 | //! and accelerators. These devices are accessible through GPGPU frameworks. Most interfaces are 
 10 | //! complicated, making the use of these devices a painful experience. Some of the pain points when 
 11 | //! writing such code for a particular device are:
 12 | //!
 13 | //! * portability: not only do frameworks have different interfaces, devices support different 
 14 | //! versions and machines might have different hardware - all of this leads to code that will be 
 15 | //! executable only on a very specific set of machines and platforms.
 16 | //! * learning curve: executing code on a device through a framework is quite different to
 17 | //! running code on the native CPU and comes with a lot of hurdles. OpenCL's 1.2 specification, for
 18 | //! example, has close to 400 pages.
 19 | //! * custom code: integrating support for devices into your project requires the need for writing
 20 | //! a lot of low-level code, e.g., kernels, memory management, and general business logic.
 21 | //!
 22 | //! Writing code for non-CPU devices is often a good choice, as these devices can execute
 23 | //! operations a lot faster than native CPUs. GPUs, for example, can execute operations roughly
 24 | //! one to two orders of magnitudes faster, thanks to better support of parallelizing operations.
 25 | //!
 26 | //! Parenchyma eliminates the pain points of writing device code, so you can run your code like any 
 27 | //! other code without needing to learn about kernels, events, or memory synchronization. Parenchyma
 28 | //! also allows you to deploy your code with ease to servers, desktops and mobile device, all while 
 29 | //! enabling your code to make full use of the underlying hardware. 
 30 | //!
 31 | //! ## Architecture
 32 | //!
 33 | //! The single entry point of Parenchyma is a [Backend](./struct.Backend.html). A
 34 | //! backend is agnostic over the device it runs operations on. In order to be agnostic over the 
 35 | //! device, such as native host CPU, GPUs, accelerators or any other devices, the backend needs to 
 36 | //! be agnostic over the framework as well. The framework is important, as it provides the interface 
 37 | //! to execute operations on devices, among other things. Since different vendors of hardware use 
 38 | //! different frameworks, it becomes important that the backend is agnostic over the framework.
 39 | //! This allows us to run computations on any machine without having to worry about hardware 
 40 | //! availability, which gives us the freedom to write code once and deploy it on different machines 
 41 | //! where it will execute on the most potent hardware by default.
 42 | //!
 43 | //! ### Frameworks
 44 | //!
 45 | //! The default framework is simply the host CPU for common computation. To make use of other
 46 | //! devices such as GPUs, you may choose a GPGPU framework (such as OpenCL or CUDA) to access the 
 47 | //! processing capabilities of the device(s).
 48 | //!
 49 | //! ### Extensions
 50 | //!
 51 | //! Operations are introduced by a Parenchyma extension. An extension extends your backend with 
 52 | //! ready-to-execute operations. All you need to do is add the Parenchyma extension crate(s)
 53 | //! to your `Cargo.toml` file alongside the Parenchyma crate. Your backend will then be extended with
 54 | //! operations provided by the extension(s). The interface is simply the language you're using to 
 55 | //! work with Parenchyma. For example, you'd simply call `backend.dot(..)` using Rust-lang and 
 56 | //! a BLAS extension. Whether or not the dot operation is executed on one GPU, multiple GPUS or on 
 57 | //! a CPU device depends solely on how you configured the backend.
 58 | //!
 59 | //! ### Packages
 60 | //!
 61 | //! The concept of Parenchyma extensions has one more component - the [Package](./trait.ExtensionPackage.html)
 62 | //! trait. As opposed to executing code on the native CPU, other devices need to compile and build 
 63 | //! the extension manually at runtime which makes up a significant part of a framework. We need an
 64 | //! instance that's able to be initialized at runtime for holding the sate and compiled 
 65 | //! operations - which is the package's main purpose.
 66 | //!
 67 | //! ### Memory
 68 | //!
 69 | //! The last piece of Parenchyma is the memory. An operation happens over data, but this data needs
 70 | //! to be accessible to the device on which the operation is executed. That memory space needs to be
 71 | //! allocated on the device and then, in a later step, synced from the host to the device or from
 72 | //! the device back to the host. Thanks to the [Tensor](./struct.SharedTensor.html) type, we do not
 73 | //! have to care about memory management between devices for the execution of operations. The tensor
 74 | //! tracks and automatically manages data and its memory across devices, which is often the host and
 75 | //! the device. Memory can also be passed around to different backends. Operations take tensors
 76 | //! as arguments while handling the synchronization and allocation for you.
 77 | //!
 78 | //! ## Development
 79 | //!
 80 | //! At the moment, Parenchyma itself will provide Rust APIs for the important 
 81 | //! frameworks - OpenCL and CUDA.
 82 | //!
 83 | //! If a framework isn't specified, the backend will try to use the most potent framework given
 84 | //! the underlying hardware - which would probably be in this order: CUDA -> OpenCL -> Native. The 
 85 | //! process might take longer, as every framework needs to be checked and devices need to be loaded 
 86 | //! in order to identify the best setup. The time it takes to go through that process is a 
 87 | //! reasonable compromise as it would allow you to deploy a Parenchyma-backed application to almost 
 88 | //! any machine - server, desktops, mobiles, etc.
 89 | //!
 90 | //! ## Notes
 91 | //!
 92 | //! The `array` macro is re-exported as a way to allow the end-user to work directly with the 
 93 | //! crate instead of manually having to add `ndarray` for this single macro.
 94 | //!
 95 | //! [Collenchyma]: https://github.com/autumnai/collenchyma
 96 | //! [Autumn]: https://github.com/autumnai
 97 | #![feature(box_syntax, crate_in_paths, get_type_id, non_modrs_mods, unsize, use_extern_macros)]
 98 | 
 99 | #[macro_use]
100 | extern crate log;
101 | #[macro_use(array)]
102 | extern crate ndarray;
103 | extern crate num;
104 | extern crate ocl;
105 | 
106 | pub use self::ndarray::array;
107 | 
108 | pub mod backend;
109 | pub mod changelog;
110 | pub mod compute_device;
111 | pub mod context;
112 | pub mod error;
113 | pub mod extension_package;
114 | pub mod framework;
115 | pub mod frameworks;
116 | pub mod hardware;
117 | pub mod memory;
118 | pub mod tensor;
119 | 
120 | pub mod prelude {
121 |     pub use super::backend::Backend;
122 |     pub use super::framework::{Framework, FrameworkCtor};
123 |     pub use super::tensor::{IntoTensor, SharedTensor, TensorShape};
124 | }


--------------------------------------------------------------------------------
/src/memory.rs:
--------------------------------------------------------------------------------
  1 | //! Provides a unified representation of memory across different frameworks.
  2 | 
  3 | use std::any::{Any, TypeId};
  4 | use super::compute_device::ComputeDevice;
  5 | use super::error::{ErrorKind, Result};
  6 | 
  7 | // TODO
  8 | // pub struct Stacked<'p, T> { data: T, marker: PhantomData<&'p mut &'a ()> }
  9 | // pub struct Boxed<T> { data: Box<T> }
 10 | 
 11 | /// The transfer direction
 12 | pub enum TransferDirection {
 13 |     /// Transfer data 
 14 |     TransferIn,
 15 |     /// Transfer data out (i.e., _dumps_ data)
 16 |     TransferOut,
 17 | }
 18 | 
 19 | /// The struct `Memory` has generic type parameters representing memory and its location as seen 
 20 | /// by the shared tensor.
 21 | ///
 22 | /// **notes**:
 23 | ///
 24 | /// * The words _buf_ and _memory_ are used here (until a better term comes to 
 25 | /// mind (candidates: _allocation_, _partition_, etc.)) for the sake of simplification.
 26 | /// * Each framework handles memory allocation differently (e.g., OpenCL allocates memory _lazily_ 
 27 | /// and isn't associated with any device within the context, even after it's used). 
 28 | /// * Downcast methods are provided, but normally you will want to use a [`SharedTensor`] which 
 29 | /// handles synchronization of the latest memory copy to the required device.
 30 | pub trait Memory<T>: Any {
 31 |     /// Specifies synchronization behavior for keeping data consistent across frameworks and contexts.
 32 |     ///
 33 |     /// **note**
 34 |     ///
 35 |     /// _Synch_ shouldn't be confused with the marker type `Sync` found in the standard library. 
 36 |     /// The less common abbreviation for _synchronize_ (the extra _h_) is used here to 
 37 |     /// avoid confusion.
 38 |     ///
 39 |     /// The `transfer` method handles the asynchronous data transfer behavior across 
 40 |     /// frameworks and contexts.
 41 |     ///
 42 |     // # TODO: Transfer Matrix/Routes
 43 |     //
 44 |     // Host-GPU: Host <-> GPU
 45 |     // GPU-GPU: GPU -> HOST -> GPU
 46 |     //
 47 |     // ```{.text}
 48 |     // opencl device (context `a`) -> opencl device (context `b`) = true
 49 |     // opencl device -> native/host = true
 50 |     // opencl <-> cuda = false
 51 |     // native/host -> native/host = true
 52 |     // native/host -> cuda/opencl = false
 53 |     // ```
 54 |     fn transfer(&mut self, TransferDirection, &mut Memory<T>) -> Result {
 55 |         Err(ErrorKind::NoAvailableSynchronizationRouteFound.into())
 56 |     }
 57 |     /// Determines whether or not the memory is allocated or pinned on the `backend`'s active device.
 58 |     ///
 59 |     /// # Arguments
 60 |     ///
 61 |     /// * `compute_device` - The computation device.
 62 |     ///
 63 |     /// **note**:
 64 |     ///
 65 |     /// Certain frameworks have a concept of _shared memory_, where the location of the 
 66 |     /// memory is omnipresent (in a very abstract sense) as long as the devices are within the same
 67 |     /// context.
 68 |     #[allow(unused_variables)]
 69 |     fn synchronized(&self, compute_device: &ComputeDevice) -> bool {
 70 |         return false;
 71 |     }
 72 | }
 73 | 
 74 | impl<T: 'static> Memory<T> {
 75 |     /// Returns `true` if the boxed type is the same as `T`.
 76 |     #[inline]
 77 |     pub fn is<M: Memory<T>>(&self) -> bool {
 78 |         // Get TypeId of the type this function is instantiated with
 79 |         let t = TypeId::of::<M>();
 80 | 
 81 |         // Get TypeId of the type in the trait object
 82 |         let boxed = self.get_type_id();
 83 | 
 84 |         // Compare both TypeIds on equality
 85 |         t == boxed
 86 |     }
 87 | 
 88 |     /// Returns some reference to the boxed value if it is of type `T`, or
 89 |     /// `None` if it isn't.
 90 |     #[inline]
 91 |     pub fn downcast_ref<M: Memory<T>>(&self) -> Option<&M> {
 92 |         if self.is::<M>() {
 93 |             unsafe {
 94 |                 Some(&*(self as *const Memory<T> as *const M))
 95 |             }
 96 |         } else {
 97 |             None
 98 |         }
 99 |     }
100 | 
101 |     /// Returns some mutable reference to the boxed value if it is of type `T`, or
102 |     /// `None` if it isn't.
103 |     #[inline]
104 |     pub fn downcast_mut<M: Memory<T>>(&mut self) -> Option<&mut M> {
105 |         if self.is::<M>() {
106 |             unsafe {
107 |                 Some(&mut *(self as *mut Memory<T> as *mut M))
108 |             }
109 |         } else {
110 |             None
111 |         }
112 |     }
113 | }


--------------------------------------------------------------------------------
/src/tensor/into_tensor.rs:
--------------------------------------------------------------------------------
 1 | use ndarray::{Array, Dimension};
 2 | use std::cell::RefCell;
 3 | 
 4 | use super::{SharedTensor, TensorMap, TensorShape};
 5 | use super::super::memory::Memory;
 6 | use super::super::frameworks::NativeMemory;
 7 | 
 8 | pub trait IntoTensor<T> {
 9 |     fn into_tensor(self) -> SharedTensor<T>;
10 | }
11 | 
12 | impl<T: 'static, D> IntoTensor<T> for Array<T, D> where D: Dimension {
13 |     fn into_tensor(self) -> SharedTensor<T> {
14 |         SharedTensor::<T>::from(self)
15 |     }
16 | }
17 | 
18 | impl<T, Dim> From<Array<T, Dim>> for SharedTensor<T> where 
19 |     T: 'static,
20 |     Dim: Dimension {
21 | 
22 |     fn from(array: Array<T, Dim>) -> Self {
23 |         if !array.is_standard_layout() {
24 |             panic!("Array data must be laid out in contiguous “C order” in memory");
25 |         }
26 | 
27 |         let shape = TensorShape::from(array.shape());
28 |         let n = NativeMemory(array.into_dyn());
29 | 
30 |         let memories = RefCell::new(vec![
31 |             Box::new(n) as Box<Memory<T>>
32 |         ]);
33 |         
34 |         let synch_map = TensorMap::with(1 << 0);
35 |         SharedTensor { memories, shape, synch_map }
36 |     }
37 | }


--------------------------------------------------------------------------------
/src/tensor/tensor_map.rs:
--------------------------------------------------------------------------------
 1 | use std::cell::Cell;
 2 | 
 3 | /// A "newtype" with an internal type of `Cell<u64>`. `TensorMap` uses [bit manipulation][1] to manage 
 4 | /// memory versions.
 5 | ///
 6 | /// [1]: http://stackoverflow.com/a/141873/2561805
 7 | #[allow(non_camel_case_types)]
 8 | #[derive(Debug)]
 9 | pub(in super) struct TensorMap(Cell<u64>);
10 | 
11 | impl TensorMap {
12 |     /// The maximum number of bits in the bit map can contain.
13 |     pub const CAPACITY: usize = 64;
14 | 
15 |     /// Constructs a new `TensorMap`.
16 |     pub(in super) fn new() -> TensorMap {
17 |         TensorMap::with(0)
18 |     }
19 | 
20 |     /// Constructs a new `TensorMap` with the supplied `n`.
21 |     pub(in super) fn with(n: u64) -> TensorMap {
22 |         TensorMap(Cell::new(n))
23 |     }
24 | 
25 |     // fn get(&self) -> u64 {
26 |     //     self.0.get()
27 |     // }
28 | 
29 |     pub(in super) fn set(&self, v: u64) {
30 |         self.0.set(v)
31 |     }
32 | 
33 |     pub(in super) fn empty(&self) -> bool {
34 |         self.0.get() == 0
35 |     }
36 | 
37 |     pub(in super) fn insert(&self, k: usize) {
38 |         self.0.set(self.0.get() | (1 << k))
39 |     }
40 | 
41 |     pub(in super) fn contains(&self, k: usize) -> bool {
42 |         k < Self::CAPACITY && (self.0.get() & (1 << k) != 0)
43 |     }
44 | 
45 |     pub(in super) fn latest(&self) -> u32 {
46 |         self.0.get().trailing_zeros()
47 |     }
48 | }


--------------------------------------------------------------------------------
/src/tensor/tensor_memories.rs:
--------------------------------------------------------------------------------
1 | use std::cell::RefCell;
2 | use super::super::memory::Memory;
3 | 
4 | pub type TensorMemories<T> = RefCell<Vec<Box<Memory<T>>>>;


--------------------------------------------------------------------------------
/src/tensor/tensor_shape.rs:
--------------------------------------------------------------------------------
  1 | use super::super::error::{Error, ErrorKind, Result};
  2 | 
  3 | /// Describes the shape of a tensor.
  4 | ///
  5 | /// **note**: `From` conversion implementations are provided for low-rank shapes.
  6 | #[derive(Clone, Debug, Eq, PartialEq)]
  7 | pub struct TensorShape {
  8 |     /// The number of components the associated tensor can store.
  9 |     ///
 10 |     /// # Example
 11 |     ///
 12 |     /// ```{.text}
 13 |     /// // The following tensor has 9 components
 14 |     ///
 15 |     /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
 16 |     /// ```
 17 |     pub capacity: usize,
 18 |     /// A list of numbers with each representing the dimension at each index.
 19 |     ///
 20 |     /// # Example
 21 |     ///
 22 |     /// The following tensor has a shape of `[2, 1]`:
 23 |     ///
 24 |     /// ```{.text}
 25 |     /// [[a], [b]]
 26 |     /// ```
 27 |     pub dimsizes: Vec<usize>,
 28 |     // /// The stride tells the tensor how to interpret its flattened representation.
 29 |     // stride: Vec<usize>,
 30 | }
 31 | 
 32 | impl TensorShape {
 33 |     /// Checks that the shape of the provided `data` is compatible.
 34 |     pub fn check<T>(&self, data: &[T]) -> Result {
 35 |         if self.capacity != data.len() {
 36 |             let message = format!(
 37 |                 "TODO: incompatible shape. Capacity = {}, Length = {}", 
 38 |                 self.capacity, 
 39 |                 data.len());
 40 |             let kind = ErrorKind::IncompatibleShape;
 41 |             let e = Error::new(kind, message);
 42 | 
 43 |             return Err(e);
 44 |         }
 45 | 
 46 |         Ok(())
 47 |     }
 48 | 
 49 |     /// Returns the `dimensions`.
 50 |     pub fn dimensions(&self) -> &[usize] {
 51 |         &self.dimsizes
 52 |     }
 53 | 
 54 |     /// Returns the number of elements the tensor can hold without reallocating.
 55 |     pub fn capacity(&self) -> usize {
 56 |         self.capacity
 57 |     }
 58 | 
 59 |     /// Returns the total number of indices required to identify each component uniquely (i.e, the
 60 |     /// tensor's rank, degree, or order).
 61 |     ///
 62 |     /// # Example
 63 |     ///
 64 |     /// The following tensor has a rank of 2:
 65 |     ///
 66 |     /// ```{.text}
 67 |     /// [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
 68 |     /// ```
 69 |     pub fn rank(&self) -> usize {
 70 |         self.dimsizes.len()
 71 |     }
 72 | }
 73 | 
 74 | impl From<Vec<usize>> for TensorShape {
 75 | 
 76 |     fn from(vector: Vec<usize>) -> TensorShape {
 77 | 
 78 |         TensorShape {
 79 |             capacity: vector.iter().fold(1, |acc, &dims| acc * dims),
 80 |             dimsizes: vector,
 81 |         }
 82 |     }
 83 | }
 84 | 
 85 | impl<'slice> From<&'slice [usize]> for TensorShape {
 86 | 
 87 |     fn from(slice: &[usize]) -> TensorShape {
 88 |         TensorShape {
 89 |             capacity: slice.iter().fold(1, |acc, &dims| acc * dims),
 90 |             dimsizes: slice.to_owned(),
 91 |         }
 92 |     }
 93 | }
 94 | 
 95 | impl From<usize> for TensorShape {
 96 | 
 97 |     fn from(dimensions: usize) -> TensorShape {
 98 |         TensorShape {
 99 |             capacity: dimensions,
100 |             dimsizes: vec![dimensions],
101 |         }
102 |     }
103 | }
104 | 
105 | macro_rules! shape {
106 |     ($($length:expr),*) => ($(impl From<[usize; $length]> for TensorShape {
107 |         fn from(array: [usize; $length]) -> TensorShape {
108 | 
109 |             TensorShape {
110 |                 capacity: array.iter().fold(1, |acc, &dims| acc * dims),
111 |                 dimsizes: array.to_vec(),
112 |             }
113 |         }
114 |     })*)
115 | }
116 | 
117 | shape!(0, 1, 2, 3, 4, 5, 6);


--------------------------------------------------------------------------------
/src/tensor/tensor_type.rs:
--------------------------------------------------------------------------------
 1 | use ocl::traits::OclPrm as PrimitiveType;
 2 | 
 3 | /// A marker trait implemented by primitive types that usable within kernels.
 4 | pub trait TensorType: PrimitiveType {
 5 |     // ..
 6 | }
 7 | 
 8 | impl<T: PrimitiveType> TensorType for T {
 9 |     // ..
10 | }


--------------------------------------------------------------------------------
/src/tensor/utility.rs:
--------------------------------------------------------------------------------
 1 | use std::mem;
 2 | 
 3 | pub(in super) unsafe fn extend_lifetime<'a, 'b, T>(t: &'a T) -> &'b T 
 4 |     where T: ?Sized {
 5 | 
 6 |     mem::transmute::<&'a T, &'b T>(t)
 7 | }
 8 | 
 9 | pub(in super) unsafe fn extend_lifetime_mut<'a, 'b, T>(t: &'a mut T) -> &'b mut T 
10 |     where T: ?Sized {
11 | 
12 |     mem::transmute::<&'a mut T, &'b mut T>(t)
13 | }


--------------------------------------------------------------------------------
/tests/backend_specs.rs:
--------------------------------------------------------------------------------
 1 | extern crate parenchyma;
 2 | 
 3 | #[cfg(test)]
 4 | mod backend_spec {
 5 |     mod native {
 6 |         use std::rc::Rc;
 7 |         use parenchyma::backend::Backend;
 8 |         use parenchyma::frameworks::Native;
 9 | 
10 |         #[test]
11 |         fn it_can_create_default_backend() {
12 |             let backend: Result<Backend, _> = Backend::new::<Native>();
13 |             assert!(backend.is_ok());
14 |         }
15 | 
16 |         #[test]
17 |         fn it_can_use_ibackend_trait_object() {
18 |             let backend: Rc<Backend> = Rc::new(Backend::new::<Native>().unwrap());
19 |             use_ibackend(backend);
20 |         }
21 | 
22 |         fn use_ibackend(backend: Rc<Backend>) {
23 |             let backend: Rc<Backend> = backend.clone();
24 |         }
25 |     }
26 | 
27 |     // #[cfg(feature = "cuda")]
28 |     // mod cuda {
29 |     //     use co::*;
30 |     //     #[test]
31 |     //     fn it_can_create_default_backend() {
32 |     //         assert!(Backend::new::<Cuda>().is_ok());
33 |     //     }
34 |     // }
35 | 
36 |     // mod opencl {
37 |     //     //use parenchyma::{Backend, Framework, FrameworkCtor, OpenCL};
38 |     //     use parenchyma::backend::Backend;
39 |     //     use parenchyma::frameworks::OpenCL;
40 |     //     use parenchyma::prelude::*;
41 | 
42 |     //     #[test]
43 |     //     fn it_can_create_default_backend() {
44 |     //         let backend: Result<Backend, _> = Backend::new::<OpenCL>();
45 |     //         assert!(backend.is_ok());
46 |     //     }
47 | 
48 |     //     #[test]
49 |     //     fn it_can_manually_create_backend() {
50 |     //         let framework = OpenCL::new().unwrap();
51 |     //         let hardware = framework.hardware().to_vec();
52 |     //         let backend: Backend = Backend::with(framework, hardware).unwrap();
53 |     //         println!("{:?}", backend);
54 |     //     }
55 |     // }
56 | }


--------------------------------------------------------------------------------
/tests/framework_native_specs.rs:
--------------------------------------------------------------------------------
 1 | extern crate parenchyma;
 2 | 
 3 | #[cfg(test)]
 4 | mod framework_native_spec {
 5 |     use parenchyma::frameworks::Native;
 6 |     use parenchyma::prelude::{Framework, FrameworkCtor};
 7 | 
 8 |     #[test]
 9 |     fn it_works() {
10 |         let framework: Native = Native::new().unwrap();
11 |         assert_eq!(framework.hardware().len(), 1);
12 |     }
13 | }


--------------------------------------------------------------------------------
/tests/shared_memory_specs.rs:
--------------------------------------------------------------------------------
 1 | // extern crate parenchyma as pa;
 2 | 
 3 | // #[cfg(test)]
 4 | // mod shared_memory_spec {
 5 | //     use pa::{Backend, ErrorKind, Memory, Native, OpenCL, SharedTensor};
 6 | 
 7 | //     pub fn write(memory: &mut Memory<f32>, data: &[f32]) {
 8 | //         let ndarray = unsafe { memory.as_mut_native_unchecked() };
 9 | //         let buf = ndarray.as_slice_memory_order_mut().unwrap();
10 | 
11 | //         for (index, datum) in data.iter().enumerate() {
12 | //             buf[index] = *datum;
13 | //         }
14 | //     }
15 | 
16 | //     #[test]
17 | //     fn it_creates_new_shared_memory_for_native() {
18 | //         let ref host: Backend = Backend::new::<Native>().unwrap();
19 | //         let mut shared_data = SharedTensor::<f32>::new(10);
20 | //         let tensor = shared_data.write(host).unwrap();
21 | //         assert_eq!(tensor.as_native().unwrap().len(), 10);
22 | //     }
23 | 
24 | //     #[test]
25 | //     //#[cfg(feature = "opencl")]
26 | //     fn it_creates_new_shared_memory_for_opencl() {
27 | //         let ref backend: Backend = Backend::new::<OpenCL>().unwrap();
28 | //         let mut shared_data: SharedTensor = SharedTensor::new(10);
29 | //         assert!(shared_data.write(backend).is_ok());
30 | //     }
31 | 
32 | //     #[test]
33 | //     fn it_fails_on_initialized_memory_read() {
34 | //         let ref host: Backend = Backend::new::<Native>().unwrap();
35 | //         let mut shared_data = SharedTensor::<f32>::new(10);
36 | //         assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
37 | //         assert_eq!(shared_data.read_write(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
38 | 
39 | //         // initialize memory
40 | //         let _ = shared_data.write(host).unwrap();
41 | //         let _ = shared_data.dealloc(host).unwrap();
42 | 
43 | //         assert_eq!(shared_data.read(host).unwrap_err().kind(), ErrorKind::UninitializedMemory);
44 | //     }
45 | 
46 | //     #[test]
47 | //     //#[cfg(feature = "opencl")]
48 | //     fn it_syncs_from_native_to_opencl_and_back() {
49 | //         let ref host: Backend = Backend::new::<Native>().unwrap();
50 | //         let ref backend: Backend = Backend::new::<OpenCL>().unwrap();
51 | 
52 | //         let mut sh = SharedTensor::<f32>::new(3);
53 | //         write(sh.write(host).unwrap(), &[1.0f32, 2.0, 123.456]);
54 | //         let _ = sh.read(backend).unwrap();
55 | 
56 | //         // It has not successfully synced to the device.
57 | //         // Not the other way around.
58 | 
59 | //         //let _ = sh.dealloc(host).unwrap();// TODO ?
60 | //         let _ = sh.dealloc(backend).unwrap();
61 | 
62 | //         assert_eq!(
63 | //             sh.read(host).unwrap().as_native().unwrap().as_slice_memory_order().unwrap(), 
64 | //             [1.0, 2.0, 123.456]
65 | //         );
66 | //     }
67 | 
68 | //     #[test]
69 | //     fn it_reshapes_correctly() {
70 | //         let mut shared_data = SharedTensor::<f32>::new(10);
71 | //         assert!(shared_data.reshape([5, 2]).is_ok());
72 | //     }
73 | 
74 | //     #[test]
75 | //     fn it_returns_err_for_invalid_size_reshape() {
76 | //         let mut shared_data = SharedTensor::<f32>::new(10);
77 | //         assert!(shared_data.reshape([10, 2]).is_err());
78 | //     }
79 | // }


--------------------------------------------------------------------------------