├── .gitignore
├── src
│   ├── cl
│   │   ├── math.cl
│   │   ├── slice_ops.cl
│   │   └── main.cl
│   ├── num.rs
│   ├── lib.rs
│   ├── helper.rs
│   ├── context.rs
│   ├── range_arg.rs
│   ├── array.rs
│   ├── tensor.rs
│   ├── kernels.rs
│   └── ops.rs
├── Cargo.toml
├── examples
│   ├── matmul.rs
│   ├── transpose.rs
│   ├── operation_composition.rs
│   └── add_slice.rs
├── LICENSE
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | Cargo.lock
3 | *.swp
4 |
--------------------------------------------------------------------------------
/src/cl/math.cl:
--------------------------------------------------------------------------------
1 | float sigmoid(float z){return 1.0/(1.0+exp(-z));}
2 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "gpuarray"
3 | version = "0.1.0"
4 | authors = ["Theodore DeRego "]
5 |
6 | [dependencies.opencl]
7 | git = "https://github.com/tedsta/rust-opencl"
8 |
9 | [dependencies]
10 | libc = "^0.1.8"
11 |
--------------------------------------------------------------------------------
/src/num.rs:
--------------------------------------------------------------------------------
1 | use opencl::hl::KernelArg;
2 |
3 | pub trait Num: KernelArg+Copy+'static { }
4 |
5 | impl Num for f32 { }
6 | //impl Num for f64 { }
7 | //impl Num for i8 { }
8 | //impl Num for i16 { }
9 | impl Num for i32 { }
10 | impl Num for i64 { }
11 | //impl Num for u8 { }
12 | //impl Num for u16 { }
13 | impl Num for u32 { }
14 | impl Num for u64 { }
15 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | extern crate opencl;
2 | extern crate libc;
3 |
4 | pub use context::Context;
5 | pub use array::Array;
6 | pub use tensor::{Event, Tensor, TensorMode};
7 | pub use ops::*;
8 | pub use range_arg::RangeArg;
9 |
10 | pub mod array;
11 | pub mod context;
12 | pub mod kernels;
13 | pub mod num;
14 | #[macro_use] pub mod range_arg;
15 | pub mod ops;
16 | pub mod tensor;
17 |
18 | mod helper;
19 |
--------------------------------------------------------------------------------
/src/helper.rs:
--------------------------------------------------------------------------------
1 | pub fn compute_dim_steps(shape: &[usize]) -> Vec<usize> {
2 |     let mut dim_steps = vec![0; shape.len()];
3 |     dim_steps[shape.len()-1] = 1;
4 |     for i in 1..shape.len() {
5 |         let cur_index = shape.len()-i-1;
6 |         dim_steps[cur_index] = shape[cur_index+1]*dim_steps[cur_index+1];
7 |     }
8 |     dim_steps
9 | }
10 |
11 | #[test]
12 | fn test_compute_dim_steps() {
13 |     assert!(compute_dim_steps(&[2, 3, 4]) == &[12, 4, 1]);
14 | }
15 |
--------------------------------------------------------------------------------
/examples/matmul.rs:
--------------------------------------------------------------------------------
1 | extern crate gpuarray as ga;
2 |
3 | use ga::Context;
4 | use ga::tensor::{Tensor, TensorMode};
5 | use ga::array::Array;
6 |
7 | fn main() {
8 |     let ref ctx = Context::new();
9 |
10 |     let a = Array::from_vec(vec![5, 10], (0..5*10).map(|x| x as f32).collect());
11 |     let b = Array::from_vec(vec![10, 15], (0..10*15).map(|x| (x as f32)*2.0).collect());
12 |
13 |     let a_gpu = Tensor::from_array(ctx, &a, TensorMode::In);
14 |     let b_gpu = Tensor::from_array(ctx, &b, TensorMode::In);
15 |     let c_gpu: Tensor<f32> = Tensor::new(ctx, vec![5, 15], TensorMode::Mut);
16 |
17 |     ga::matmul(ctx, &a_gpu, &b_gpu, &c_gpu);
18 |
19 |     let c = c_gpu.get(ctx);
20 |
21 |     println!("A = \n{:?}", a);
22 |     println!("B = \n{:?}", b);
23 |     println!("A*B = \n{:?}", c);
24 | }
25 |
--------------------------------------------------------------------------------
/examples/transpose.rs:
--------------------------------------------------------------------------------
1 | extern crate gpuarray as ga;
2 |
3 | use ga::Context;
4 | use ga::tensor::{Tensor, TensorMode};
5 | use ga::array::Array;
6 |
7 | fn main() {
8 |     let ref ctx = Context::new();
9 |
10 |     let a = Array::from_vec(vec![100, 1000], (0..100*1000).map(|x| x as f32).collect());
11 |     let b = Array::from_vec(vec![100, 1000], (0..100*1000).map(|x| (x as f32)*2.0).collect());
12 |
13 |     let a_gpu = Tensor::from_array(ctx, &a, TensorMode::In);
14 |     let b_gpu = Tensor::from_array(ctx, &b, TensorMode::In);
15 |     let c_gpu = Tensor::new(ctx, vec![100, 1000], TensorMode::Mut); // intermediate result, so it must be writable
16 |     let d_gpu: Tensor<f32> = Tensor::new(ctx, vec![1000, 100], TensorMode::Out);
17 |
18 |     ga::add(ctx, &a_gpu, -1, &b_gpu, &c_gpu);
19 |     ga::transpose(ctx, &c_gpu, &d_gpu);
20 |
21 |     let d = d_gpu.get(ctx);
22 |
23 |     for i in 0..100 {
24 |         for j in 0..1000 {
25 |             assert!(d[&[j, i]] == a[&[i, j]] + b[&[i, j]]);
26 |         }
27 |     }
28 | }
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 Theodore DeRego
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # gpuarray-rs
2 |
3 | Make use of GPU-powered array operations from Rust! Gpuarray-rs uses OpenCL but hides all the details. Still mostly a proof of concept.
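The programs under `examples/` can be run with Cargo in the usual way (e.g. `cargo run --example matmul`); a working OpenCL driver for your GPU or CPU is assumed to be installed.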
4 |
5 | ### Example
6 |
7 | Matrix multiplication
8 |
9 | ```Rust
10 | extern crate gpuarray as ga;
11 |
12 | use ga::Context;
13 | use ga::tensor::{Tensor, TensorMode};
14 | use ga::array::Array;
15 |
16 | fn main() {
17 |     let ref ctx = Context::new();
18 |
19 |     let a = Array::from_vec(vec![5, 10], (0..5*10).map(|x| x as f32).collect());
20 |     let b = Array::from_vec(vec![10, 15], (0..10*15).map(|x| (x as f32)*2.0).collect());
21 |
22 |     let a_gpu = Tensor::from_array(ctx, &a, TensorMode::In);
23 |     let b_gpu = Tensor::from_array(ctx, &b, TensorMode::In);
24 |     let c_gpu: Tensor<f32> = Tensor::new(ctx, vec![5, 15], TensorMode::Mut);
25 |
26 |     ga::matmul(ctx, &a_gpu, &b_gpu, &c_gpu);
27 |
28 |     let c = c_gpu.get(ctx);
29 |
30 |     println!("A = \n{:?}", a);
31 |     println!("B = \n{:?}", b);
32 |     println!("A*B = \n{:?}", c);
33 | }
34 | ```
35 |
36 | ### License
37 |
38 | MIT
39 |
--------------------------------------------------------------------------------
/examples/operation_composition.rs:
--------------------------------------------------------------------------------
1 | extern crate gpuarray as ga;
2 |
3 | use ga::Context;
4 | use ga::tensor::{Tensor, TensorMode};
5 | use ga::array::Array;
6 |
7 | fn main() {
8 |     let ref ctx = Context::new();
9 |
10 |     let a = Array::from_vec(vec![5, 15], (0..5*15).map(|x| x as f32).collect());
11 |     let b = Array::from_vec(vec![15, 10], (0..15*10).map(|x| (x as f32)*2.0).collect());
12 |     let c = Array::from_vec(vec![5, 10], vec![1.0; 5*10]);
13 |
14 |     let a_gpu = Tensor::from_array(ctx, &a, TensorMode::In);
15 |     let b_gpu = Tensor::from_array(ctx, &b, TensorMode::In);
16 |     let c_gpu = Tensor::from_array(ctx, &c, TensorMode::In);
17 |     // Our intermediate result must be TensorMode::Mut
18 |     let d_gpu: Tensor<f32> = Tensor::new(ctx, vec![5, 10], TensorMode::Mut);
19 |     let e_gpu: Tensor<f32> = Tensor::new(ctx, vec![5, 10], TensorMode::Out);
20 |
21 |     ga::matmul(ctx, &a_gpu, &b_gpu, &d_gpu);
22 |     ga::add(ctx, &d_gpu, -1, &c_gpu, &e_gpu);
23 |
24 |     let d = d_gpu.get(ctx);
25 |     let e = e_gpu.get(ctx);
26 |
27 |     println!("A = \n{:?}", a);
28 |     println!("B = \n{:?}", b);
29 |     println!("C = \n{:?}", c);
30 |     println!("D = A * B = \n{:?}", d);
31 |     println!("D + C = \n{:?}", e);
32 | }
33 |
--------------------------------------------------------------------------------
/examples/add_slice.rs:
--------------------------------------------------------------------------------
1 | #[macro_use] extern crate gpuarray as ga;
2 |
3 | use ga::{Array, Context, Tensor, TensorMode, add_slice};
4 |
5 | fn main() {
6 |
7 |     let ref ctx = Context::new();
8 |
9 |     let a = Array::from_vec(vec![4, 3], vec![2, 3, 4,
10 |                                              6, 7, 8,
11 |                                              10, 11, 12,
12 |                                              14, 15, 16]);
13 |     let a_gpu = Tensor::from_array(ctx, &a, TensorMode::Mut);
14 |
15 |     let b = Array::from_vec(vec![4, 4], vec![1, 2, 3, 4,
16 |                                              5, 6, 7, 8,
17 |                                              9, 10, 11, 12,
18 |                                              13, 14, 15, 16]);
19 |     let b_gpu = Tensor::from_array(ctx, &b, TensorMode::Mut);
20 |
21 |     let c = Array::from_vec(vec![4, 4], vec![0; 16]);
22 |     let c_gpu = Tensor::from_array(ctx, &c, TensorMode::Mut);
23 |
24 |     add_slice(ctx, &a_gpu.slice(s![1..3, 1]), &b_gpu.slice(s![1..3, 3]), &c_gpu.slice(s![2..4, 0]));
25 |     //println!("{:?}", ct.get(ctx));
26 |     assert!(c_gpu.get(ctx).buffer() == &[0, 0, 0, 0,
27 |                                          0, 0, 0, 0,
28 |                                          15, 0, 0, 0,
29 |                                          23, 0, 0, 0]);
30 | }
31 |
--------------------------------------------------------------------------------
/src/context.rs:
--------------------------------------------------------------------------------
1 | use opencl;
2 |
3 | use kernels::Kernels;
4 |
5 | pub struct Context {
6 |     pub device: opencl::hl::Device,
7 |     pub ctx: opencl::hl::Context,
8 |     pub queue: opencl::hl::CommandQueue,
9 |     pub program: opencl::hl::Program,
10 |     kernels: Kernels,
11 | }
12 |
13 | impl Context {
14 |     pub fn new() -> Context {
15 |         let program_src = format!("{}\n{}\n{}",
16 |                                   include_str!("cl/math.cl"),
17 |                                   include_str!("cl/main.cl"),
18 |                                   include_str!("cl/slice_ops.cl"));
19 |
20 |         let (device, ctx, queue) = opencl::util::create_compute_context_prefer(opencl::util::PreferedType::GPUPrefered).unwrap();
21 |
22 |         println!("Using OpenCL Device: {}", device.name());
23 |
24 |         let program = ctx.create_program_from_source(&program_src);
25 |         program.build(&device).ok().expect("Couldn't build program.");
26 |
27 |         // Create and store all of the kernels
28 |         let kernels = Kernels::new(&program);
29 |
30 |         Context {
31 |             device: device,
32 |             ctx: ctx,
33 |             queue: queue,
34 |             program: program,
35 |             kernels: kernels,
36 |         }
37 |     }
38 |
39 |     pub fn kernels(&self) -> &Kernels {
40 |         &self.kernels
41 |     }
42 | }
43 |
--------------------------------------------------------------------------------
/src/range_arg.rs:
--------------------------------------------------------------------------------
1 | use std::ops::{Range, RangeFrom, RangeTo, RangeFull};
2 |
3 | pub struct RangeArg {
4 |     pub start: usize,
5 |     pub end: Option<usize>,
6 | }
7 |
8 | impl RangeArg {
9 |     pub fn len(&self, len: usize) -> usize {
10 |         self.end.unwrap_or(len) - self.start
11 |     }
12 | }
13 |
14 | impl From<Range<usize>> for RangeArg {
15 |     #[inline]
16 |     fn from(r: Range<usize>) -> RangeArg {
17 |         RangeArg {
18 |             start: r.start,
19 |             end: Some(r.end),
20 |         }
21 |     }
22 | }
23 |
24 | impl From<RangeFrom<usize>> for RangeArg {
25 |     #[inline]
26 |     fn from(r: RangeFrom<usize>) -> RangeArg {
27 |         RangeArg {
28 |             start: r.start,
29 |             end: None,
30 |         }
31 |     }
32 | }
33 |
34 | impl From<RangeTo<usize>> for RangeArg {
35 |     #[inline]
36 |     fn from(r: RangeTo<usize>) -> RangeArg {
37 |         RangeArg {
38 |             start: 0,
39 |             end: Some(r.end),
40 |         }
41 |     }
42 | }
43 |
44 | impl From<RangeFull> for RangeArg {
45 |     #[inline]
46 |     fn from(_: RangeFull) -> RangeArg {
47 |         RangeArg {
48 |             start: 0,
49 |             end: None,
50 |         }
51 |     }
52 | }
53 |
54 | impl From<usize> for RangeArg {
55 |     #[inline]
56 |     fn from(i: usize) -> RangeArg {
57 |         RangeArg {
58 |             start: i,
59 |             end: Some(i+1),
60 |         }
61 |     }
62 | }
63 |
64 | ////////////////////////////////////////////////////////////////////////////////////////////////////
65 |
66 | #[macro_export]
67 | macro_rules! s(
68 |     (@as_expr $e:expr) => ($e);
69 |     (@parse [$($stack:tt)*] $r:expr) => {
70 |         s![@as_expr [$($stack)* s!(@step $r)]]
71 |     };
72 |     (@parse [$($stack:tt)*] $r:expr, $($t:tt)*) => {
73 |         s![@parse [$($stack)* s!(@step $r),] $($t)*]
74 |     };
75 |     (@step $r:expr) => {
76 |         <$crate::RangeArg as ::std::convert::From<_>>::from($r)
77 |     };
78 |     ($($t:tt)*) => {
79 |         s![@parse [] $($t)*]
80 |     };
81 | );
82 |
83 | #[test]
84 | fn test_s_macro() {
85 |     let s: [RangeArg; 2] = s![1..3, 1];
86 |
87 |     assert!(s[0].start == 1);
88 |     assert!(s[1].start == 1);
89 |
90 |     assert!(s[0].end == Some(3));
91 |     assert!(s[1].end == Some(2));
92 |
93 |     assert!(s[0].len(5) == 2);
94 |     assert!(s[1].len(5) == 1);
95 | }
96 |
--------------------------------------------------------------------------------
/src/array.rs:
--------------------------------------------------------------------------------
1 | use std::fmt;
2 | use std::ops::{Index, IndexMut};
3 |
4 | use helper;
5 |
6 | // An n-dimensional array
7 | pub struct Array<T> {
8 |     shape: Vec<usize>,
9 |     dim_steps: Vec<usize>, // the 'volume' of 1 unit in each dimension.
10 |     buffer: Vec<T>,
11 | }
12 |
13 | impl<T> Array<T> {
14 |     pub fn new(shape: Vec<usize>, initial: T) -> Array<T> where T: Clone {
15 |         let buf_size = shape.iter().fold(1, |a, b| a*b);
16 |         let dim_steps = helper::compute_dim_steps(&shape);
17 |         Array {
18 |             shape: shape,
19 |             dim_steps: dim_steps,
20 |             buffer: vec![initial; buf_size],
21 |         }
22 |     }
23 |
24 |     pub fn from_vec(shape: Vec<usize>, vec: Vec<T>) -> Array<T> {
25 |         let dim_steps = helper::compute_dim_steps(&shape);
26 |         Array {
27 |             shape: shape,
28 |             dim_steps: dim_steps,
29 |             buffer: vec,
30 |         }
31 |     }
32 |
33 |     pub fn reshape(&mut self, new_shape: Vec<usize>) {
34 |         let buf_size = new_shape.iter().fold(1, |a, b| a*b);
35 |
36 |         if buf_size != self.buffer.len() {
37 |             panic!("Failed to reshape Array of shape {:?} to {:?}", self.shape, new_shape);
38 |         }
39 |         self.dim_steps = helper::compute_dim_steps(&new_shape);
40 |         self.shape = new_shape;
41 |     }
42 |
43 |     pub fn get<'a, 'b, I: IntoIterator<Item=&'b usize>>(&'a self, coords: I) -> &'a T {
44 |         let index: usize = coords.into_iter().zip(self.dim_steps.iter())
45 |                                  .map(|(c, s)| (*c)*(*s))
46 |                                  .sum();
47 |         &self.buffer[index]
48 |     }
49 |
50 |     pub fn get_mut<'a, 'b, I: IntoIterator<Item=&'b usize>>(&'a mut self, coords: I) -> &'a mut T {
51 |         let index: usize = coords.into_iter().zip(self.dim_steps.iter())
52 |                                  .map(|(c, s)| (*c)*(*s))
53 |                                  .sum();
54 |         &mut self.buffer[index]
55 |     }
56 |
57 |     pub fn shape(&self) -> &[usize] {
58 |         &self.shape
59 |     }
60 |
61 |     pub fn dim_steps(&self) -> &[usize] {
62 |         &self.dim_steps
63 |     }
64 |
65 |     pub fn buffer(&self) -> &[T] {
66 |         &self.buffer
67 |     }
68 |
69 |     pub fn buffer_mut(&mut self) -> &mut [T] {
70 |         &mut self.buffer
71 |     }
72 | }
73 |
74 | impl<'a, 'b, T, I: IntoIterator<Item=&'b usize>> Index<I> for Array<T> {
75 |     type Output = T;
76 |
77 |     fn index<'r>(&'r self, index: I) -> &'r T {
78 |         self.get(index)
79 |     }
80 | }
81 |
82 | impl<'a, 'b, T, I: IntoIterator<Item=&'b usize>> IndexMut<I> for Array<T> {
83 |     fn index_mut<'r>(&'r mut self, index: I) -> &'r mut T {
84 |         self.get_mut(index)
85 |     }
86 | }
87 |
88 | impl<T: fmt::Debug> fmt::Debug for Array<T> {
89 |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
90 |         try!(write!(f, "[\n"));
91 |         for row in 0..self.shape[0] {
92 |             try!(write!(f, "[{:?}", self.get(&[row, 0])));
93 |             for col in 1..self.shape[1] {
94 |                 try!(write!(f, "\t{:?}", self.get(&[row, col])));
95 |             }
96 |             try!(write!(f, "]\n"));
97 |         }
98 |         try!(write!(f, "]\n"));
99 |         Ok(())
100 |     }
101 | }
102 |
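// Worked example of the stride arithmetic used by `get`/`get_mut`: for shape
// [2, 3, 4] the dim_steps are [12, 4, 1], so coords [1, 2, 3] map to the flat
// index 1*12 + 2*4 + 3*1 = 23. The small illustrative test below checks that
// mapping through the public accessors.
#[test]
fn test_dim_steps_example() {
    // The buffer holds 0..24 in row-major order, so the element at [1, 2, 3]
    // is its own flat index.
    let a = Array::from_vec(vec![2, 3, 4], (0i32..24).collect());
    assert!(a.dim_steps() == &[12, 4, 1]);
    assert!(*a.get(&[1, 2, 3]) == 23);
}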
103 | ////////////////////////////////////////////////////////////////////////////////////////////////////
104 |
105 | #[test]
106 | fn test_reshape() {
107 |     let mut a = Array::from_vec(vec![2, 3], vec![1, 2, 3,
108 |                                                  4, 5, 6]);
109 |     a.reshape(vec![6]);
110 |
111 |     assert!(a[&[0]] == 1);
112 |     assert!(a[&[1]] == 2);
113 |     assert!(a[&[2]] == 3);
114 |     assert!(a[&[3]] == 4);
115 |     assert!(a[&[4]] == 5);
116 |     assert!(a[&[5]] == 6);
117 | }
118 |
119 | #[test]
120 | fn test_array_indexing() {
121 |     let a = Array::from_vec(vec![2, 3, 4], vec![1, 2, 3, 4,
122 |                                                 5, 6, 7, 8,
123 |                                                 9, 10, 11, 12,
124 |
125 |                                                 13, 14, 15, 16,
126 |                                                 17, 18, 19, 20,
127 |                                                 21, 22, 23, 24]);
128 |     assert!(a[&[0, 0, 0]] == 1);
129 |     assert!(a[&[0, 0, 1]] == 2);
130 |     assert!(a[&[0, 0, 2]] == 3);
131 |     assert!(a[&[0, 0, 3]] == 4);
132 |     assert!(a[&[0, 1, 0]] == 5);
133 |     assert!(a[&[0, 1, 1]] == 6);
134 |     assert!(a[&[0, 1, 2]] == 7);
135 |     assert!(a[&[0, 1, 3]] == 8);
136 |     assert!(a[&[0, 2, 0]] == 9);
137 |     assert!(a[&[0, 2, 1]] == 10);
138 |     assert!(a[&[0, 2, 2]] == 11);
139 |     assert!(a[&[0, 2, 3]] == 12);
140 |     assert!(a[&[1, 0, 0]] == 13);
141 |     assert!(a[&[1, 0, 1]] == 14);
142 |     assert!(a[&[1, 0, 2]] == 15);
143 |     assert!(a[&[1, 0, 3]] == 16);
144 |     assert!(a[&[1, 1, 0]] == 17);
145 |     assert!(a[&[1, 1, 1]] == 18);
146 |     assert!(a[&[1, 1, 2]] == 19);
147 |     assert!(a[&[1, 1, 3]] == 20);
148 |     assert!(a[&[1, 2, 0]] == 21);
149 |     assert!(a[&[1, 2, 1]] == 22);
150 |     assert!(a[&[1, 2, 2]] == 23);
151 |     assert!(a[&[1, 2, 3]] == 24);
152 | }
153 |
154 | #[test]
155 | fn test_array_indexing_mut() {
156 |     let mut a = Array::from_vec(vec![3], vec![1, 2, 3]);
157 |     a[&[1]] = 42;
158 |     assert!(a[&[1]] == 42);
159 | }
160 |
--------------------------------------------------------------------------------
/src/tensor.rs:
--------------------------------------------------------------------------------
1 | use std::rc::Rc;
2 | use std::cell::{RefCell, Ref};
3 |
4 | use opencl;
5 | use opencl::hl::KernelArg;
6 | use opencl::mem::{Buffer, CLBuffer};
7 | use libc;
8 |
9 | use array::Array;
10 | use context::Context;
11 | use helper;
12 | use num::Num;
13 | use range_arg::RangeArg;
14 |
15 | pub enum TensorMode {
16 |     In,
17 |     Out,
18 |     Mut,
19 | }
20 |
21 | pub struct Tensor<T: Num> {
22 |     shape: Vec<usize>,
23 |     dim_steps: Vec<usize>,
24 |     buffer: CLBuffer<T>,
25 |     event: RefCell<Rc<Event>>,
26 | }
27 |
28 | impl<T: Num> Tensor<T> {
29 |     pub fn new(ctx: &Context, shape: Vec<usize>, mode: TensorMode) -> Tensor<T> {
30 |         let mem_mode =
31 |             match mode {
32 |                 TensorMode::In => { opencl::cl::CL_MEM_READ_ONLY },
33 |                 TensorMode::Out => { opencl::cl::CL_MEM_WRITE_ONLY },
34 |                 TensorMode::Mut => { opencl::cl::CL_MEM_READ_WRITE },
35 |             };
36 |         let buf_size = shape.iter().fold(1, |a, b| a*b);
37 |         let dim_steps = helper::compute_dim_steps(&shape);
38 |         Tensor {
39 |             shape: shape,
40 |             dim_steps: dim_steps,
41 |             buffer: ctx.ctx.create_buffer(buf_size, mem_mode),
42 |             event: RefCell::new(Rc::new(Event::new_complete(&ctx.ctx))),
43 |         }
44 |     }
45 |
46 |     pub fn from_array(ctx: &Context,
47 |                       array: &Array<T>,
48 |                       mode: TensorMode) -> Tensor<T> {
49 |         let mem_mode =
50 |             match mode {
51 |                 TensorMode::In => { opencl::cl::CL_MEM_READ_ONLY },
52 |                 TensorMode::Out => { opencl::cl::CL_MEM_WRITE_ONLY },
53 |                 TensorMode::Mut => { opencl::cl::CL_MEM_READ_WRITE },
54 |             };
55 |         Tensor {
56 |             shape: array.shape().to_vec(),
57 |             dim_steps: array.dim_steps().to_owned(),
58 |             buffer: ctx.ctx.create_buffer_from(array.buffer(), mem_mode),
59 |             event: RefCell::new(Rc::new(Event::new_complete(&ctx.ctx))),
60 |         }
61 |     }
62 |
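    // Note on synchronization: every op in src/ops.rs stores the OpenCL event of the
    // kernel it enqueues via `set_event`, and the accessors below pass that event as
    // the wait target, so a read is ordered after the last kernel that wrote the buffer.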
63 |     pub fn get(&self, ctx: &Context) -> Array<T> {
64 |         let vec = ctx.queue.get(&self.buffer, &**self.get_event());
65 |         Array::from_vec(self.shape.clone(), vec)
66 |     }
67 |
68 |     pub fn read(&self, ctx: &Context, array: &mut Array<T>) {
69 |         ctx.queue.read(&self.buffer, &mut array.buffer_mut(), &**self.get_event());
70 |     }
71 |
72 |     pub fn set(&self, ctx: &Context, array: &Array<T>) {
73 |         ctx.queue.write(&self.buffer, &array.buffer(), ());
74 |     }
75 |
76 |     pub fn shape(&self) -> &[usize] {
77 |         &self.shape
78 |     }
79 |
80 |     pub fn dim_steps(&self) -> &[usize] {
81 |         &self.dim_steps
82 |     }
83 |
84 |     pub fn len(&self) -> usize {
85 |         self.buffer.len()
86 |     }
87 |
88 |     pub fn set_event(&self, e: Rc<Event>) {
89 |         *self.event.borrow_mut() = e;
90 |     }
91 |
92 |     pub fn get_event(&self) -> Ref<Rc<Event>> {
93 |         self.event.borrow()
94 |     }
95 |
96 |     pub fn slice<'t, R: AsRef<[RangeArg]>>(&'t self, r: R) -> TensorView<'t, T, R> {
97 |         TensorView {
98 |             shape: self.shape.as_ref(),
99 |             dim_steps: self.dim_steps.as_ref(),
100 |             ranges: r,
101 |             buffer: &self.buffer,
102 |             event: &self.event,
103 |         }
104 |     }
105 | }
106 |
107 | impl<T: Num> KernelArg for Tensor<T> {
108 |     fn get_value(&self) -> (libc::size_t, *const libc::c_void) {
109 |         self.buffer.get_value()
110 |     }
111 | }
112 |
113 | impl<'t, T: Num, R: AsRef<[RangeArg]>> KernelArg for TensorView<'t, T, R> {
114 |     fn get_value(&self) -> (libc::size_t, *const libc::c_void) {
115 |         self.buffer.get_value()
116 |     }
117 | }
118 |
119 | pub struct TensorView<'t, T: Num+'t, R: AsRef<[RangeArg]>> {
120 |     pub shape: &'t [usize],
121 |     pub dim_steps: &'t [usize],
122 |     ranges: R,
123 |     buffer: &'t CLBuffer<T>,
124 |     event: &'t RefCell<Rc<Event>>,
125 | }
126 |
127 | impl<'t, T: Num, R: AsRef<[RangeArg]>> TensorView<'t, T, R> {
128 |     pub fn set_event(&self, e: Rc<Event>) {
129 |         *self.event.borrow_mut() = e;
130 |     }
131 |
132 |     pub fn get_event(&self) -> Ref<Rc<Event>> {
133 |         self.event.borrow()
134 |     }
135 |
136 |     pub fn view_offset(&self, dim: usize) -> usize {
137 |         self.ranges.as_ref().get(dim).map(|r| r.start).unwrap_or(0)
138 |     }
139 |
140 |     pub fn view_shape(&self, dim: usize) -> usize {
141 |         self.ranges.as_ref().get(dim).map(|r| r.len(self.shape[dim])).unwrap_or(self.shape[dim])
142 |     }
143 |
144 |     pub fn len(&self) -> usize {
145 |         self.buffer.len()
146 |     }
147 | }
148 |
149 | pub type Event = opencl::hl::Event;
150 |
151 | ////////////////////////////////////////////////////////////////////////////////////////////////////
152 |
153 | #[test]
154 | fn test_tensor_read() {
155 |     let ref ctx = Context::new();
156 |
157 |     let a = Array::from_vec(vec![2, 2], vec![1i32, 2,
158 |                                              3, 4]);
159 |     let at = Tensor::from_array(ctx, &a, TensorMode::Out);
160 |     let mut b = Array::new(vec![2, 2], 0);
161 |
162 |     at.read(ctx, &mut b);
163 |
164 |     assert!(b.buffer() == &[1, 2,
165 |                             3, 4]);
166 | }
167 |
168 | #[test]
169 | fn test_tensor_incomplete_slice() {
170 |     let ref ctx = Context::new();
171 |     let t = Tensor::<f32>::new(ctx, vec![4, 5, 6], TensorMode::Out);
172 |     let t_slice = t.slice(s![1..3]);
173 |     assert!(t_slice.view_offset(0) == 1);
174 |     assert!(t_slice.view_offset(1) == 0);
175 |     assert!(t_slice.view_offset(2) == 0);
176 |     assert!(t_slice.view_shape(0) == 2);
177 |     assert!(t_slice.view_shape(1) == 5);
178 |     assert!(t_slice.view_shape(2) == 6);
179 | }
180 |
--------------------------------------------------------------------------------
/src/kernels.rs:
--------------------------------------------------------------------------------
1 | use std::any::TypeId;
2 | use std::collections::HashMap;
3 |
4 | use opencl::hl::{Kernel, Program};
5 |
6 | use num::Num;
7 |
8 | macro_rules! kernels_hashmap {
9 |     ( $program:ident, $kernel_name:expr, $( $t:ty ),* ) => {
10 |         {
11 |             let mut kernels = HashMap::new();
12 |             $(
13 |                 let type_name = stringify!($t);
14 |                 kernels.insert(TypeId::of::<$t>(),
15 |                                $program.create_kernel(format!("array_{}_{}",
16 |                                                               $kernel_name, type_name).as_ref()));
17 |             )*
18 |             kernels
19 |         }
20 |     };
21 | }
22 |
23 | pub struct Kernels {
24 |     copy_to: HashMap<TypeId, Kernel>,
25 |     fill: HashMap<TypeId, Kernel>,
26 |     sum: HashMap<TypeId, Kernel>,
27 |     add: HashMap<TypeId, Kernel>,
28 |     sub: HashMap<TypeId, Kernel>,
29 |     multiply: HashMap<TypeId, Kernel>,
30 |     divide: HashMap<TypeId, Kernel>,
31 |     transpose: HashMap<TypeId, Kernel>,
32 |     matmul: HashMap<TypeId, Kernel>,
33 |     max: HashMap<TypeId, Kernel>,
34 |     dmax: HashMap<TypeId, Kernel>,
35 |     min: HashMap<TypeId, Kernel>,
36 |     dmin: HashMap<TypeId, Kernel>,
37 |     mse: HashMap<TypeId, Kernel>,
38 |     dmse: HashMap<TypeId, Kernel>,
39 |     tanh: HashMap<TypeId, Kernel>,
40 |     dtanh: HashMap<TypeId, Kernel>,
41 |     sigmoid: HashMap<TypeId, Kernel>,
42 |     dsigmoid: HashMap<TypeId, Kernel>,
43 |     log: HashMap<TypeId, Kernel>,
44 |     exp: HashMap<TypeId, Kernel>,
45 |     negate: HashMap<TypeId, Kernel>,
46 |     sgd: HashMap<TypeId, Kernel>,
47 |     rmsprop: HashMap<TypeId, Kernel>,
48 |
49 |     add_slice: HashMap<TypeId, Kernel>,
50 |     copy_to_slice: HashMap<TypeId, Kernel>,
51 |     fill_slice: HashMap<TypeId, Kernel>,
52 |     multiply_slice: HashMap<TypeId, Kernel>,
53 |     sigmoid_slice: HashMap<TypeId, Kernel>,
54 |     dsigmoid_slice: HashMap<TypeId, Kernel>,
55 |     tanh_slice: HashMap<TypeId, Kernel>,
56 |     dtanh_slice: HashMap<TypeId, Kernel>,
57 | }
58 |
59 | impl Kernels {
60 |     pub fn new(program: &Program) -> Kernels {
61 |         Kernels {
62 |             copy_to: kernels_hashmap!(program, "copy_to", f32, i32),
63 |             fill: kernels_hashmap!(program, "fill", f32, i32),
64 |             sum: kernels_hashmap!(program, "sum", f32, i32),
65 |             add: kernels_hashmap!(program, "add", f32, i32),
66 |             sub: kernels_hashmap!(program, "sub", f32),
67 |             multiply: kernels_hashmap!(program, "multiply", f32, i32),
68 |             divide: kernels_hashmap!(program, "divide", f32),
69 |             transpose: kernels_hashmap!(program, "transpose", f32, i32),
70 |             matmul: kernels_hashmap!(program, "matmul", f32, i32),
71 |             max: kernels_hashmap!(program, "max", f32),
72 |             dmax: kernels_hashmap!(program, "dmax", f32),
73 |             min: kernels_hashmap!(program, "min", f32),
74 |             dmin: kernels_hashmap!(program, "dmin", f32),
75 |             mse: kernels_hashmap!(program, "mse", f32),
76 |             dmse: kernels_hashmap!(program, "dmse", f32),
77 |             tanh: kernels_hashmap!(program, "tanh", f32),
78 |             dtanh: kernels_hashmap!(program, "dtanh", f32),
79 |             sigmoid: kernels_hashmap!(program, "sigmoid", f32),
80 |             dsigmoid: kernels_hashmap!(program, "dsigmoid", f32),
81 |             log: kernels_hashmap!(program, "log", f32),
82 |             exp: kernels_hashmap!(program, "exp", f32),
83 |             negate: kernels_hashmap!(program, "negate", f32),
84 |             sgd: kernels_hashmap!(program, "sgd", f32),
85 |             rmsprop: kernels_hashmap!(program, "rmsprop", f32),
86 |
87 |             add_slice: kernels_hashmap!(program, "add_slice", f32, i32),
88 |             copy_to_slice: kernels_hashmap!(program, "copy_to_slice", f32, i32),
89 |             fill_slice: kernels_hashmap!(program, "fill_slice", f32, i32),
90 |             multiply_slice: kernels_hashmap!(program, "multiply_slice", f32, i32),
91 |             sigmoid_slice: kernels_hashmap!(program, "sigmoid_slice", f32),
92 |             dsigmoid_slice: kernels_hashmap!(program, "dsigmoid_slice", f32),
93 |             tanh_slice: kernels_hashmap!(program, "tanh_slice", f32),
94 |             dtanh_slice: kernels_hashmap!(program, "dtanh_slice", f32),
95 |         }
96 |     }
97 |
98 |     pub fn copy_to<T: Num>(&self) -> &Kernel {
99 |         &self.copy_to[&TypeId::of::<T>()]
100 |     }
101 |
102 |     pub fn fill<T: Num>(&self) -> &Kernel {
103 |         &self.fill[&TypeId::of::<T>()]
104 |     }
105 |
106 |     pub fn sum<T: Num>(&self) -> &Kernel {
107 |         &self.sum[&TypeId::of::<T>()]
108 |     }
109 |
110 |     pub fn add<T: Num>(&self) -> &Kernel {
111 |         &self.add[&TypeId::of::<T>()]
112 |     }
113 |
114 |     pub fn sub<T: Num>(&self) -> &Kernel {
115 |         &self.sub[&TypeId::of::<T>()]
116 |     }
117 |
118 |     pub fn multiply<T: Num>(&self) -> &Kernel {
119 |         &self.multiply[&TypeId::of::<T>()]
120 |     }
121 |
122 |     pub fn divide<T: Num>(&self) -> &Kernel {
123 |         &self.divide[&TypeId::of::<T>()]
124 |     }
125 |
126 |     pub fn transpose<T: Num>(&self) -> &Kernel {
127 |         &self.transpose[&TypeId::of::<T>()]
128 |     }
129 |
130 |     pub fn matmul<T: Num>(&self) -> &Kernel {
131 |         &self.matmul[&TypeId::of::<T>()]
132 |     }
133 |
134 |     pub fn max<T: Num>(&self) -> &Kernel {
135 |         &self.max[&TypeId::of::<T>()]
136 |     }
137 |
138 |     pub fn dmax<T: Num>(&self) -> &Kernel {
139 |         &self.dmax[&TypeId::of::<T>()]
140 |     }
141 |
142 |     pub fn min<T: Num>(&self) -> &Kernel {
143 |         &self.min[&TypeId::of::<T>()]
144 |     }
145 |
146 |     pub fn dmin<T: Num>(&self) -> &Kernel {
147 |         &self.dmin[&TypeId::of::<T>()]
148 |     }
149 |
150 |     pub fn mse<T: Num>(&self) -> &Kernel {
151 |         &self.mse[&TypeId::of::<T>()]
152 |     }
153 |
154 |     pub fn dmse<T: Num>(&self) -> &Kernel {
155 |         &self.dmse[&TypeId::of::<T>()]
156 |     }
157 |
158 |     pub fn tanh<T: Num>(&self) -> &Kernel {
159 |         &self.tanh[&TypeId::of::<T>()]
160 |     }
161 |
162 |     pub fn dtanh<T: Num>(&self) -> &Kernel {
163 |         &self.dtanh[&TypeId::of::<T>()]
164 |     }
165 |
166 |     pub fn sigmoid<T: Num>(&self) -> &Kernel {
167 |         &self.sigmoid[&TypeId::of::<T>()]
168 |     }
169 |
170 |     pub fn dsigmoid<T: Num>(&self) -> &Kernel {
171 |         &self.dsigmoid[&TypeId::of::<T>()]
172 |     }
173 |
174 |     pub fn log<T: Num>(&self) -> &Kernel {
175 |         &self.log[&TypeId::of::<T>()]
176 |     }
177 |
178 |     pub fn exp<T: Num>(&self) -> &Kernel {
179 |         &self.exp[&TypeId::of::<T>()]
180 |     }
181 |
182 |     pub fn negate<T: Num>(&self) -> &Kernel {
183 |         &self.negate[&TypeId::of::<T>()]
184 |     }
185 |
186 |     pub fn sgd<T: Num>(&self) -> &Kernel {
187 |         &self.sgd[&TypeId::of::<T>()]
188 |     }
189 |
190 |     pub fn rmsprop<T: Num>(&self) -> &Kernel {
191 |         &self.rmsprop[&TypeId::of::<T>()]
192 |     }
193 |
194 |     pub fn copy_to_slice<T: Num>(&self) -> &Kernel {
195 |         &self.copy_to_slice[&TypeId::of::<T>()]
196 |     }
197 |
198 |     pub fn fill_slice<T: Num>(&self) -> &Kernel {
199 |         &self.fill_slice[&TypeId::of::<T>()]
200 |     }
201 |
202 |     pub fn add_slice<T: Num>(&self) -> &Kernel {
203 |         &self.add_slice[&TypeId::of::<T>()]
204 |     }
205 |
206 |     pub fn multiply_slice<T: Num>(&self) -> &Kernel {
207 |         &self.multiply_slice[&TypeId::of::<T>()]
208 |     }
209 |
210 |     pub fn sigmoid_slice<T: Num>(&self) -> &Kernel {
211 |         &self.sigmoid_slice[&TypeId::of::<T>()]
212 |     }
213 |
214 |     pub fn dsigmoid_slice<T: Num>(&self) -> &Kernel {
215 |         &self.dsigmoid_slice[&TypeId::of::<T>()]
216 |     }
217 |
218 |     pub fn tanh_slice<T: Num>(&self) -> &Kernel {
219 |         &self.tanh_slice[&TypeId::of::<T>()]
220 |     }
221 |
222 |     pub fn dtanh_slice<T: Num>(&self) -> &Kernel {
223 |         &self.dtanh_slice[&TypeId::of::<T>()]
224 |     }
225 | }
226 |
--------------------------------------------------------------------------------
/src/cl/slice_ops.cl:
--------------------------------------------------------------------------------
1 | ulong index2(ulong cols, ulong row, ulong col) {
2 |     return row*cols + col;
3 | }
4 |
5 | ulong dot_ulong4(ulong4 a, ulong4 b) {
6 |     ulong4 prod = a*b;
7 |     ulong2 half_sum = prod.xy + prod.zw;
8 |     return half_sum[0] + half_sum[1];
9 | }
10 |
11 | ulong index4(ulong4 dim_steps, ulong4 coords) {
12 |     return dot_ulong4(dim_steps, coords);
13 | }
14 |
15 | ////////////////////////////////////////////////////////////////////////////////////////////////////
16 |
17 | __kernel void array_add_slice_i32(__global int* a, __global int* b, __global int* c,
18 |                                   ulong4 a_dim_steps, ulong4 a_off,
19 |                                   ulong4 b_dim_steps, ulong4 b_off,
20 |                                   ulong4 c_dim_steps, ulong4 c_off) {
21 |     ulong i = get_global_id(0);
22 |     ulong j = get_global_id(1);
23 |     ulong k = get_global_id(2);
24 |
25 |     a_off[1] += i;
26 |     a_off[2] += j;
27 |     a_off[3] +=
k; 28 | 29 | b_off[1] += i; 30 | b_off[2] += j; 31 | b_off[3] += k; 32 | 33 | c_off[1] += i; 34 | c_off[2] += j; 35 | c_off[3] += k; 36 | 37 | ulong ai = index4(a_dim_steps, a_off); 38 | ulong bi = index4(b_dim_steps, b_off); 39 | ulong ci = index4(c_dim_steps, c_off); 40 | 41 | c[ci] = a[ai] + b[bi]; 42 | } 43 | 44 | __kernel void array_add_slice_f32(__global float* a, __global float* b, __global float* c, 45 | ulong4 a_dim_steps, ulong4 a_off, 46 | ulong4 b_dim_steps, ulong4 b_off, 47 | ulong4 c_dim_steps, ulong4 c_off) { 48 | ulong i = get_global_id(0); 49 | ulong j = get_global_id(1); 50 | ulong k = get_global_id(2); 51 | 52 | a_off[1] += i; 53 | a_off[2] += j; 54 | a_off[3] += k; 55 | 56 | b_off[1] += i; 57 | b_off[2] += j; 58 | b_off[3] += k; 59 | 60 | c_off[1] += i; 61 | c_off[2] += j; 62 | c_off[3] += k; 63 | 64 | ulong ai = index4(a_dim_steps, a_off); 65 | ulong bi = index4(b_dim_steps, b_off); 66 | ulong ci = index4(c_dim_steps, c_off); 67 | 68 | c[ci] = a[ai] + b[bi]; 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////////////////////////// 72 | 73 | __kernel void array_multiply_slice_i32(__global int* a, __global int* b, __global int* c, 74 | ulong4 a_dim_steps, ulong4 a_off, 75 | ulong4 b_dim_steps, ulong4 b_off, 76 | ulong4 c_dim_steps, ulong4 c_off) { 77 | ulong i = get_global_id(0); 78 | ulong j = get_global_id(1); 79 | ulong k = get_global_id(2); 80 | 81 | a_off[1] += i; 82 | a_off[2] += j; 83 | a_off[3] += k; 84 | 85 | b_off[1] += i; 86 | b_off[2] += j; 87 | b_off[3] += k; 88 | 89 | c_off[1] += i; 90 | c_off[2] += j; 91 | c_off[3] += k; 92 | 93 | ulong ai = index4(a_dim_steps, a_off); 94 | ulong bi = index4(b_dim_steps, b_off); 95 | ulong ci = index4(c_dim_steps, c_off); 96 | 97 | c[ci] = a[ai] * b[bi]; 98 | } 99 | 100 | __kernel void array_multiply_slice_f32(__global float* a, __global float* b, __global float* c, 101 | ulong4 a_dim_steps, ulong4 a_off, 102 | ulong4 b_dim_steps, ulong4 b_off, 103 | ulong4 c_dim_steps, ulong4 c_off) { 104 | ulong i = get_global_id(0); 105 | ulong j = get_global_id(1); 106 | ulong k = get_global_id(2); 107 | 108 | a_off[1] += i; 109 | a_off[2] += j; 110 | a_off[3] += k; 111 | 112 | b_off[1] += i; 113 | b_off[2] += j; 114 | b_off[3] += k; 115 | 116 | c_off[1] += i; 117 | c_off[2] += j; 118 | c_off[3] += k; 119 | 120 | ulong ai = index4(a_dim_steps, a_off); 121 | ulong bi = index4(b_dim_steps, b_off); 122 | ulong ci = index4(c_dim_steps, c_off); 123 | 124 | c[ci] = a[ai] * b[bi]; 125 | } 126 | 127 | //////////////////////////////////////////////////////////////////////////////////////////////////// 128 | 129 | __kernel void array_copy_to_slice_i32(__global int* a, __global int* b, 130 | ulong4 a_dim_steps, ulong4 a_off, 131 | ulong4 b_dim_steps, ulong4 b_off) { 132 | ulong i = get_global_id(0); 133 | ulong j = get_global_id(1); 134 | ulong k = get_global_id(2); 135 | 136 | a_off[1] += i; 137 | a_off[2] += j; 138 | a_off[3] += k; 139 | 140 | b_off[1] += i; 141 | b_off[2] += j; 142 | b_off[3] += k; 143 | 144 | ulong ai = index4(a_dim_steps, a_off); 145 | ulong bi = index4(b_dim_steps, b_off); 146 | 147 | b[bi] = a[ai]; 148 | } 149 | 150 | __kernel void array_copy_to_slice_f32(__global float* a, __global float* b, 151 | ulong4 a_dim_steps, ulong4 a_off, 152 | ulong4 b_dim_steps, ulong4 b_off) { 153 | ulong i = get_global_id(0); 154 | ulong j = get_global_id(1); 155 | ulong k = get_global_id(2); 156 | 157 | a_off[1] += i; 158 | a_off[2] += j; 159 | a_off[3] += k; 160 | 161 | b_off[1] += i; 162 | 
b_off[2] += j; 163 | b_off[3] += k; 164 | 165 | ulong ai = index4(a_dim_steps, a_off); 166 | ulong bi = index4(b_dim_steps, b_off); 167 | 168 | b[bi] = a[ai]; 169 | } 170 | 171 | //////////////////////////////////////////////////////////////////////////////////////////////////// 172 | 173 | __kernel void array_fill_slice_i32(__global int* a, int val, 174 | ulong4 a_dim_steps, ulong4 a_off) { 175 | ulong i = get_global_id(0); 176 | ulong j = get_global_id(1); 177 | ulong k = get_global_id(2); 178 | 179 | a_off[1] += i; 180 | a_off[2] += j; 181 | a_off[3] += k; 182 | 183 | ulong ai = index4(a_dim_steps, a_off); 184 | 185 | a[ai] = val; 186 | } 187 | 188 | __kernel void array_fill_slice_f32(__global float* a, float val, 189 | ulong4 a_dim_steps, ulong4 a_off) { 190 | ulong i = get_global_id(0); 191 | ulong j = get_global_id(1); 192 | ulong k = get_global_id(2); 193 | 194 | a_off[1] += i; 195 | a_off[2] += j; 196 | a_off[3] += k; 197 | 198 | ulong ai = index4(a_dim_steps, a_off); 199 | 200 | a[ai] = val; 201 | } 202 | 203 | //////////////////////////////////////////////////////////////////////////////////////////////////// 204 | 205 | __kernel void array_sigmoid_slice_f32(__global float* a, __global float* b, 206 | ulong a_off0, ulong a_off1, 207 | ulong b_off0, ulong b_off1, 208 | ulong a_cols, ulong b_cols) { 209 | ulong i = get_global_id(0); 210 | ulong j = get_global_id(1); 211 | ulong ai = index2(a_cols, i+a_off0, j+a_off1); 212 | ulong bi = index2(b_cols, i+b_off0, j+b_off1); 213 | b[bi] = sigmoid(a[ai]); 214 | } 215 | 216 | __kernel void array_dsigmoid_slice_f32(__global float* a, __global float* b, 217 | ulong a_off0, ulong a_off1, 218 | ulong b_off0, ulong b_off1, 219 | ulong a_cols, ulong b_cols) { 220 | ulong i = get_global_id(0); 221 | ulong j = get_global_id(1); 222 | ulong ai = index2(a_cols, i+a_off0, j+a_off1); 223 | ulong bi = index2(b_cols, i+b_off0, j+b_off1); 224 | // dsigmoid(x) = sigmoid(x)*(1 - sigmoid(x)) 225 | b[bi] = sigmoid(a[ai]); 226 | b[bi] = b[bi]*(1.0 - b[bi]); 227 | } 228 | 229 | //////////////////////////////////////////////////////////////////////////////////////////////////// 230 | 231 | __kernel void array_tanh_slice_f32(__global float* a, __global float* b, 232 | ulong a_off0, ulong a_off1, 233 | ulong b_off0, ulong b_off1, 234 | ulong a_cols, ulong b_cols) { 235 | ulong i = get_global_id(0); 236 | ulong j = get_global_id(1); 237 | ulong ai = index2(a_cols, i+a_off0, j+a_off1); 238 | ulong bi = index2(b_cols, i+b_off0, j+b_off1); 239 | b[bi] = tanh(a[ai]); 240 | } 241 | 242 | __kernel void array_dtanh_slice_f32(__global float* a, __global float* b, 243 | ulong a_off0, ulong a_off1, 244 | ulong b_off0, ulong b_off1, 245 | ulong a_cols, ulong b_cols) { 246 | ulong i = get_global_id(0); 247 | ulong j = get_global_id(1); 248 | ulong ai = index2(a_cols, i+a_off0, j+a_off1); 249 | ulong bi = index2(b_cols, i+b_off0, j+b_off1); 250 | // dtanh(x) = 1 - tanh(x)^2 251 | b[bi] = tanh(a[ai]); 252 | b[bi] = 1.0 - b[bi]*b[bi]; 253 | } 254 | -------------------------------------------------------------------------------- /src/cl/main.cl: -------------------------------------------------------------------------------- 1 | __kernel void array_fill_f32(__global float* a, float val) { 2 | uintptr_t i = get_global_id(0); 3 | a[i] = val; 4 | } 5 | 6 | __kernel void array_fill_i32(__global int* a, int val) { 7 | uintptr_t i = get_global_id(0); 8 | a[i] = val; 9 | } 10 | 11 | //////////////////////////////////////////////////////////////////////////////////////////////////// 12 
| 13 | __kernel void array_copy_to_f32(__global const float *f, 14 | __global float *t) { 15 | uintptr_t i = get_global_id(0); 16 | t[i] = f[i]; 17 | } 18 | 19 | __kernel void array_copy_to_i32(__global const int *f, 20 | __global int *t) { 21 | uintptr_t i = get_global_id(0); 22 | t[i] = f[i]; 23 | } 24 | 25 | //////////////////////////////////////////////////////////////////////////////////////////////////// 26 | // Sum 27 | 28 | __kernel void array_sum_f32(__global float *a, 29 | __global float *b, 30 | ulong rows, 31 | ulong cols, 32 | ulong axis) { 33 | ulong i = get_global_id(0); 34 | 35 | b[i] = 0.0; // Initialize to zero before we start summing things up 36 | 37 | if (axis == 0) { 38 | for (ulong m = 0; m < rows; m++) { 39 | b[i] += a[m*cols + i]; 40 | } 41 | } else if (axis == 1) { 42 | for (ulong m = 0; m < cols; m++) { 43 | b[i] += a[i*cols + m]; 44 | } 45 | } 46 | } 47 | 48 | __kernel void array_sum_i32(__global int *a, 49 | __global int *b, 50 | ulong rows, 51 | ulong cols, 52 | ulong axis) { 53 | ulong i = get_global_id(0); 54 | 55 | b[i] = 0.0; // Initialize to zero before we start summing things up 56 | 57 | if (axis == 0) { 58 | for (ulong m = 0; m < rows; m++) { 59 | b[i] += a[m*cols + i]; 60 | } 61 | } else if (axis == 1) { 62 | for (ulong m = 0; m < cols; m++) { 63 | b[i] += a[i*cols + m]; 64 | } 65 | } 66 | } 67 | 68 | //////////////////////////////////////////////////////////////////////////////////////////////////// 69 | 70 | __kernel void array_add_f32(__global const float *a, 71 | __global const float *b, 72 | __global float *c, 73 | const ulong cols, 74 | const int axis) { 75 | ulong i = get_global_id(0); 76 | ulong j = get_global_id(1); 77 | if (axis == -1) { 78 | c[i*cols + j] = a[i*cols + j] + b[i*cols + j]; 79 | } else if (axis == 0) { 80 | c[i*cols + j] = a[i*cols + j] + b[j]; 81 | } else if (axis == 1) { 82 | c[i*cols + j] = a[i*cols + j] + b[i]; 83 | } 84 | } 85 | 86 | __kernel void array_add_i32(__global const int *a, 87 | __global const int *b, 88 | __global int *c, 89 | const ulong cols, 90 | const int axis) { 91 | ulong i = get_global_id(0); 92 | ulong j = get_global_id(1); 93 | if (axis == -1) { 94 | c[i*cols + j] = a[i*cols + j] + b[i*cols + j]; 95 | } else if (axis == 0) { 96 | c[i*cols + j] = a[i*cols + j] + b[j]; 97 | } else if (axis == 1) { 98 | c[i*cols + j] = a[i*cols + j] + b[i]; 99 | } 100 | } 101 | 102 | __kernel void array_add_u64(__global const ulong *a, 103 | __global const ulong *b, 104 | __global ulong *c, 105 | const ulong cols, 106 | const int axis) { 107 | ulong i = get_global_id(0); 108 | ulong j = get_global_id(1); 109 | if (axis == -1) { 110 | c[i*cols + j] = a[i*cols + j] + b[i*cols + j]; 111 | } else if (axis == 0) { 112 | c[i*cols + j] = a[i*cols + j] + b[j]; 113 | } else if (axis == 1) { 114 | c[i*cols + j] = a[i*cols + j] + b[i]; 115 | } 116 | } 117 | 118 | //////////////////////////////////////////////////////////////////////////////////////////////////// 119 | 120 | __kernel void array_sub_f32(__global const float *a, 121 | __global const float *b, 122 | __global float *c) { 123 | uintptr_t i = get_global_id(0); 124 | c[i] = a[i] - b[i]; 125 | } 126 | 127 | __kernel void array_sub_i8(__global const char *a, 128 | __global const char *b, 129 | __global char *c) { 130 | uintptr_t i = get_global_id(0); 131 | c[i] = a[i] - b[i]; 132 | } 133 | 134 | __kernel void array_sub_i16(__global const short *a, 135 | __global const short *b, 136 | __global short *c) { 137 | uintptr_t i = get_global_id(0); 138 | c[i] = a[i] - b[i]; 139 | } 
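// Note: element-wise kernels are defined here for many scalar types, but
// src/kernels.rs currently registers only the f32 (and, for some operations,
// i32) variants in its TypeId map, so only those are reachable through the
// Rust API.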
140 | 141 | __kernel void array_sub_i32(__global const int *a, 142 | __global const int *b, 143 | __global int *c) { 144 | uintptr_t i = get_global_id(0); 145 | c[i] = a[i] - b[i]; 146 | } 147 | 148 | __kernel void array_sub_i64(__global const long *a, 149 | __global const long *b, 150 | __global long *c) { 151 | uintptr_t i = get_global_id(0); 152 | c[i] = a[i] - b[i]; 153 | } 154 | 155 | __kernel void array_sub_u8(__global const uchar *a, 156 | __global const uchar *b, 157 | __global uchar *c) { 158 | uintptr_t i = get_global_id(0); 159 | c[i] = a[i] - b[i]; 160 | } 161 | 162 | __kernel void array_sub_u16(__global const ushort *a, 163 | __global const ushort *b, 164 | __global ushort *c) { 165 | uintptr_t i = get_global_id(0); 166 | c[i] = a[i] - b[i]; 167 | } 168 | 169 | __kernel void array_sub_u32(__global const uint *a, 170 | __global const uint *b, 171 | __global uint *c) { 172 | uintptr_t i = get_global_id(0); 173 | c[i] = a[i] - b[i]; 174 | } 175 | 176 | __kernel void array_sub_u64(__global const ulong *a, 177 | __global const ulong *b, 178 | __global ulong *c) { 179 | uintptr_t i = get_global_id(0); 180 | c[i] = a[i] - b[i]; 181 | } 182 | 183 | //////////////////////////////////////////////////////////////////////////////////////////////////// 184 | 185 | /*__kernel void array_multiply_f32(__global const float *a, 186 | __global const float *b, 187 | __global float *c) { 188 | uintptr_t i = get_global_id(0); 189 | c[i] = a[i] * b[i]; 190 | }*/ 191 | 192 | __kernel void array_multiply_f32(__global const float *a, 193 | __global const float *b, 194 | __global float *c, 195 | const ulong cols, 196 | const int axis) { 197 | ulong i = get_global_id(0); 198 | ulong j = get_global_id(1); 199 | if (axis == -1) { 200 | c[i*cols + j] = a[i*cols + j] * b[i*cols + j]; 201 | } else if (axis == 0) { 202 | c[i*cols + j] = a[i*cols + j] * b[j]; 203 | } else if (axis == 1) { 204 | c[i*cols + j] = a[i*cols + j] * b[i]; 205 | } 206 | } 207 | 208 | __kernel void array_multiply_i8(__global const char *a, 209 | __global const char *b, 210 | __global char *c) { 211 | uintptr_t i = get_global_id(0); 212 | c[i] = a[i] * b[i]; 213 | } 214 | 215 | __kernel void array_multiply_i16(__global const short *a, 216 | __global const short *b, 217 | __global short *c) { 218 | uintptr_t i = get_global_id(0); 219 | c[i] = a[i] * b[i]; 220 | } 221 | 222 | __kernel void array_multiply_i32(__global const int *a, 223 | __global const int *b, 224 | __global int *c, 225 | const ulong cols, 226 | const int axis) { 227 | ulong i = get_global_id(0); 228 | ulong j = get_global_id(1); 229 | if (axis == -1) { 230 | c[i*cols + j] = a[i*cols + j] * b[i*cols + j]; 231 | } else if (axis == 0) { 232 | c[i*cols + j] = a[i*cols + j] * b[j]; 233 | } else if (axis == 1) { 234 | c[i*cols + j] = a[i*cols + j] * b[i]; 235 | } 236 | } 237 | 238 | __kernel void array_multiply_i64(__global const long *a, 239 | __global const long *b, 240 | __global long *c) { 241 | uintptr_t i = get_global_id(0); 242 | c[i] = a[i] * b[i]; 243 | } 244 | 245 | __kernel void array_multiply_u8(__global const uchar *a, 246 | __global const uchar *b, 247 | __global uchar *c) { 248 | uintptr_t i = get_global_id(0); 249 | c[i] = a[i] * b[i]; 250 | } 251 | 252 | __kernel void array_multiply_u16(__global const ushort *a, 253 | __global const ushort *b, 254 | __global ushort *c) { 255 | uintptr_t i = get_global_id(0); 256 | c[i] = a[i] * b[i]; 257 | } 258 | 259 | __kernel void array_multiply_u32(__global const uint *a, 260 | __global const uint *b, 261 | __global uint *c) 
{ 262 | uintptr_t i = get_global_id(0); 263 | c[i] = a[i] * b[i]; 264 | } 265 | 266 | __kernel void array_multiply_u64(__global const ulong *a, 267 | __global const ulong *b, 268 | __global ulong *c) { 269 | uintptr_t i = get_global_id(0); 270 | c[i] = a[i] * b[i]; 271 | } 272 | 273 | //////////////////////////////////////////////////////////////////////////////////////////////////// 274 | 275 | __kernel void array_divide_f32(__global const float *a, 276 | __global const float *b, 277 | __global float *c, 278 | const ulong cols, 279 | const int axis) { 280 | ulong i = get_global_id(0); 281 | ulong j = get_global_id(1); 282 | if (axis == -1) { 283 | c[i*cols + j] = a[i*cols + j] / b[i*cols + j]; 284 | } else if (axis == 0) { 285 | c[i*cols + j] = a[i*cols + j] / b[j]; 286 | } else if (axis == 1) { 287 | c[i*cols + j] = a[i*cols + j] / b[i]; 288 | } 289 | } 290 | 291 | //////////////////////////////////////////////////////////////////////////////////////////////////// 292 | 293 | __kernel void array_transpose_f32(__global const float *a, 294 | __global float *b, 295 | const ulong rows, 296 | const ulong cols) { 297 | ulong i = get_global_id(0); 298 | ulong j = get_global_id(1); 299 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 300 | } 301 | 302 | __kernel void array_transpose_i8(__global const char *a, 303 | __global char *b, 304 | const ulong rows, 305 | const ulong cols) { 306 | ulong i = get_global_id(0); 307 | ulong j = get_global_id(1); 308 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 309 | } 310 | 311 | __kernel void array_transpose_i16(__global const short *a, 312 | __global short *b, 313 | const ulong rows, 314 | const ulong cols) { 315 | ulong i = get_global_id(0); 316 | ulong j = get_global_id(1); 317 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 318 | } 319 | 320 | __kernel void array_transpose_i32(__global const int *a, 321 | __global int *b, 322 | const ulong rows, 323 | const ulong cols) { 324 | ulong i = get_global_id(0); 325 | ulong j = get_global_id(1); 326 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 327 | } 328 | 329 | __kernel void array_transpose_i64(__global const long *a, 330 | __global long *b, 331 | const ulong rows, 332 | const ulong cols) { 333 | ulong i = get_global_id(0); 334 | ulong j = get_global_id(1); 335 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 336 | } 337 | 338 | __kernel void array_transpose_u8(__global const uchar *a, 339 | __global uchar *b, 340 | const ulong rows, 341 | const ulong cols) { 342 | ulong i = get_global_id(0); 343 | ulong j = get_global_id(1); 344 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 345 | } 346 | 347 | __kernel void array_transpose_u16(__global const ushort *a, 348 | __global ushort *b, 349 | const ulong rows, 350 | const ulong cols) { 351 | ulong i = get_global_id(0); 352 | ulong j = get_global_id(1); 353 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 354 | } 355 | 356 | __kernel void array_transpose_u32(__global const uint *a, 357 | __global uint *b, 358 | const ulong rows, 359 | const ulong cols) { 360 | ulong i = get_global_id(0); 361 | ulong j = get_global_id(1); 362 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 363 | } 364 | 365 | __kernel void array_transpose_u64(__global const ulong *a, 366 | __global ulong *b, 367 | const ulong rows, 368 | const ulong cols) { 369 | ulong i = get_global_id(0); 370 | ulong j = get_global_id(1); 371 | b[j*rows + i] = a[i*cols + j]; // Flip the dimensions 372 | } 373 | 374 | 
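// Each matmul work-item below computes a single output element: global id (i, j)
// accumulates the dot product of row i of a (width wa) with column j of b (width wb)
// and writes it to c[i*wb + j]. src/ops.rs enqueues these kernels over a
// (rows of a, cols of b) global range.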
//////////////////////////////////////////////////////////////////////////////////////////////////// 375 | 376 | __kernel void array_matmul_f32(__global const float *a, 377 | __global const float *b, 378 | __global float *c, 379 | const ulong wa, 380 | const ulong wb) { 381 | ulong i = get_global_id(0); 382 | ulong j = get_global_id(1); 383 | 384 | float accum = 0.0; 385 | for (ulong k = 0; k < wa; k++) { 386 | accum += a[i*wa + k] * b[k*wb + j]; 387 | } 388 | c[i*wb + j] = accum; 389 | } 390 | 391 | __kernel void array_matmul_i32(__global const int *a, 392 | __global const int *b, 393 | __global int *c, 394 | const ulong wa, 395 | const ulong wb) { 396 | ulong i = get_global_id(0); 397 | ulong j = get_global_id(1); 398 | 399 | int accum = 0; 400 | for (ulong k = 0; k < wa; k++) { 401 | accum += a[i*wa + k] * b[k*wb + j]; 402 | } 403 | c[i*wb + j] = accum; 404 | } 405 | 406 | //////////////////////////////////////////////////////////////////////////////////////////////////// 407 | // Max 408 | 409 | __kernel void array_max_f32(__global float *a, 410 | __global float *b, 411 | float const threshold) { 412 | uintptr_t i = get_global_id(0); 413 | b[i] = max(threshold, a[i]); 414 | } 415 | 416 | //////////////////////////////////////////////////////////////////////////////////////////////////// 417 | // derivative of max with respect to a 418 | 419 | __kernel void array_dmax_f32(__global float *a, 420 | __global float *b, 421 | const float threshold) { 422 | uintptr_t i = get_global_id(0); 423 | if (a[i] > threshold) { 424 | b[i] = 1; 425 | } else { 426 | b[i] = 0; 427 | } 428 | } 429 | 430 | //////////////////////////////////////////////////////////////////////////////////////////////////// 431 | // Min 432 | 433 | __kernel void array_min_f32(__global float *a, 434 | __global float *b, 435 | const float threshold) { 436 | uintptr_t i = get_global_id(0); 437 | b[i] = min(threshold, a[i]); 438 | } 439 | 440 | //////////////////////////////////////////////////////////////////////////////////////////////////// 441 | // derivative of min with respect to a 442 | 443 | __kernel void array_dmin_f32(__global float *a, 444 | __global float *b, 445 | const float threshold) { 446 | uintptr_t i = get_global_id(0); 447 | if (a[i] < threshold) { 448 | b[i] = 1; 449 | } else { 450 | b[i] = 0; 451 | } 452 | } 453 | 454 | //////////////////////////////////////////////////////////////////////////////////////////////////// 455 | // Mean Squared Error (MSE) 456 | 457 | __kernel void array_mse_f32(__global float *h, 458 | __global float *y, 459 | __global float *out, 460 | const ulong rows, 461 | const ulong cols) { 462 | ulong i = get_global_id(0); 463 | 464 | out[i] = 0.0; // Initialize to zero before we start summing things up 465 | 466 | // Sum up squared errors 467 | for (ulong m = 0; m < rows; m++) { 468 | float error = h[m*cols + i] - y[m*cols + i]; 469 | out[i] += error*error; 470 | } 471 | 472 | // Divide by batch size to calculate the mean 473 | out[i] /= (float)rows; 474 | out[i] /= 2.0; 475 | } 476 | 477 | //////////////////////////////////////////////////////////////////////////////////////////////////// 478 | // Mean Squared Error derivative 479 | 480 | __kernel void array_dmse_f32(__global float *h, 481 | __global float *y, 482 | __global float *out, 483 | const ulong rows, 484 | const ulong cols) { 485 | ulong i = get_global_id(0); 486 | ulong j = get_global_id(1); 487 | 488 | uintptr_t index = i*cols + j; 489 | out[index] = h[index] - y[index]; 490 | } 491 | 492 | 
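// Launch-size note: array_mse_f32 above runs one work-item per column and loops over
// the rows itself, while array_dmse_f32 is element-wise over the full (rows, cols)
// range; this matches the global sizes passed in src/ops.rs.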
////////////////////////////////////////////////////////////////////////////////////////////////////
493 | // tanh
494 |
495 | __kernel void array_tanh_f32(__global float *a,
496 |                              __global float *b) {
497 |     uintptr_t i = get_global_id(0);
498 |     b[i] = tanh(a[i]);
499 | }
500 |
501 | ////////////////////////////////////////////////////////////////////////////////////////////////////
502 | // derivative of tanh
503 |
504 | __kernel void array_dtanh_f32(__global float *a,
505 |                               __global float *b) {
506 |     uintptr_t i = get_global_id(0);
507 |     // dtanh = 1 - tanh(x)^2
508 |     b[i] = tanh(a[i]);
509 |     b[i] = 1.0 - b[i]*b[i];
510 | }
511 |
512 | ////////////////////////////////////////////////////////////////////////////////////////////////////
513 | // sigmoid
514 |
515 | __kernel void array_sigmoid_f32(__global float *a,
516 |                                 __global float *b) {
517 |     uintptr_t i = get_global_id(0);
518 |     b[i] = sigmoid(a[i]);
519 | }
520 |
521 | ////////////////////////////////////////////////////////////////////////////////////////////////////
522 | // derivative of sigmoid
523 |
524 | __kernel void array_dsigmoid_f32(__global float *a,
525 |                                  __global float *b) {
526 |     uintptr_t i = get_global_id(0);
527 |     // dsigmoid = sigmoid(x)*(1 - sigmoid(x))
528 |     b[i] = sigmoid(a[i]);
529 |     b[i] = b[i]*(1.0 - b[i]);
530 | }
531 |
532 | ////////////////////////////////////////////////////////////////////////////////////////////////////
533 | // log
534 |
535 | __kernel void array_log_f32(__global float *a,
536 |                             __global float *b) {
537 |     uintptr_t i = get_global_id(0);
538 |     b[i] = log(a[i]);
539 | }
540 |
541 | ////////////////////////////////////////////////////////////////////////////////////////////////////
542 | // exp
543 |
544 | __kernel void array_exp_f32(__global float *a,
545 |                             __global float *b) {
546 |     uintptr_t i = get_global_id(0);
547 |     b[i] = exp(a[i]);
548 | }
549 |
550 | ////////////////////////////////////////////////////////////////////////////////////////////////////
551 | // negate
552 |
553 | __kernel void array_negate_f32(__global float *a,
554 |                                __global float *b) {
555 |     uintptr_t i = get_global_id(0);
556 |     b[i] = -a[i];
557 | }
558 |
559 | ////////////////////////////////////////////////////////////////////////////////////////////////////
560 | // sgd
561 |
562 | __kernel void array_sgd_f32(__global float *x, __global float *dx, float learn_rate) {
563 |     uintptr_t i = get_global_id(0);
564 |     x[i] += learn_rate*dx[i];
565 | }
566 |
567 | ////////////////////////////////////////////////////////////////////////////////////////////////////
568 | // rmsprop
569 |
570 | __kernel void array_rmsprop_f32(__global float *x, __global float *dx, __global float *cache,
571 |                                 float learn_rate, float decay_rate, float eps) {
572 |     uintptr_t i = get_global_id(0);
573 |     cache[i] = decay_rate * cache[i] + (1.f - decay_rate) * dx[i]*dx[i];
574 |     x[i] += learn_rate*dx[i] / (sqrt(cache[i]) + eps);
575 | }
576 |
--------------------------------------------------------------------------------
/src/ops.rs:
--------------------------------------------------------------------------------
1 | use std::rc::Rc;
2 | use std::cell::Ref;
3 |
4 | use context::Context;
5 | use num::Num;
6 | use tensor::{Event, Tensor, TensorView};
7 | use range_arg::RangeArg;
8 |
9 | pub fn copy_to<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
10 |     let kernel = ctx.kernels().copy_to::<T>();
11 |
12 |     kernel.set_arg(0, a);
13 |     kernel.set_arg(1, output);
14 |
15 |     output.set_event(Rc::new(ctx.queue
16 |                                 .enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
17 |                                                       None, &**a.get_event())));
18 | }
19 |
20 | pub fn fill<T: Num>(ctx: &Context, a: &Tensor<T>, val: T) {
21 |     let kernel = ctx.kernels().fill::<T>();
22 |
23 |     kernel.set_arg(0, a);
24 |     kernel.set_arg(1, &val);
25 |
26 |     a.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(), None, ())));
27 | }
28 |
29 | pub fn sum<T: Num>(ctx: &Context, a: &Tensor<T>, axis: usize, b: &Tensor<T>) {
30 |     let kernel = ctx.kernels().sum::<T>();
31 |
32 |     kernel.set_arg(0, a);
33 |     kernel.set_arg(1, b);
34 |     kernel.set_arg(2, &a.shape()[0]);
35 |     kernel.set_arg(3, &a.shape()[1]);
36 |     kernel.set_arg(4, &axis);
37 |
38 |     let keep_dim = [a.shape()[1], a.shape()[0]][axis];
39 |
40 |     let new_event = {
41 |         ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, keep_dim, None, &**a.get_event())
42 |     };
43 |     b.set_event(Rc::new(new_event));
44 | }
45 |
46 | pub fn add<T: Num>(ctx: &Context, a: &Tensor<T>, axis: i32, b: &Tensor<T>, output: &Tensor<T>) {
47 |     let kernel = ctx.kernels().add::<T>();
48 |
49 |     kernel.set_arg(0, a);
50 |     kernel.set_arg(1, b);
51 |     kernel.set_arg(2, output);
52 |     kernel.set_arg(3, &a.shape()[1]);
53 |     kernel.set_arg(4, &axis);
54 |
55 |     let new_event = {
56 |         let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
57 |         ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.shape()[0], a.shape()[1]), None, event_list)
58 |     };
59 |     output.set_event(Rc::new(new_event));
60 | }
61 |
62 | pub fn sub<T: Num>(ctx: &Context, a: &Tensor<T>, b: &Tensor<T>, output: &Tensor<T>) {
63 |     let kernel = ctx.kernels().sub::<T>();
64 |
65 |     kernel.set_arg(0, a);
66 |     kernel.set_arg(1, b);
67 |     kernel.set_arg(2, output);
68 |
69 |     let new_event = {
70 |         let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
71 |         ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(), None, event_list)
72 |     };
73 |     output.set_event(Rc::new(new_event));
74 | }
75 |
76 | pub fn multiply<T: Num>(ctx: &Context, a: &Tensor<T>, axis: i32, b: &Tensor<T>, output: &Tensor<T>) {
77 |     let kernel = ctx.kernels().multiply::<T>();
78 |
79 |     kernel.set_arg(0, a);
80 |     kernel.set_arg(1, b);
81 |     kernel.set_arg(2, output);
82 |     kernel.set_arg(3, &a.shape()[1]);
83 |     kernel.set_arg(4, &axis);
84 |
85 |     let new_event = {
86 |         let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
87 |         ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.shape()[0], a.shape()[1]), None, event_list)
88 |     };
89 |     output.set_event(Rc::new(new_event));
90 | }
91 |
92 | pub fn divide<T: Num>(ctx: &Context, a: &Tensor<T>, axis: i32, b: &Tensor<T>, output: &Tensor<T>) {
93 |     let kernel = ctx.kernels().divide::<T>();
94 |
95 |     kernel.set_arg(0, a);
96 |     kernel.set_arg(1, b);
97 |     kernel.set_arg(2, output);
98 |     kernel.set_arg(3, &a.shape()[1]);
99 |     kernel.set_arg(4, &axis);
100 |
101 |     let new_event = {
102 |         let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
103 |         ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.shape()[0], a.shape()[1]), None, event_list)
104 |     };
105 |     output.set_event(Rc::new(new_event));
106 | }
107 |
108 | pub fn transpose<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
109 |     let kernel = ctx.kernels().transpose::<T>();
110 |
111 |     kernel.set_arg(0, a);
112 |     kernel.set_arg(1, output);
113 |     kernel.set_arg(2, &a.shape()[0]);
114 |     kernel.set_arg(3, &a.shape()[1]);
115 |
116 |     output.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.shape()[0], a.shape()[1]),
117 |                                                             None, &**a.get_event())));
118 | }
119 |
120 | pub fn matmul<T: Num>(ctx: &Context, a: &Tensor<T>, b: &Tensor<T>, output: &Tensor<T>) {
121 |     let kernel = ctx.kernels().matmul::<T>();
122 |
123 |     kernel.set_arg(0, a);
124 |     kernel.set_arg(1, b);
pub fn matmul<T: Num>(ctx: &Context, a: &Tensor<T>, b: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().matmul::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, output);
    kernel.set_arg(3, &a.shape()[1]);
    kernel.set_arg(4, &b.shape()[1]);

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel,
                                       (a.shape()[0], b.shape()[1]),
                                       None, event_list)
    };
    output.set_event(Rc::new(new_event));
}

pub fn max<T: Num>(ctx: &Context, a: &Tensor<T>, threshold: T, output: &Tensor<T>) {
    let kernel = ctx.kernels().max::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);
    kernel.set_arg(2, &threshold);

    output.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                            None, &**a.get_event())));
}

pub fn dmax<T: Num>(ctx: &Context, a: &Tensor<T>, threshold: T, output: &Tensor<T>) {
    let kernel = ctx.kernels().dmax::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);
    kernel.set_arg(2, &threshold);

    output.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                            None, &**a.get_event())));
}

pub fn min<T: Num>(ctx: &Context, a: &Tensor<T>, threshold: T, output: &Tensor<T>) {
    let kernel = ctx.kernels().min::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);
    kernel.set_arg(2, &threshold);

    output.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                            None, &**a.get_event())));
}

pub fn dmin<T: Num>(ctx: &Context, a: &Tensor<T>, threshold: T, output: &Tensor<T>) {
    let kernel = ctx.kernels().dmin::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);
    kernel.set_arg(2, &threshold);

    output.set_event(Rc::new(ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                            None, &**a.get_event())));
}

pub fn mse<T: Num>(ctx: &Context, a: &Tensor<T>, train: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().mse::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, train);
    kernel.set_arg(2, output);
    kernel.set_arg(3, &a.shape()[0]);
    kernel.set_arg(4, &a.shape()[1]);

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), train.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel,
                                       a.shape()[1],
                                       None, event_list)
    };
    output.set_event(Rc::new(new_event));
}

pub fn dmse<T: Num>(ctx: &Context, a: &Tensor<T>, train: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().dmse::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, train);
    kernel.set_arg(2, output);
    kernel.set_arg(3, &a.shape()[0]);
    kernel.set_arg(4, &a.shape()[1]);

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), train.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel,
                                       (a.shape()[0], a.shape()[1]),
                                       None, event_list)
    };
    output.set_event(Rc::new(new_event));
}

pub fn tanh<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().tanh::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

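// The `d*` functions compute the derivative of the corresponding activation,
// evaluated elementwise on the input: judging from the tests below, `dtanh`
// yields `1 - tanh(x)^2` and `dsigmoid` yields `sigmoid(x) * (1 - sigmoid(x))`.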
pub fn dtanh<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().dtanh::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn sigmoid<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().sigmoid::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn dsigmoid<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().dsigmoid::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn log<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().log::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn exp<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().exp::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn negate<T: Num>(ctx: &Context, a: &Tensor<T>, output: &Tensor<T>) {
    let kernel = ctx.kernels().negate::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, output);

    let new_event = ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, a.len(),
                                                   None, &**a.get_event());
    output.set_event(Rc::new(new_event));
}

pub fn sgd<T: Num>(ctx: &Context, x: &Tensor<T>, dx: &Tensor<T>, learn_rate: f32) {
    let kernel = ctx.kernels().sgd::<T>();

    kernel.set_arg(0, x);
    kernel.set_arg(1, dx);
    kernel.set_arg(2, &learn_rate);

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[x.get_event(), dx.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, x.len(), None, event_list)
    };
    x.set_event(Rc::new(new_event));
}

pub fn rmsprop<T: Num>(ctx: &Context, x: &Tensor<T>, dx: &Tensor<T>, cache: &Tensor<T>,
                       learn_rate: f32, decay_rate: f32, eps: f32) {
    let kernel = ctx.kernels().rmsprop::<T>();

    kernel.set_arg(0, x);
    kernel.set_arg(1, dx);
    kernel.set_arg(2, cache);
    kernel.set_arg(3, &learn_rate);
    kernel.set_arg(4, &decay_rate);
    kernel.set_arg(5, &eps);

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[x.get_event(), dx.get_event(), cache.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, x.len(), None, event_list)
    };
    // Both `x` and `cache` are updated by the kernel, so both get the new event
    let new_event = Rc::new(new_event);
    cache.set_event(new_event.clone());
    x.set_event(new_event);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

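// Helpers for the slice (TensorView) ops below. The device-side kernels take
// the dimension strides (`dim_steps`) and the per-dimension view offsets
// packed into fixed `[u64; 4]` arrays (presumably `ulong4` on the OpenCL
// side), right-aligned so that the last component always corresponds to the
// innermost dimension. The global work size assembled in each slice op is a
// 3-element array, so up to three sliced dimensions are launched at once.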
fn dim_steps_as_ulong4(dim_steps: &[usize]) -> [u64; 4] {
    let mut array = [0u64; 4];
    let array_len = array.len();

    for i in 0..dim_steps.len() {
        array[array_len-1-i] = dim_steps[dim_steps.len()-1-i] as u64;
    }
    for i in dim_steps.len()..array_len {
        array[array_len-1-i] = dim_steps[0] as u64;
    }

    array
}

fn tensor_view_offsets_as_ulong4<T: Num, R: AsRef<[RangeArg]>>(t: &TensorView<T, R>) -> [u64; 4] {
    let mut array = [0u64; 4];
    let array_len = array.len();

    for i in 0..t.shape.len() {
        array[array_len-1-i] = t.view_offset(t.shape.len()-1-i) as u64;
    }
    for i in t.shape.len()..array_len {
        array[array_len-1-i] = 0;
    }

    array
}

pub fn fill_slice<T: Num, R: AsRef<[RangeArg]>>(ctx: &Context, a: &TensorView<T, R>, val: T) {
    let kernel = ctx.kernels().fill_slice::<T>();

    let a_dim_steps = dim_steps_as_ulong4(a.dim_steps);
    let a_offsets = tensor_view_offsets_as_ulong4(a);

    kernel.set_arg(0, a);
    kernel.set_arg(1, &val);
    kernel.set_arg(2, &a_dim_steps);
    kernel.set_arg(3, &a_offsets);

    let mut work_dim = [1; 3];
    for i in 0..a.shape.len() {
        work_dim[2-i] = a.view_shape(a.shape.len()-1-i);
    }

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, work_dim, None, &**a.get_event())
    };
    a.set_event(Rc::new(new_event));
}

pub fn copy_to_slice<T: Num, AR, BR>(ctx: &Context,
                                     a: &TensorView<T, AR>,
                                     b: &TensorView<T, BR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().copy_to_slice::<T>();

    let a_dim_steps = dim_steps_as_ulong4(a.dim_steps);
    let b_dim_steps = dim_steps_as_ulong4(b.dim_steps);

    let a_offsets = tensor_view_offsets_as_ulong4(a);
    let b_offsets = tensor_view_offsets_as_ulong4(b);

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, &a_dim_steps);
    kernel.set_arg(3, &a_offsets);
    kernel.set_arg(4, &b_dim_steps);
    kernel.set_arg(5, &b_offsets);

    let mut work_dim = [1; 3];
    for i in 0..a.shape.len() {
        work_dim[2-i] = a.view_shape(a.shape.len()-1-i);
    }

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, work_dim, None, &**a.get_event())
    };
    b.set_event(Rc::new(new_event));
}

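// Elementwise addition over tensor views: reads the elements selected by the
// views `a` and `b` and writes their sums to the elements selected by `out`.
// A minimal usage sketch (the arrays and ranges here are made up for
// illustration; `test_add_slice` below exercises the real behavior):
//
//     let a_t = Tensor::from_array(ctx, &a_array, TensorMode::Mut);
//     let b_t = Tensor::from_array(ctx, &b_array, TensorMode::Mut);
//     let out_t = Tensor::from_array(ctx, &out_array, TensorMode::Mut);
//     // Add two elements of row 1 of `a_t` to two elements of row 1 of `b_t`,
//     // storing the result in the first two elements of row 0 of `out_t`.
//     add_slice(ctx, &a_t.slice(s![1, 0..2]), &b_t.slice(s![1, 1..3]), &out_t.slice(s![0, 0..2]));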
pub fn add_slice<T: Num, AR, BR, CR>(ctx: &Context,
                                     a: &TensorView<T, AR>,
                                     b: &TensorView<T, BR>,
                                     out: &TensorView<T, CR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
          CR: AsRef<[RangeArg]>
{
    let kernel = ctx.kernels().add_slice::<T>();

    let a_dim_steps = dim_steps_as_ulong4(a.dim_steps);
    let b_dim_steps = dim_steps_as_ulong4(b.dim_steps);
    let out_dim_steps = dim_steps_as_ulong4(out.dim_steps);

    let a_offsets = tensor_view_offsets_as_ulong4(a);
    let b_offsets = tensor_view_offsets_as_ulong4(b);
    let out_offsets = tensor_view_offsets_as_ulong4(out);

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, out);
    kernel.set_arg(3, &a_dim_steps);
    kernel.set_arg(4, &a_offsets);
    kernel.set_arg(5, &b_dim_steps);
    kernel.set_arg(6, &b_offsets);
    kernel.set_arg(7, &out_dim_steps);
    kernel.set_arg(8, &out_offsets);

    let mut work_dim = [1; 3];
    for i in 0..a.shape.len() {
        work_dim[2-i] = a.view_shape(a.shape.len()-1-i);
    }

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, work_dim, None, event_list)
    };
    out.set_event(Rc::new(new_event));
}

pub fn multiply_slice<T: Num, AR, BR, CR>(ctx: &Context,
                                          a: &TensorView<T, AR>,
                                          b: &TensorView<T, BR>,
                                          out: &TensorView<T, CR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
          CR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().multiply_slice::<T>();

    let a_dim_steps = dim_steps_as_ulong4(a.dim_steps);
    let b_dim_steps = dim_steps_as_ulong4(b.dim_steps);
    let out_dim_steps = dim_steps_as_ulong4(out.dim_steps);

    let a_offsets = tensor_view_offsets_as_ulong4(a);
    let b_offsets = tensor_view_offsets_as_ulong4(b);
    let out_offsets = tensor_view_offsets_as_ulong4(out);

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, out);
    kernel.set_arg(3, &a_dim_steps);
    kernel.set_arg(4, &a_offsets);
    kernel.set_arg(5, &b_dim_steps);
    kernel.set_arg(6, &b_offsets);
    kernel.set_arg(7, &out_dim_steps);
    kernel.set_arg(8, &out_offsets);

    let mut work_dim = [1; 3];
    for i in 0..a.shape.len() {
        work_dim[2-i] = a.view_shape(a.shape.len()-1-i);
    }

    let new_event = {
        let event_list: &[Ref<Rc<Event>>] = &[a.get_event(), b.get_event()];
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, work_dim, None, event_list)
    };
    out.set_event(Rc::new(new_event));
}

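// The activation variants below (`sigmoid_slice`, `dsigmoid_slice`,
// `tanh_slice`, `dtanh_slice`) do not use the stride/offset packing above;
// they pass explicit row and column offsets plus the row widths of `a` and
// `b`, so they appear to assume two-dimensional views.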
pub fn sigmoid_slice<T: Num, AR, BR>(ctx: &Context,
                                     a: &TensorView<T, AR>,
                                     b: &TensorView<T, BR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().sigmoid_slice::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, &a.view_offset(0));
    kernel.set_arg(3, &a.view_offset(1));
    kernel.set_arg(4, &b.view_offset(0));
    kernel.set_arg(5, &b.view_offset(1));
    kernel.set_arg(6, &a.shape[1]);
    kernel.set_arg(7, &b.shape[1]);

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.view_shape(0), a.view_shape(1)), None, &**a.get_event())
    };
    b.set_event(Rc::new(new_event));
}

pub fn dsigmoid_slice<T: Num, AR, BR>(ctx: &Context,
                                      a: &TensorView<T, AR>,
                                      b: &TensorView<T, BR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().dsigmoid_slice::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, &a.view_offset(0));
    kernel.set_arg(3, &a.view_offset(1));
    kernel.set_arg(4, &b.view_offset(0));
    kernel.set_arg(5, &b.view_offset(1));
    kernel.set_arg(6, &a.shape[1]);
    kernel.set_arg(7, &b.shape[1]);

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.view_shape(0), a.view_shape(1)), None, &**a.get_event())
    };
    b.set_event(Rc::new(new_event));
}

pub fn tanh_slice<T: Num, AR, BR>(ctx: &Context,
                                  a: &TensorView<T, AR>,
                                  b: &TensorView<T, BR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().tanh_slice::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, &a.view_offset(0));
    kernel.set_arg(3, &a.view_offset(1));
    kernel.set_arg(4, &b.view_offset(0));
    kernel.set_arg(5, &b.view_offset(1));
    kernel.set_arg(6, &a.shape[1]);
    kernel.set_arg(7, &b.shape[1]);

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.view_shape(0), a.view_shape(1)), None, &**a.get_event())
    };
    b.set_event(Rc::new(new_event));
}

pub fn dtanh_slice<T: Num, AR, BR>(ctx: &Context,
                                   a: &TensorView<T, AR>,
                                   b: &TensorView<T, BR>)
    where AR: AsRef<[RangeArg]>,
          BR: AsRef<[RangeArg]>,
{
    let kernel = ctx.kernels().dtanh_slice::<T>();

    kernel.set_arg(0, a);
    kernel.set_arg(1, b);
    kernel.set_arg(2, &a.view_offset(0));
    kernel.set_arg(3, &a.view_offset(1));
    kernel.set_arg(4, &b.view_offset(0));
    kernel.set_arg(5, &b.view_offset(1));
    kernel.set_arg(6, &a.shape[1]);
    kernel.set_arg(7, &b.shape[1]);

    let new_event = {
        ctx.queue.enqueue_async_kernel(&ctx.ctx, &kernel, (a.view_shape(0), a.view_shape(1)), None, &**a.get_event())
    };
    b.set_event(Rc::new(new_event));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

#[cfg(test)]
use array::Array;
#[cfg(test)]
use tensor::TensorMode;

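// Note: every test below constructs a real `Context`, so running them
// requires a working OpenCL platform and device.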
#[test]
fn tensor_fill() {
    let ref ctx = Context::new();

    let a: Array<i32> = Array::from_vec(vec![5, 3], vec![0i32; 15]);

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);

    fill(ctx, &a_cl, 42);
    let result = a_cl.get(ctx);

    assert!(result.buffer() == &[42; 15]);
}

#[test]
fn tensor_transpose() {
    let ref ctx = Context::new();

    let a: Array<i32> = Array::from_vec(vec![5, 3], (0..15).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::new(ctx, vec![3, 5], TensorMode::Out);

    transpose(ctx, &a_cl, &b_cl);
    let b = b_cl.get(ctx);

    assert!(b.buffer() == &[0, 3, 6, 9, 12,
                            1, 4, 7, 10, 13,
                            2, 5, 8, 11, 14]);
}

#[test]
fn tensor_sum_axis0() {
    let ref ctx = Context::new();

    let a: Array<i32> = Array::from_vec(vec![5, 3], (0..15).collect());
    let b = Array::from_vec(vec![1, 3], (0..3).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::Out);

    sum(ctx, &a_cl, 0, &b_cl);
    let b = b_cl.get(ctx);

    assert!(b.buffer() == &[30, 35, 40]);
}

#[test]
fn tensor_sum_axis1() {
    let ref ctx = Context::new();

    let a: Array<i32> = Array::from_vec(vec![5, 3], (0..15).collect());
    let b = Array::from_vec(vec![5, 1], (0..5).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::Out);

    sum(ctx, &a_cl, 1, &b_cl);
    let b = b_cl.get(ctx);

    assert!(b.buffer() == &[3, 12, 21, 30, 39]);
}

#[test]
fn tensor_add() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![5, 10000], (0..5*10000).collect());
    let b = Array::from_vec(vec![5, 10000], (0..5*10000).map(|x| x*2).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);
    let c_cl: Tensor<i32> = Tensor::new(ctx, vec![5, 10000], TensorMode::Out);

    add(ctx, &a_cl, -1, &b_cl, &c_cl);

    let c = c_cl.get(ctx);

    for i in 0..5 {
        for j in 0..10000 {
            assert!(c[&[i, j]] == a[&[i, j]] + b[&[i, j]]);
        }
    }
}

#[test]
fn tensor_add_reuse() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![1, 10000], (0..10000).collect());
    let b = Array::from_vec(vec![1, 10000], (0..10000).map(|x| x*2).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);

    add(ctx, &a_cl, -1, &b_cl, &b_cl); // b = a+b
}

#[test]
fn tensor_add_axis() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![5, 3], (0..15).collect());
    let b = Array::from_vec(vec![1, 3], (0..3).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);

    add(ctx, &a_cl, 0, &b_cl, &a_cl); // a = a+b

    let a = a_cl.get(ctx);

    assert!(a.buffer() == &[0, 2, 4,
                            3, 5, 7,
                            6, 8, 10,
                            9, 11, 13,
                            12, 14, 16]);
}

#[test]
fn tensor_multiply_axis1() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![3, 5], (0..15).collect());
    let b = Array::from_vec(vec![3, 1], (0..3).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);

    multiply(ctx, &a_cl, 1, &b_cl, &a_cl); // a = a*b

    let a = a_cl.get(ctx);

    assert!(a.buffer() == &[0, 0, 0, 0, 0,
                            5, 6, 7, 8, 9,
                            20, 22, 24, 26, 28]);
}

#[test]
fn tensor_divide_axis1() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![2, 5], (1..11).map(|x| x as f32).collect());
    let b = Array::from_vec(vec![2, 1], (1..3).map(|x| x as f32).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);

    divide(ctx, &a_cl, 1, &b_cl, &a_cl); // a = a/b

    let a = a_cl.get(ctx);

    assert!(a.buffer() == &[1.0, 2.0, 3.0, 4.0, 5.0,
                            3.0, 3.5, 4.0, 4.5, 5.0]);
}

#[test]
fn tensor_matmul() {
    let ref ctx = Context::new();

    let a = Array::from_vec(vec![3, 5], (0i32..15).collect());
    let b = Array::from_vec(vec![5, 2], (0..10).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let b_cl = Tensor::from_array(ctx, &b, TensorMode::In);
    let c_cl = Tensor::new(ctx, vec![3, 2], TensorMode::In);

    matmul(ctx, &a_cl, &b_cl, &c_cl); // c = a*b

    let c = c_cl.get(ctx);

    println!("{:?}", c);

    assert!(c.buffer() == &[60, 70,
                            160, 195,
                            260, 320]);
}

#[test]
fn tensor_tanh() {
    let ref ctx = Context::new();

    let a: Array<f32> = Array::from_vec(vec![5, 3], (0..15).map(|x| x as f32).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::new(ctx, vec![5, 3], TensorMode::Out);

    tanh(ctx, &a_cl, &b_cl);
    let b = b_cl.get(ctx);

    println!("{:?}", b);
    assert!(b.buffer() == &[0.0, 0.7615941, 0.9640276,
                            0.9950548, 0.9993293, 0.99990916,
                            0.99998766, 0.99999833, 0.99999976,
                            1.0, 1.0, 1.0,
                            1.0, 1.0, 1.0]);
}

#[test]
fn tensor_dtanh() {
    let ref ctx = Context::new();

    let a: Array<f32> = Array::from_vec(vec![5, 3], (0..15).map(|x| x as f32).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::new(ctx, vec![5, 3], TensorMode::Out);

    dtanh(ctx, &a_cl, &b_cl);
    let b = b_cl.get(ctx);

    println!("{:?}", b);
    assert!(b.buffer() == &[1.0, 0.4199744, 0.070650816,
                            0.009865999, 0.0013408661, 0.00018167496,
                            0.000024676323, 0.00000333786, 0.00000047683716,
                            0.0, 0.0, 0.0,
                            0.0, 0.0, 0.0]);
}

#[test]
fn tensor_sigmoid() {
    let ref ctx = Context::new();

    let a: Array<f32> = Array::from_vec(vec![5, 3], (0..15).map(|x| x as f32).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::new(ctx, vec![5, 3], TensorMode::Out);

    sigmoid(ctx, &a_cl, &b_cl);
    let b = b_cl.get(ctx);

    println!("{:?}", b);
    assert!(b.buffer() == &[0.5, 0.7310586, 0.880797,
                            0.9525742, 0.98201376, 0.9933072,
                            0.9975274, 0.999089, 0.99966466,
                            0.9998766, 0.9999546, 0.9999833,
                            0.9999938, 0.99999774, 0.99999917]);
}

#[test]
fn tensor_dsigmoid() {
    let ref ctx = Context::new();

    let a: Array<f32> = Array::from_vec(vec![5, 3], (0..15).map(|x| x as f32).collect());

    let a_cl = Tensor::from_array(ctx, &a, TensorMode::In);
    let b_cl = Tensor::new(ctx, vec![5, 3], TensorMode::Out);

    dsigmoid(ctx, &a_cl, &b_cl);
    let b = b_cl.get(ctx);

    println!("{:?}", b);
    assert!(b.buffer() == &[0.25, 0.19661193, 0.10499363,
                            0.0451766, 0.017662734, 0.006648033,
                            0.0024664658, 0.00091016747, 0.00033522327,
                            0.0001233664, 0.000045416677, 0.000016689022,
                            0.000006198845, 0.0000022649713, 0.00000083446434]);
}

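// The slice tests share one pattern: `s![..]` builds the `RangeArg` list,
// `Tensor::slice` turns it into a `TensorView`, and the `*_slice` ops touch
// only the selected elements. In `test_add_slice`, for example,
// `at.slice(s![0, 1..3, 1])` selects the elements at `[0, 1, 1]` and
// `[0, 2, 1]` (7 and 11), which are added to `b[1, 3]` and `b[2, 3]`
// (8 and 12) and written to `c[2, 0]` and `c[3, 0]`.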
#[test]
fn test_add_slice() {
    use array::Array;
    use context::Context;
    use tensor::{Tensor, TensorMode};

    let ref ctx = Context::new();

    let a = Array::from_vec(vec![2, 4, 3], vec![2, 3, 4,
                                                6, 7, 8,
                                                10, 11, 12,
                                                14, 15, 16,

                                                17, 18, 19,
                                                20, 21, 22,
                                                23, 24, 25,
                                                26, 27, 28]);
    let at = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let atv = at.slice(s![0, 1..3, 1]);

    let b = Array::from_vec(vec![4, 4], vec![1, 2, 3, 4,
                                             5, 6, 7, 8,
                                             9, 10, 11, 12,
                                             13, 14, 15, 16]);
    let bt = Tensor::from_array(ctx, &b, TensorMode::Mut);
    let btv = bt.slice(s![1..3, 3]);

    let c = Array::from_vec(vec![4, 4], vec![0; 16]);
    let ct = Tensor::from_array(ctx, &c, TensorMode::Mut);
    let ctv = ct.slice(s![2..4, 0]);

    add_slice(ctx, &btv, &atv, &ctv);
    println!("{:?}", ct.get(ctx));
    assert!(ct.get(ctx).buffer() == &[0, 0, 0, 0,
                                      0, 0, 0, 0,
                                      15, 0, 0, 0,
                                      23, 0, 0, 0]);
}

#[test]
fn test_multiply_slice() {
    use array::Array;
    use context::Context;
    use tensor::{Tensor, TensorMode};

    let ref ctx = Context::new();

    let a = Array::from_vec(vec![2, 4, 3], vec![2, 3, 4,
                                                6, 7, 8,
                                                10, 11, 12,
                                                14, 15, 16,

                                                17, 18, 19,
                                                20, 21, 22,
                                                23, 24, 25,
                                                26, 27, 28]);
    let at = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let atv = at.slice(s![0, 1..3, 1]);

    let b = Array::from_vec(vec![4, 4], vec![1, 2, 3, 4,
                                             5, 6, 7, 8,
                                             9, 10, 11, 12,
                                             13, 14, 15, 16]);
    let bt = Tensor::from_array(ctx, &b, TensorMode::Mut);
    let btv = bt.slice(s![1..3, 3]);

    let c = Array::from_vec(vec![4, 4], vec![0; 16]);
    let ct = Tensor::from_array(ctx, &c, TensorMode::Mut);
    let ctv = ct.slice(s![2..4, 0]);

    multiply_slice(ctx, &btv, &atv, &ctv);
    println!("{:?}", ct.get(ctx));
    assert!(ct.get(ctx).buffer() == &[0, 0, 0, 0,
                                      0, 0, 0, 0,
                                      56, 0, 0, 0,
                                      132, 0, 0, 0]);
}

#[test]
fn test_fill_slice() {
    use array::Array;
    use context::Context;
    use tensor::{Tensor, TensorMode};

    let ref ctx = Context::new();

    let a = Array::from_vec(vec![4, 3], vec![0i32; 12]);
    let at = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let atv = at.slice(s![1..3, 1]);

    fill_slice(ctx, &atv, 42);
    assert!(at.get(ctx).buffer() == &[0, 0, 0,
                                      0, 42, 0,
                                      0, 42, 0,
                                      0, 0, 0]);
}

#[test]
fn test_copy_to_slice() {
    use array::Array;
    use context::Context;
    use tensor::{Tensor, TensorMode};

    let ref ctx = Context::new();

    let a = Array::from_vec(vec![4, 3], vec![2i32, 3, 4,
                                             6, 7, 8,
                                             10, 11, 12,
                                             14, 15, 16]);
    let at = Tensor::from_array(ctx, &a, TensorMode::Mut);
    let atv = at.slice(s![1..3, 1]);

    let b = Array::from_vec(vec![4, 4], vec![0; 16]);
    let bt = Tensor::from_array(ctx, &b, TensorMode::Mut);
    let btv = bt.slice(s![2..4, 3]);

    copy_to_slice(ctx, &atv, &btv);
    assert!(bt.get(ctx).buffer() == &[0, 0, 0, 0,
                                      0, 0, 0, 0,
                                      0, 0, 0, 7,
                                      0, 0, 0, 11]);
}

--------------------------------------------------------------------------------