├── .gitignore ├── Contributors.md ├── benchmarks ├── xtime.rb ├── kostya_matmul.nim ├── kostya_matmul_lc_reshape.nim ├── kostya_matmul_hadamard.nim ├── kostya_matmul_mitems.nim ├── implementation │ ├── proc_method_closure_bench.nim │ └── stable_sigmoid_bench.nim └── ex01_xor.nim ├── src ├── autograd │ ├── utils.nim │ ├── accessors.nim │ ├── gates_reduce.nim │ ├── gates_basic.nim │ ├── gates_blas.nim │ └── autograd.nim ├── arraymancer_nn_primitives.nim ├── nn │ ├── layers │ │ ├── layer.nim │ │ └── linear.nim │ ├── loss │ │ ├── loss.nim │ │ └── sigmoid_cross_entropy.nim │ ├── activation │ │ ├── relu.nim │ │ └── sigmoid.nim │ └── optimizers │ │ └── optimizers.nim ├── arraymancer │ ├── comparison.nim │ ├── display_cuda.nim │ ├── backend │ │ ├── blis.nim │ │ ├── cuda_global_state.nim │ │ ├── openmp.nim │ │ ├── cublas_helper_proc.nim │ │ └── cuda.nim │ ├── filling_data.nim │ ├── math_functions.nim │ ├── aggregate.nim │ ├── exporting.nim │ ├── utils │ │ ├── ast_utils.nim │ │ ├── nested_containers.nim │ │ └── functional.nim │ ├── data_structure_helpers.nim │ ├── fallback │ │ ├── blas_l3_gemm_aux.nim │ │ ├── naive_l2_gemv.nim │ │ ├── blas_l3_gemm_macro_kernel.nim │ │ ├── blas_l3_gemm_packing.nim │ │ ├── blas_l3_gemm_micro_kernel.nim │ │ └── blas_l3_gemm.nim │ ├── shortcuts.nim │ ├── init_cpu_deprecated_0_2_0.nim │ ├── global_config.nim │ ├── accessors_cuda.nim │ ├── term_rewriting.nim │ ├── ufunc.nim │ ├── operators_blas_l2l3_cuda.nim │ ├── operators_blas_l1.nim │ ├── operators_blas_l1_cuda.nim │ ├── higher_order_deprecated.nim │ ├── elementwise_cuda.nim │ ├── shapeshifting_cuda.nim │ ├── higher_order_cuda.nim │ ├── init_cuda.nim │ └── display.nim ├── arraymancer_ag.nim ├── arraymancer_nn.nim ├── nn_primitives │ ├── linear_primitives.nim │ ├── activation_primitives.nim │ └── sigmoid_cross_entropy_primitives.nim └── arraymancer.nim ├── tests ├── all_tests_cuda.nim ├── tensors │ ├── test_optimization.nim │ ├── test_filling_data.nim │ ├── test_shapeshifting_deprecated.nim │ ├── test_bugtracker.nim │ ├── test_aggregate.nim │ ├── test_display.nim │ ├── test_comparison.nim │ ├── test_display_deprecated.nim │ ├── test_comparison_deprecated.nim │ ├── test_math_functions.nim │ ├── test_ufunc_deprecated.nim │ ├── test_init_deprecated.nim │ ├── test_accessors_deprecated.nim │ ├── test_higherorder.nim │ ├── test_aggregate_deprecated.nim │ ├── test_ufunc.nim │ ├── test_shapeshifting_cuda.nim │ ├── test_accessors.nim │ └── test_init.nim ├── manual_checks │ ├── autograd_mean_arraymancer.nim │ └── autograd_mean_pytorch.py ├── all_tests_deprecated.nim ├── all_tests.nim └── autograd │ └── test_gate_blas.nim ├── docs └── Linear algebra notation comparison.md ├── .appveyor.yml ├── .travis.yml ├── changelog.md ├── examples └── ex01_xor_perceptron_from_scratch.nim └── arraymancer.nimble /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | bin/ 3 | .DS_Store -------------------------------------------------------------------------------- /Contributors.md: -------------------------------------------------------------------------------- 1 | Arraymancer contributors (sorted alphabetically) 2 | 3 | ### Eduardo Bart 4 | - OpenMP 5 | - Several performance optimizations and fix including 6 | - Strided iterators 7 | - Uninitialized seq 8 | - Shapeshifting procs 9 | - Developing the ecosystem with [arraymancer-vision](https://github.com/edubart/arraymancer-vision) and [arraymancer-demos](https://github.com/edubart/arraymancer-demos) 10 | 11 | ### Mamy Ratsimbazafy 12 | - Lead dev -------------------------------------------------------------------------------- /benchmarks/xtime.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Copyright (c) 2014 'Konstantin Makarchev' 4 | # 5 | # MIT License 6 | # 7 | # https://github.com/kostya/benchmarks/ 8 | 9 | def mem(pid); `ps p #{pid} -o rss`.split.last.to_i; end 10 | t = Time.now 11 | pid = Process.spawn(*ARGV.to_a) 12 | mm = 0 13 | 14 | Thread.new do 15 | mm = mem(pid) 16 | while true 17 | sleep 0.1 18 | m = mem(pid) 19 | mm = m if m > mm 20 | end 21 | end 22 | 23 | Process.waitall 24 | STDERR.puts "%.2fs, %.1fMb" % [Time.now - t, mm / 1024.0] -------------------------------------------------------------------------------- /benchmarks/kostya_matmul.nim: -------------------------------------------------------------------------------- 1 | # From: https://github.com/kostya/benchmarks 2 | 3 | import os, strutils 4 | import ../arraymancer 5 | 6 | proc matgen(n: int): auto = 7 | result = newTensor(@[n,n],float64,Backend.Cpu) 8 | let tmp = 1.0 / (n*n).float64 9 | for i in 0 .. 0: 15 | n = parseInt(paramStr(1)) 16 | n = n div 2 * 2 17 | 18 | let a, b = matgen n 19 | let c = a * b 20 | 21 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8) 22 | 23 | # run with kostya_matmul 1500 -------------------------------------------------------------------------------- /benchmarks/kostya_matmul_lc_reshape.nim: -------------------------------------------------------------------------------- 1 | # From: https://github.com/kostya/benchmarks 2 | 3 | import os, strutils, future 4 | import ../arraymancer 5 | 6 | proc matgen(n: int): auto = 7 | result = newTensor(@[n,n],float64,Backend.Cpu) 8 | let tmp = 1.0 / (n*n).float64 9 | return lc[tmp * (i - j).float64 * (i + j).float64 | (i <- 0..0: 14 | n = parseInt(paramStr(1)) 15 | n = n div 2 * 2 16 | 17 | let a, b = matgen n 18 | let c = a * b 19 | 20 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8) 21 | 22 | # run with kostya_matmul 1500 -------------------------------------------------------------------------------- /src/autograd/utils.nim: -------------------------------------------------------------------------------- 1 | import macros, sequtils 2 | 3 | 4 | macro getSubType*(TT: typedesc): untyped = 5 | # Get the subtype T of an AnyTensor[T] input 6 | getTypeInst(TT)[1][1] 7 | 8 | 9 | ## The following should not be useful, if ops is possible in fprop 10 | ## Shape should be matching in bprop and we shoudln't need special scalar treatment 11 | 12 | # proc isScalar(t: AnyTensor): bool {.inline.}= 13 | # for dim in t.shape: 14 | # if dim != 1 and dim != 0: 15 | # return false 16 | # return true 17 | 18 | 19 | template product*[T: SomeNumber](s: seq[T]): T = 20 | ## Get the product of all numbers in a sequence or array 21 | s.foldl(a*b) -------------------------------------------------------------------------------- /benchmarks/kostya_matmul_hadamard.nim: -------------------------------------------------------------------------------- 1 | # From: https://github.com/kostya/benchmarks 2 | 3 | import os, strutils, sequtils 4 | import ../arraymancer 5 | 6 | proc matgen(n: int): auto = 7 | result = newTensor(@[n,n],float64,Backend.Cpu) 8 | let tmp = 1.0 / (n*n).float64 9 | let j_idx = @[toSeq(0..0: 16 | n = parseInt(paramStr(1)) 17 | n = n div 2 * 2 18 | 19 | let a, b = matgen n 20 | let c = a * b 21 | 22 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8) 23 | 24 | # run with kostya_matmul 1500 -------------------------------------------------------------------------------- /benchmarks/kostya_matmul_mitems.nim: -------------------------------------------------------------------------------- 1 | # From: https://github.com/kostya/benchmarks 2 | 3 | import os, strutils 4 | import ../arraymancer 5 | 6 | proc divmod[T: SomeInteger](n: T, b: T): (T, T) = 7 | ## return (n div base, n mod base) 8 | return (n div b, n mod b) 9 | 10 | proc matgen(n: int): auto = 11 | result = newTensor(@[n,n],float64,Backend.Cpu) 12 | let tmp = 1.0 / (n*n).float64 13 | var counter = 0 14 | for val in result.mitems: 15 | let (i, j) = counter.divmod(n) 16 | val = (i - j).float64 * (i + j).float64 * tmp 17 | inc counter 18 | 19 | var n = 100 20 | if paramCount()>0: 21 | n = parseInt(paramStr(1)) 22 | n = n div 2 * 2 23 | 24 | let a, b = matgen n 25 | let c = a * b 26 | 27 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8) 28 | 29 | # run with kostya_matmul 1500 -------------------------------------------------------------------------------- /tests/all_tests_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Please compile with -d:cuda switch 16 | import ../src/arraymancer, 17 | ./tensors/test_operators_blas_cuda, 18 | ./tensors/test_accessors_slicer_cuda, 19 | ./tensors/test_shapeshifting_cuda -------------------------------------------------------------------------------- /src/arraymancer_nn_primitives.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./arraymancer 16 | 17 | import ./nn_primitives/[activation_primitives, linear_primitives, sigmoid_cross_entropy_primitives] 18 | 19 | export activation_primitives, linear_primitives, sigmoid_cross_entropy_primitives -------------------------------------------------------------------------------- /src/nn/layers/layer.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer 16 | 17 | type Layer*[TT] = ref object of Gate[TT] 18 | ## Inherits from Gate (arity field) 19 | ## Add required fields for gradient descent 20 | weight*: TT 21 | bias*: TT 22 | dW*: TT 23 | dB*: TT -------------------------------------------------------------------------------- /tests/tensors/test_optimization.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, sequtils 17 | 18 | suite "Optimization": 19 | test "Test if contiguous slices are detected as contiguous": 20 | let a = [[1, 2, 3, 4, 5], 21 | [6, 7, 8, 9, 10]].toTensor 22 | 23 | check: a[1, 2..3].isContiguous == true -------------------------------------------------------------------------------- /src/arraymancer/comparison.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc `==`*[T](a,b: Tensor[T]): bool {.noSideEffect.}= 16 | ## Tensor comparison 17 | if a.shape != b.shape: return false 18 | 19 | for a, b in zip(a,b): 20 | ## Iterate through the tensors using stride-aware iterators 21 | ## Returns early if false 22 | if a != b: return false 23 | return true -------------------------------------------------------------------------------- /src/arraymancer_ag.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./autograd/utils, 16 | ./autograd/autograd, 17 | ./autograd/gates_basic, 18 | ./autograd/gates_blas, 19 | ./autograd/gates_reduce, 20 | ./autograd/accessors 21 | 22 | export autograd, 23 | gates_basic, 24 | gates_blas, 25 | gates_reduce, 26 | accessors -------------------------------------------------------------------------------- /src/arraymancer_nn.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./arraymancer, ./arraymancer_ag, ./arraymancer_nn_primitives 16 | 17 | import nn/activation/[sigmoid, relu], 18 | nn/layers/linear, 19 | nn/loss/sigmoid_cross_entropy, 20 | nn/optimizers/optimizers 21 | 22 | 23 | export sigmoid, relu 24 | export linear, sigmoid_cross_entropy 25 | export optimizers -------------------------------------------------------------------------------- /tests/tensors/test_filling_data.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | import ../../src/arraymancer 17 | import unittest, math, future, sequtils 18 | 19 | 20 | suite "Testing miscellaneous data functions": 21 | test "Copy data from source": 22 | let a = [[1,2],[3,4]].toTensor.reshape(2,2) 23 | 24 | var b = ones[int](4,1) 25 | 26 | b.copy_from(a) 27 | 28 | check: b == [[1],[2], [3], [4]].toTensor -------------------------------------------------------------------------------- /src/nn/loss/loss.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, typetraits 16 | 17 | 18 | type Loss* [TT] = ref object of Gate[TT] 19 | batch_size*: seq[int] 20 | target*: TT 21 | 22 | 23 | method forward*[TT](self: Loss[TT], a: Variable[TT], target: TT): Variable[TT] {.base, inline.}= 24 | # Forward for loss layers 25 | raise newException(ValueError, "forward method is not implemented for " & $self.type.name) -------------------------------------------------------------------------------- /src/autograd/accessors.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./autograd, ../arraymancer 16 | 17 | template `[]`*[TT](v: Variable[TT], args: varargs[untyped]): Variable[TT] = 18 | var result: type(v) 19 | new result 20 | 21 | result.tape = v.tape 22 | result.ancestor = v.ancestor 23 | result.value = v.value.unsafeSlice(args) 24 | result.grad = v.grad.unsafeSlice(args) 25 | 26 | result 27 | 28 | # TODO: tests for slicing correspondance -------------------------------------------------------------------------------- /tests/manual_checks/autograd_mean_arraymancer.nim: -------------------------------------------------------------------------------- 1 | 2 | import ../src/arraymancer, ../src/arraymancer_ag 3 | import sequtils 4 | 5 | let ctx = newContext Tensor[float32] 6 | 7 | let 8 | a = ctx.variable(toSeq(1..12).toTensor.reshape(3,4).astype(float32)) 9 | b = ctx.variable(toSeq(2..13).toTensor.reshape(3,4).astype(float32)) 10 | c = ctx.variable(toSeq(3..11).toTensor.reshape(3,3).astype(float32)) 11 | x = ctx.variable(toSeq(4..15).toTensor.reshape(4,3).astype(float32)) 12 | y = ctx.variable(toSeq(5..16).toTensor.reshape(4,3).astype(float32)) 13 | 14 | 15 | # for t in [a,b,c,x,y]: 16 | # echo t.value 17 | 18 | 19 | proc forwardNeuron[T](a,b,c,x,y: T): T = 20 | let 21 | ax = a * x 22 | by = b * y 23 | axpby = ax + by 24 | axpbypc = axpby + c 25 | # s = axpbypc.sigmoid() 26 | return axpbypc 27 | 28 | 29 | var s = mean forwardNeuron(a,b,c,x,y) 30 | 31 | 32 | echo s.value 33 | 34 | s.backprop 35 | 36 | echo a.grad 37 | 38 | echo b.grad 39 | 40 | echo c.grad 41 | 42 | echo x.grad 43 | 44 | echo y.grad -------------------------------------------------------------------------------- /docs/Linear algebra notation comparison.md: -------------------------------------------------------------------------------- 1 | | Language/lib | Normal matmul | element-wise matmul (Hadamard) | vec-vec dot product | mat-vec multiplication| 2 | | ------------- | ---------------------------- | --- | --- | --- | 3 | | Arraymancer | A * B | .* | dot(A, B) | A * B | 4 | | neo/linalg | A * B | \|*\| | A * B | A * B | 5 | | Julia & Matlab | A * B | .* | dot(A, B) | A * B | 6 | | Numpy ndarray| np.dot(A, B) or np.matmul(A, B) or A @ B| np.multiply(A, B) or A * B | np.dot(A, B) or np.inner(A, B) | np.dot(A, B) | 7 | | R | A %*% B | A * B | A %*% B or dot(A, B)| A %*% B | 8 | | Tensorflow | tf.matmul(A, B) or A @ B | tf.multiply(A, B) | tf.matmul(a, b, transpose_a=False, transpose_b=True) or tf.tensordot(a, b, 1) or tf.einsum('i,i->', x, y) | same reshape/transpose/einsum shenanigans as vec-vec| 9 | | Torch/PyTorch | torch.mm(A,B) or torch.matmul(A,B) | torch.cmul(A, B) | torch.dot(A, B) or torch.matmul(A, B) | torch.mv(A, B) or torch.dot(A, B) 10 | | Theano | theano.tensor.dot(A, B) | A * B | dot(A, B) or vdot(A, B) ?| dot(A, B) or tensordot(A,B) ? | 11 | | Common math | -------------------------------------------------------------------------------- /tests/all_tests_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../src/arraymancer, 16 | ./tensors/test_init_deprecated, 17 | ./tensors/test_comparison_deprecated, 18 | ./tensors/test_accessors_deprecated, 19 | ./tensors/test_accessors_slicer_deprecated, 20 | ./tensors/test_display_deprecated, 21 | ./tensors/test_operators_blas_deprecated, 22 | ./tensors/test_aggregate_deprecated, 23 | ./tensors/test_shapeshifting_deprecated, 24 | ./tensors/test_ufunc_deprecated 25 | -------------------------------------------------------------------------------- /src/arraymancer/display_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc `$`*[T](t: CudaTensor[T]): string = 16 | ## Pretty-print a CudaTensor (when using ``echo`` for example) 17 | let desc = "Tensor of shape " & t.shape.join("x") & " of type \"" & T.name & "\" on backend \"" & "Cuda" & "\"" 18 | 19 | let cpu_t = t.cpu() 20 | 21 | if t.rank <= 2: 22 | return desc & "\n" & cpu_t.disp2d 23 | elif t.rank == 3: 24 | return desc & "\n" & cpu_t.disp3d 25 | elif t.rank == 4: 26 | return desc & "\n" & cpu_t.disp4d 27 | else: 28 | return desc & "\n" & " -- NotImplemented: Display not implemented for tensors of rank > 4" -------------------------------------------------------------------------------- /benchmarks/implementation/proc_method_closure_bench.nim: -------------------------------------------------------------------------------- 1 | import times 2 | 3 | type FooBase = ref object {.inheritable.} 4 | dummy: int 5 | 6 | type Foo{.final.} = ref object of FooBase 7 | value : float32 8 | 9 | 10 | proc inplace_add_proc(x: var Foo, a: float32) = 11 | x.value += a 12 | 13 | proc inplace_add_closure(x: var float32, a: float32) = 14 | proc add_closure(v: var float32) = v += a 15 | add_closure(x) 16 | 17 | method inplace_add_method(x: FooBase, a: float32) {.base.} = 18 | discard 19 | 20 | method inplace_add_method(x: Foo, a: float32) = 21 | x.value += a 22 | 23 | var bar : Foo 24 | new bar 25 | var start = cpuTime() 26 | for i in 0..<100000000: 27 | inplace_add_proc(bar, 1.0f) 28 | echo " Proc with ref object ", cpuTime() - start 29 | 30 | var x : float32 31 | start = cpuTime() 32 | for i in 0..<100000000: 33 | inplace_add_closure(x, 1.0f) 34 | echo " Closures ", cpuTime() - start 35 | 36 | var baz : Foo 37 | new baz 38 | start = cpuTime() 39 | for i in 0..<100000000: 40 | inplace_add_method(baz, 1.0f) 41 | echo " Methods ", cpuTime() - start 42 | 43 | # Results with -d:release on i5-5257U (dual-core mobile 2.7GHz, turbo 3.1) 44 | # Proc with ref object 0.099993 45 | # Closures 2.708598 46 | # Methods 0.3122219999999998 -------------------------------------------------------------------------------- /tests/all_tests.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../src/arraymancer, 16 | ./tensors/test_init, 17 | ./tensors/test_comparison, 18 | ./tensors/test_accessors, 19 | ./tensors/test_accessors_slicer, 20 | ./tensors/test_display, 21 | ./tensors/test_operators_blas, 22 | ./tensors/test_math_functions, 23 | ./tensors/test_higherorder, 24 | ./tensors/test_aggregate, 25 | ./tensors/test_shapeshifting, 26 | ./tensors/test_broadcasting, 27 | ./tensors/test_ufunc, 28 | ./tensors/test_filling_data, 29 | ./tensors/test_optimization, 30 | ./tensors/test_bugtracker, 31 | ./autograd/test_gate_blas 32 | -------------------------------------------------------------------------------- /src/arraymancer/backend/blis.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | when defined(blis): 16 | static: echo "--USING BLIS--" 17 | include ./blis_api 18 | let blis_status = bli_init() 19 | echo "Blis initiatialization status: " & $blis_status 20 | 21 | proc quit_blis() {.noconv.}= 22 | when defined(debug): 23 | echo "Blis quit status: " & $bli_finalize() 24 | else: 25 | discard bli_finalize() 26 | addQuitProc(quit_blis) 27 | 28 | # else: 29 | # static: echo "Consider adding BLIS from \"https://github.com/flame/blis\" " & 30 | # "and compile Arraymancer with \"-d:blis\" " & 31 | # "for operations on array slices without copy. " & 32 | # "OSX users can install it through Homebrew." 33 | 34 | -------------------------------------------------------------------------------- /tests/tensors/test_shapeshifting_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, future, sequtils 17 | 18 | suite "Shapeshifting": 19 | test "Reshape": 20 | let a = toSeq(1..4).toTensor(Cpu).reshape(2,2) 21 | check: a == [[1,2], 22 | [3,4]].toTensor(Cpu) 23 | 24 | test "Concatenation": 25 | let a = toSeq(1..4).toTensor(Cpu).reshape(2,2) 26 | 27 | let b = toSeq(5..8).toTensor(Cpu).reshape(2,2) 28 | 29 | check: concat(a,b, axis = 0) == [[1,2], 30 | [3,4], 31 | [5,6], 32 | [7,8]].toTensor(Cpu) 33 | check: concat(a,b, axis = 1) == [[1,2,5,6], 34 | [3,4,7,8]].toTensor(Cpu) -------------------------------------------------------------------------------- /src/arraymancer/filling_data.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc check_size(a, b:AnyTensor) {.noSideEffect.}= 16 | ## Check if the total number of elements match 17 | if a.size != b.size: 18 | raise newException(ValueError, "Both Tensors should have the same total number of elements") 19 | 20 | proc copy_from*[T](dst: var Tensor[T], src: Tensor[T]) = 21 | ## Copy the data from a source Tensor. Both tensors must have the same number of elements 22 | ## but do not need to have the same shape. 23 | ## Data is copied without re-allocation. 24 | ## Warning ⚠ 25 | ## The destination tensor data will be overwritten. It however conserves its shape and strides. 26 | 27 | when compileOption("boundChecks"): 28 | check_size(dst, src) 29 | 30 | for x, val in mzip(dst, src): 31 | x = val -------------------------------------------------------------------------------- /src/arraymancer/math_functions.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Non-operator math functions 17 | 18 | proc reciprocal*[T: SomeReal](t: Tensor[T]): Tensor[T] = 19 | # Return a tensor with the reciprocal 1/x of all elements 20 | t.mapT(1.T/x) 21 | 22 | proc mreciprocal*[T: SomeReal](t: var Tensor[T]) = 23 | # Apply the reciprocal 1/x in-place to all elements of the Tensor 24 | t.applyT(1.T/x) 25 | 26 | proc negate*[T: SomeSignedInt|SomeReal](t: Tensor[T]): Tensor[T] = 27 | # Return a tensor with all elements negated (10 -> -10) 28 | t.mapT(-x) 29 | 30 | proc mnegate*[T: SomeSignedInt|SomeReal](t: var Tensor[T]) = 31 | # Negate in-place all elements of the tensor (10 -> -10) 32 | t.applyT(-x) 33 | 34 | proc `-`*[T: SomeNumber](t: Tensor[T]): Tensor[T] = 35 | ## Negate all values of a Tensor 36 | t.mapT(-x) -------------------------------------------------------------------------------- /src/arraymancer/aggregate.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # ### Standard aggregate functions 16 | # TODO consider using stats from Nim standard lib: https://nim-lang.org/docs/stats.html#standardDeviation,RunningStat 17 | 18 | proc sum*[T: SomeNumber](t: Tensor[T]): T {.noSideEffect.}= 19 | ## Compute the sum of all elements of T 20 | 21 | result = 0.T 22 | for val in t: 23 | result += val 24 | 25 | proc sum*[T: SomeNumber](t: Tensor[T], axis: int): Tensor[T] {.inline.}= 26 | ## Compute the sum of all elements of T along an axis 27 | t.reduce(`+`, axis = axis) 28 | 29 | proc mean*[T: SomeReal](t: Tensor[T]): T {.inline.}= 30 | ## Compute the mean of all elements of T 31 | t.sum / t.size.T 32 | 33 | proc mean*[T: SomeReal](t: Tensor[T], axis: int): Tensor[T] {.inline.}= 34 | ## Compute the mean of T along an axis 35 | t.sum(axis) / t.shape[axis].T -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | version: '{build}' 2 | 3 | cache: 4 | - nim-0.17.2_x64.zip 5 | - x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z 6 | - packages -> **\packages.config 7 | - '%LocalAppData%\NuGet\Cache -> **\packages.config' 8 | 9 | matrix: 10 | fast_finish: true 11 | 12 | environment: 13 | matrix: 14 | - MINGW_ARCHIVE: x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z 15 | MINGW_DIR: mingw64 16 | MINGW_URL: https://ayera.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z 17 | NIM_ARCHIVE: nim-0.17.2_x64.zip 18 | NIM_DIR: nim-0.17.2 19 | NIM_URL: https://nim-lang.org/download/nim-0.17.2_x64.zip 20 | platform: x64 21 | 22 | install: 23 | - MKDIR %CD%\tools_tmp 24 | - IF not exist "%MINGW_ARCHIVE%" appveyor DownloadFile "%MINGW_URL%" -FileName "%MINGW_ARCHIVE%" 25 | - 7z x -y "%MINGW_ARCHIVE%" -o"%CD%\tools_tmp"> nul 26 | - IF not exist "%NIM_ARCHIVE%" appveyor DownloadFile "%NIM_URL%" -FileName "%NIM_ARCHIVE%" 27 | - 7z x -y "%NIM_ARCHIVE%" -o"%CD%\tools_tmp"> nul 28 | - SET PATH=%CD%\tools_tmp\%NIM_DIR%\bin;%CD%\tools_tmp\%MINGW_DIR%\bin;%PATH% 29 | - ps: nuget install OpenBLAS -o "${env:APPVEYOR_BUILD_FOLDER}" 30 | - ps: cp OpenBLAS.0.2.14.1/lib/native/bin/x64/libopenblas.dll blas.dll 31 | - SET PATH=%PATH%;%CD% 32 | 33 | build_script: 34 | - nimble.exe refresh 35 | 36 | test_script: 37 | - nimble.exe test 38 | 39 | deploy: off 40 | -------------------------------------------------------------------------------- /src/arraymancer/backend/cuda_global_state.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ################################################### 17 | # Global Cuda and CuBLAS state 18 | 19 | # CuBLAS stream for parallel async processing on GPU 20 | # Computations/Memcpy on different streams are done in simultaneously 21 | # Streams are also necessary for async Cuda procs like cudaMemcpyAsync 22 | var defaultStream: cublas_api.cudaStream_t 23 | check cudaStreamCreate(addr defaultStream) 24 | 25 | # CuBLAS handle 26 | # Note: it prevents {.noSideEffect.} in all CuBLAS proc :/ 27 | var defaultHandle: cublasHandle_t 28 | check cublasCreate(addr defaultHandle) 29 | 30 | proc cudaRelease() {.noconv.}= 31 | # Release all cuda resources 32 | check cublasDestroy(defaultHandle) 33 | check cudaStreamDestroy(defaultStream) 34 | 35 | when defined(debug): 36 | echo "CUDA and CuBLAS resources successfully released" 37 | 38 | addQuitProc(cudaRelease) 39 | -------------------------------------------------------------------------------- /src/arraymancer/exporting.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc toRawSeq*[T](t:Tensor[T]): seq[T] {.noSideEffect.} = 16 | ## Convert a tensor to the raw sequence of data. 17 | 18 | # Due to forward declaration this proc must be declared 19 | # after "cpu" proc are declared in init_cuda 20 | when t is Tensor: 21 | return t.data 22 | elif t is CudaTensor: 23 | return t.cpu.data 24 | 25 | proc export_tensor*[T](t: Tensor[T]): 26 | tuple[shape: seq[int], strides: seq[int], data: seq[T]] {.noSideEffect.}= 27 | ## Export the tensor as a tuple containing 28 | ## - shape 29 | ## - strides 30 | ## - data 31 | ## If the tensor was not contiguous (a slice for example), it is reshaped. 32 | ## Data is exported in C order (last index changes the fastest, column in 2D case) 33 | 34 | let contig_t = t.unsafeContiguous 35 | 36 | result.shape = contig_t.shape 37 | result.strides = contig_t.strides 38 | result.data = contig_t.data -------------------------------------------------------------------------------- /src/arraymancer/utils/ast_utils.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Tools to manipulate Nim Abstract Syntax Tree 16 | 17 | proc hasType(x: NimNode, t: static[string]): bool {. compileTime .} = 18 | ## Compile-time type checking 19 | sameType(x, bindSym(t)) 20 | 21 | proc isInt(x: NimNode): bool {. compileTime .} = 22 | ## Compile-time type checking 23 | hasType(x, "int") 24 | 25 | proc isAllInt(slice_args: NimNode): bool {. compileTime .} = 26 | ## Compile-time type checking 27 | result = true 28 | for child in slice_args: 29 | # We don't use early return here as everything is evaluated at compile-time, 30 | # has no run-time impact and there are very few slice_args 31 | result = result and isInt(child) 32 | 33 | proc pop(tree: var NimNode): NimNode {. compileTime .}= 34 | ## varargs[untyped] consumes all arguments so the actual value should be popped 35 | ## https://github.com/nim-lang/Nim/issues/5855 36 | result = tree[tree.len-1] 37 | tree.del(tree.len-1) -------------------------------------------------------------------------------- /src/arraymancer/data_structure_helpers.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | proc isNaiveIterable(t: AnyTensor): bool {.inline.}= 17 | ## If t is not a slice we can iterate with a naive for loop 18 | return t.data.len == t.size 19 | 20 | proc isNaiveIterableWith(t1: AnyTensor, t2: AnyTensor): bool {.inline.}= 21 | ## If shape and strides are the same, we can iterate on both tensors at the same time 22 | ## modulo their offsets 23 | ## We don't need those to have data.len == size 24 | return (t1.strides == t2.strides) and (t1.shape == t2.shape) 25 | 26 | proc getTransposeTarget(t: AnyTensor): TransposeType {.noSideEffect.}= 27 | ## TransposeType is introduced by ``nimblas`` 28 | ## Default layout is Row major. 29 | ## Everytime it is worth it or fused with a BLAS operation we change the strides to Row-Major 30 | if is_C_contiguous(t): return TransposeType.noTranspose 31 | elif is_F_contiguous(t): return TransposeType.transpose 32 | else: raise newException(ValueError,"Operation not supported for this matrix. It has a non-contiguous layout") -------------------------------------------------------------------------------- /tests/tensors/test_bugtracker.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest 17 | 18 | 19 | suite "Testing specific issues from bug tracker": 20 | test "Span slicing inside dynamic type procs fails to compile": 21 | # https://github.com/mratsim/Arraymancer/issues/43 22 | proc boo[T](): T {.used.}= 23 | var a = zeros[int]([2,2]) 24 | echo a[1,_] #<-- Bug was undeclared identifier '_', 25 | # unfortunately there is no way to gracefully check this 26 | # with when not compiles for example 27 | 28 | # Check that our solution, export '_' doesn't create compatibility issue 29 | 30 | # tuple destructuring 31 | {.push hints: off.} ## TODO replaced by XDeclaredButNotUsed when https://github.com/nim-lang/Nim/issues/4044 32 | let (a, _, c) = (1, @[2,3],"hello") 33 | {.pop.} 34 | 35 | # https://github.com/mratsim/Arraymancer/issues/61 36 | proc foo[T](t: Tensor[T], x: int): Tensor[T] = 37 | t.unsafeSlice(x, _, _).unsafeReshape([t.shape[1], t.shape[2]]) 38 | 39 | discard zeros[int]([2,2,2]).foo(1) -------------------------------------------------------------------------------- /tests/manual_checks/autograd_mean_pytorch.py: -------------------------------------------------------------------------------- 1 | 2 | # Reference code 3 | 4 | import torch 5 | from torch.autograd import Variable 6 | 7 | a = Variable(torch.arange(1,13).view(3,4), requires_grad=True) 8 | b = Variable(torch.arange(2,14).view(3,4), requires_grad=True) 9 | c = Variable(torch.arange(3,12).view(3,3), requires_grad=True) 10 | x = Variable(torch.arange(4,16).view(4,3), requires_grad=True) 11 | y = Variable(torch.arange(5,17).view(4,3), requires_grad=True) 12 | 13 | 14 | def forwardNeuron(a,b,c,x,y): 15 | ax = a @ x 16 | by = b @ y 17 | axpby = ax + by 18 | axpbypc = axpby + c 19 | 20 | return axpbypc 21 | 22 | s = forwardNeuron(a,b,c,x,y).mean() 23 | 24 | print(s) 25 | Variable containing: 26 | 599 27 | [torch.FloatTensor of size 1] 28 | 29 | s.backward() 30 | 31 | print(a.grad) 32 | # Variable containing: 33 | # 1.6667 2.6667 3.6667 4.6667 34 | # 1.6667 2.6667 3.6667 4.6667 35 | # 1.6667 2.6667 3.6667 4.6667 36 | # [torch.FloatTensor of size 3x4] 37 | 38 | print(b.grad) 39 | # Variable containing: 40 | # 2 3 4 5 41 | # 2 3 4 5 42 | # 2 3 4 5 43 | # [torch.FloatTensor of size 3x4] 44 | 45 | print(c.grad) 46 | # Variable containing: 47 | # 0.1111 0.1111 0.1111 48 | # 0.1111 0.1111 0.1111 49 | # 0.1111 0.1111 0.1111 50 | # [torch.FloatTensor of size 3x3] 51 | 52 | print(x.grad) 53 | # Variable containing: 54 | # 1.6667 1.6667 1.6667 55 | # 2.0000 2.0000 2.0000 56 | # 2.3333 2.3333 2.3333 57 | # 2.6667 2.6667 2.6667 58 | # [torch.FloatTensor of size 4x3] 59 | 60 | print(y.grad) 61 | # Variable containing: 62 | # 2.0000 2.0000 2.0000 63 | # 2.3333 2.3333 2.3333 64 | # 2.6667 2.6667 2.6667 65 | # 3.0000 3.0000 3.0000 66 | # [torch.FloatTensor of size 4x3] -------------------------------------------------------------------------------- /tests/tensors/test_aggregate.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math 17 | 18 | suite "Testing aggregation functions": 19 | let t = [[0, 1, 2], 20 | [3, 4, 5], 21 | [6, 7, 8], 22 | [9, 10, 11]].toTensor() 23 | 24 | test "Sum all elements": 25 | check: t.sum == 66 26 | 27 | test "Sum over axis": 28 | let row_sum = [[18, 22, 26]].toTensor() 29 | let col_sum = [[3], 30 | [12], 31 | [21], 32 | [30]].toTensor() 33 | check: t.sum(axis=0) == row_sum 34 | check: t.sum(axis=1) == col_sum 35 | 36 | ## TODO: 3D axis sum 37 | test "Mean of all elements": 38 | check: t.astype(float).mean == 5.5 # Note: may fail due to float rounding 39 | 40 | test "Mean over axis": 41 | let row_mean = [[4.5, 5.5, 6.5]].toTensor() 42 | let col_mean = [[1.0], 43 | [4.0], 44 | [7.0], 45 | [10.0]].toTensor() 46 | check: t.astype(float).mean(axis=0) == row_mean 47 | check: t.astype(float).mean(axis=1) == col_mean -------------------------------------------------------------------------------- /tests/tensors/test_display.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import math, unittest 17 | 18 | 19 | suite "Displaying tensors": 20 | test "Display compiles": 21 | const 22 | a = @[1, 2, 3, 4, 5] 23 | b = @[1, 2, 3, 4, 5] 24 | 25 | var 26 | vandermonde: seq[seq[int]] 27 | row: seq[int] 28 | 29 | vandermonde = newSeq[seq[int]]() 30 | 31 | for i, aa in a: 32 | row = newSeq[int]() 33 | vandermonde.add(row) 34 | for j, bb in b: 35 | vandermonde[i].add(aa^bb) 36 | 37 | # @[@[1, 1, 1, 1, 1], @[2, 4, 8, 16, 32], @[3, 9, 27, 81, 243], @[4, 16, 64, 256, 1024], @[5, 25, 125, 625, 3125]] 38 | 39 | let t_van = vandermonde.toTensor() 40 | when not compiles(echo t_van): check: false 41 | 42 | # Tensor of shape 5x5 of type "int" on backend "Cpu" 43 | # |1 1 1 1 1| 44 | # |2 4 8 16 32| 45 | # |3 9 27 81 243| 46 | # |4 16 64 256 1024| 47 | # |5 25 125 625 3125| 48 | 49 | # TODO: Better display tests -------------------------------------------------------------------------------- /tests/tensors/test_comparison.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math 17 | 18 | 19 | suite "Testing tensor comparison": 20 | test "Testing for [1..^2, 1..3] slicing": 21 | const 22 | a = @[1, 2, 3, 4, 5] 23 | b = @[1, 2, 3, 4, 5] 24 | 25 | var 26 | vandermonde: seq[seq[int]] 27 | row: seq[int] 28 | 29 | vandermonde = newSeq[seq[int]]() 30 | 31 | for i, aa in a: 32 | row = newSeq[int]() 33 | vandermonde.add(row) 34 | for j, bb in b: 35 | vandermonde[i].add(aa^bb) 36 | 37 | let t_van = vandermonde.toTensor() 38 | 39 | # Tensor of shape 5x5 of type "int" on backend "Cpu" 40 | # |1 1 1 1 1| 41 | # |2 4 8 16 32| 42 | # |3 9 27 81 243| 43 | # |4 16 64 256 1024| 44 | # |5 25 125 625 3125| 45 | 46 | let test = @[@[4, 8, 16], @[9, 27, 81], @[16, 64, 256]] 47 | let t_test = test.toTensor() 48 | 49 | check: t_van[1..^2,1..3] == t_test 50 | check: t_van[1..3,1..3] == t_test -------------------------------------------------------------------------------- /tests/tensors/test_display_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import math, unittest 17 | 18 | 19 | suite "Displaying tensors": 20 | test "Display compiles": 21 | const 22 | a = @[1, 2, 3, 4, 5] 23 | b = @[1, 2, 3, 4, 5] 24 | 25 | var 26 | vandermonde: seq[seq[int]] 27 | row: seq[int] 28 | 29 | vandermonde = newSeq[seq[int]]() 30 | 31 | for i, aa in a: 32 | row = newSeq[int]() 33 | vandermonde.add(row) 34 | for j, bb in b: 35 | vandermonde[i].add(aa^bb) 36 | 37 | # @[@[1, 1, 1, 1, 1], @[2, 4, 8, 16, 32], @[3, 9, 27, 81, 243], @[4, 16, 64, 256, 1024], @[5, 25, 125, 625, 3125]] 38 | 39 | let t_van = vandermonde.toTensor(Cpu) 40 | when compiles(echo t_van): check: true 41 | 42 | # Tensor of shape 5x5 of type "int" on backend "Cpu" 43 | # |1 1 1 1 1| 44 | # |2 4 8 16 32| 45 | # |3 9 27 81 243| 46 | # |4 16 64 256 1024| 47 | # |5 25 125 625 3125| 48 | 49 | # TODO: Better display tests -------------------------------------------------------------------------------- /tests/tensors/test_comparison_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math 17 | 18 | 19 | suite "Testing tensor comparison": 20 | test "Testing for [1..^2, 1..3] slicing": 21 | const 22 | a = @[1, 2, 3, 4, 5] 23 | b = @[1, 2, 3, 4, 5] 24 | 25 | var 26 | vandermonde: seq[seq[int]] 27 | row: seq[int] 28 | 29 | vandermonde = newSeq[seq[int]]() 30 | 31 | for i, aa in a: 32 | row = newSeq[int]() 33 | vandermonde.add(row) 34 | for j, bb in b: 35 | vandermonde[i].add(aa^bb) 36 | 37 | let t_van = vandermonde.toTensor(Cpu) 38 | 39 | # Tensor of shape 5x5 of type "int" on backend "Cpu" 40 | # |1 1 1 1 1| 41 | # |2 4 8 16 32| 42 | # |3 9 27 81 243| 43 | # |4 16 64 256 1024| 44 | # |5 25 125 625 3125| 45 | 46 | let test = @[@[4, 8, 16], @[9, 27, 81], @[16, 64, 256]] 47 | let t_test = test.toTensor(Cpu) 48 | 49 | check: t_van[1..^2,1..3] == t_test 50 | check: t_van[1..3,1..3] == t_test -------------------------------------------------------------------------------- /src/arraymancer/fallback/blas_l3_gemm_aux.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Compute Y += alpha * X 16 | proc geaxpy[T]( m, n: int, 17 | alpha: T, 18 | X: ref array[MRNR, T], 19 | incRowX, incColX: int, 20 | Y: var seq[T], offY: int, 21 | incRowY, incColY: int) 22 | {.noSideEffect.}= 23 | 24 | if alpha != 1.T: 25 | for j in 0 ..< n: 26 | for i in 0 ..< m: 27 | Y[i*incRowY + j*incColY + offY] += alpha * X[i*incRowX + j*incColX] 28 | else: 29 | for j in 0 ..< n: 30 | for i in 0 ..< m: 31 | Y[i*incRowY + j*incColY + offY] += X[i*incRowX + j*incColX] 32 | 33 | # Compute X *= alpha 34 | proc gescal[T]( m, n: int, 35 | alpha: T, 36 | X: var seq[T], offX: int, 37 | incRowX, incColX: int) 38 | {.noSideEffect.} = 39 | 40 | if alpha != 0.T: 41 | for j in 0 ..< n: 42 | for i in 0 ..< m: 43 | X[i*incRowX + j*incColX + offX] *= alpha 44 | else: 45 | for j in 0 ..< n: 46 | for i in 0 ..< m: 47 | X[i*incRowX + j*incColX + offX] = 0 -------------------------------------------------------------------------------- /tests/tensors/test_math_functions.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, future, math 17 | 18 | suite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)": 19 | test "Reciprocal (element-wise 1/x)": 20 | var a = [1.0, 10, 20, 30].toTensor.reshape(4,1) 21 | 22 | 23 | check: a.reciprocal == [[1.0], 24 | [1.0/10.0], 25 | [1.0/20.0], 26 | [1.0/30.0]].toTensor 27 | 28 | a.mreciprocal 29 | 30 | check: a == [[1.0], 31 | [1.0/10.0], 32 | [1.0/20.0], 33 | [1.0/30.0]].toTensor 34 | 35 | test "Negate elements (element-wise -x)": 36 | block: # Out of place 37 | var a = [1.0, 10, 20, 30].toTensor.reshape(4,1) 38 | 39 | 40 | check: a.negate == [[-1.0], 41 | [-10.0], 42 | [-20.0], 43 | [-30.0]].toTensor 44 | 45 | a.mnegate 46 | 47 | check: a == [[-1.0], 48 | [-10.0], 49 | [-20.0], 50 | [-30.0]].toTensor -------------------------------------------------------------------------------- /src/nn/activation/relu.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils 16 | import ../../arraymancer_nn_primitives 17 | import math 18 | 19 | type ReluActivation* {.final.} [TT] = ref object of Gate[TT] 20 | cache: TT 21 | 22 | method forward*[TT](self: ReluActivation[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}= 23 | new result 24 | 25 | result.tape = a.tape 26 | result.value = relu a.value 27 | result.grad = zeros[getSubType(TT)](result.value.shape) 28 | 29 | method backward*[TT](self: ReluActivation[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 30 | result[0] = gradient.relu_backward(self.cache) 31 | 32 | proc relu*[TT](a: Variable[TT]): Variable[TT] = 33 | ## Input: 34 | ## - A variable 35 | 36 | # Gate 37 | var gate: ReluActivation[TT] 38 | new gate 39 | gate.arity = 1 40 | 41 | # Node 42 | var node: Node[TT] 43 | new node 44 | 45 | node.gate = gate 46 | node.parents[0] = a 47 | 48 | a.tape.push(node) 49 | 50 | # Resulting var 51 | result = gate.forward(a) 52 | result.ancestor = node 53 | node.child = result 54 | 55 | # Caching for backprop 56 | gate.cache = result.value -------------------------------------------------------------------------------- /src/nn/activation/sigmoid.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils 16 | import ../../arraymancer_nn_primitives 17 | 18 | type SigmoidActivation* {.final.} [TT] = ref object of Gate[TT] 19 | cache: TT 20 | 21 | method forward*[TT](self: SigmoidActivation[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}= 22 | new result 23 | 24 | result.tape = a.tape 25 | result.value = sigmoid a.value 26 | result.grad = zeros[getSubType(TT)](result.value.shape) 27 | 28 | method backward*[TT](self: SigmoidActivation[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 29 | result[0] = gradient.sigmoid_backward(self.cache) 30 | 31 | proc sigmoid*[TT](a: Variable[TT]): Variable[TT] = 32 | ## Input: 33 | ## - A variable 34 | 35 | # Gate 36 | var gate: SigmoidActivation[TT] 37 | new gate 38 | gate.arity = 1 39 | 40 | # Node 41 | var node: Node[TT] 42 | new node 43 | 44 | node.gate = gate 45 | node.parents[0] = a 46 | 47 | a.tape.push(node) 48 | 49 | # Resulting var 50 | result = gate.forward(a) 51 | result.ancestor = node 52 | node.child = result 53 | 54 | # Caching for backprop 55 | gate.cache = result.value -------------------------------------------------------------------------------- /src/arraymancer/fallback/naive_l2_gemv.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Notes on optimizing performance: 17 | # Google: https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt 18 | # UlmBLAS: https://github.com/michael-lehn/ulmBLAS/blob/master/ulmblas/level2/gemv.tcc 19 | 20 | 21 | proc naive_gemv_fallback[T: SomeInteger]( 22 | alpha: T, 23 | A: Tensor[T], 24 | x: Tensor[T], 25 | beta: T, 26 | y: var Tensor[T]) = 27 | ## y <- alpha * A * x + beta * y 28 | 29 | 30 | if alpha == 0.T and beta == 1.T: return 31 | 32 | # BLAS: scal (multiplication by a scalar) 33 | # WARNING: This will multiply all values, regardless of stepping. 34 | for val in y.mitems: 35 | val *= beta 36 | 37 | if alpha == 0.T: return 38 | 39 | # TODO: instead of a naive implementation use BLIS/ulmBLAS implementation with 40 | # - if A is colMajor, use fused axpy BLAS op 41 | # - if A is rowMajor, use fused dotu BLAS op 42 | # - packing 43 | 44 | # Naive implementation: split the matrices along vertical axis 45 | var i: int = 0 46 | let colA = A.shape[1] 47 | 48 | for ai in A.axis(0): 49 | y[i] = y[i] + alpha * dot(ai.reshape(colA),x) 50 | i += 1 51 | -------------------------------------------------------------------------------- /src/nn_primitives/linear_primitives.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../arraymancer 16 | import math 17 | 18 | # Sigmoid cross-entropy function that works directly on Tensors 19 | # and provide control without autograd 20 | 21 | # Linear forward and backward 22 | proc linear*[T](x: var Tensor[T], weight: Tensor[T], bias: Tensor[T]) {.inline.} = 23 | x = weight * x 24 | x .+= bias 25 | 26 | proc linear*[T](x: var Tensor[T], weight: Tensor[T]) {.inline.} = 27 | x = weight * x 28 | 29 | proc linear_backward*[T]( 30 | gradient: Tensor[T], 31 | cached_tensor, 32 | weight, bias: Tensor[T], 33 | dW, db: var Tensor[T]): Tensor[T] {.inline.} = 34 | result = weight.unsafeTranspose * gradient 35 | dW += gradient * cached_tensor.unsafeTranspose 36 | 37 | db = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html 38 | 39 | proc linear_backward*[T]( 40 | gradient: Tensor[T], 41 | cached_tensor, 42 | weight: Tensor[T], 43 | dW: var Tensor[T]): Tensor[T] {.inline.} = 44 | result = weight.unsafeTranspose * gradient 45 | dW += gradient * cached_tensor.unsafeTranspose 46 | 47 | -------------------------------------------------------------------------------- /src/nn/optimizers/optimizers.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer, typetraits 16 | 17 | type 18 | Optimizer*[T] = ref object {.inheritable.} 19 | # Base class for optimizer 20 | params*: seq[Variable[Tensor[T]]] # Todo: we can't specify a collection of generic types like AnyTensor currently 21 | lr*: T # Learning rate. Gradient update are scaled by the learning rate 22 | 23 | method update*(self: Optimizer) {.base.} = 24 | # Forward for loss layers 25 | raise newException(ValueError, "update method is not implemented for " & $self.type.name) 26 | 27 | proc zeroGrads*(o: Optimizer) = 28 | # Reset the gradients of the optimized params 29 | for v in o.params: 30 | v.grad = v.value.zeros_like 31 | 32 | type SGD*{.final.}[T] = ref object of Optimizer[T] 33 | 34 | proc newSGD*[T](params: varargs[Variable[Tensor[T]]], learning_rate: T): SGD[T] = 35 | SGD[T](params: @params, lr: learning_rate) 36 | 37 | method update*(self: SGD) = 38 | # Update the params with formula Value -= lr * gradient 39 | # Note: SGD expects gradient to be scaled by batchsize (done by default in Arraymancer) 40 | for v in self.params: 41 | v.value -= self.lr * v.grad 42 | v.grad = v.value.zeros_like 43 | -------------------------------------------------------------------------------- /src/arraymancer/shortcuts.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | template at*[T](t: Tensor[T], args: varargs[untyped]): untyped = 16 | ## Slice a Tensor and collapse singleton dimension. 17 | ## 18 | ## Input: 19 | ## - a Tensor 20 | ## - and: 21 | ## - specific coordinates (``varargs[int]``) 22 | ## - or a slice (cf. tutorial) 23 | ## Returns: 24 | ## - a value or a view of the Tensor corresponding to the slice 25 | ## Singleton dimension are collapsed 26 | ## Usage: 27 | ## See the ``[]`` macro 28 | t[args].unsafeSqueeze 29 | 30 | template unsafeAt*[T](t: Tensor[T], args: varargs[untyped]): untyped = 31 | ## Slice a Tensor and collapse singleton dimension. 32 | ## 33 | ## Data is shared between input and output. 34 | ## Input: 35 | ## - a Tensor 36 | ## - and: 37 | ## - specific coordinates (``varargs[int]``) 38 | ## - or a slice (cf. tutorial) 39 | ## Returns: 40 | ## - a value or a view of the Tensor corresponding to the slice 41 | ## Singleton dimension are collapsed 42 | ## Warning ⚠: 43 | ## This is a no-copy operation, data is shared with the input. 44 | ## This proc does not guarantee that a ``let`` value is immutable. 45 | ## Usage: 46 | ## See the ``[]`` macro 47 | t.unsafeSlice(args).unsafeSqueeze 48 | -------------------------------------------------------------------------------- /tests/autograd/test_gate_blas.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/[arraymancer, arraymancer_ag] 16 | import unittest, sequtils 17 | 18 | # # Differentiating through matmul: 19 | # # See http://cs231n.stanford.edu/vecDerivs.pdf 20 | # # And: https://danieltakeshi.github.io/2017/01/21/understanding-higher-order-local-gradient-computation-for-backpropagation-in-deep-neural-networks/ 21 | # # And: http://cs231n.stanford.edu/handouts/linear-backprop.pdf 22 | 23 | # # If base op is C = X * W 24 | # ∂C/∂X = previous_gradient * W.transpose 25 | # ∂C/∂W = X.transpose * previous_gradient 26 | 27 | # # If base op is C = W * X (our case) 28 | # ∂C/∂X = W.transpose * previous_gradient 29 | # ∂C/∂W = previous_gradient * X.transpose 30 | 31 | suite "Autograd of basic operations": 32 | test "Gradient of matrix multiplication": 33 | 34 | let W = toSeq(1..8).toTensor.reshape(2,4).astype(float32) 35 | let X = toSeq(11..22).toTensor.reshape(4,3).astype(float32) 36 | 37 | let ctx = newContext Tensor[float32] 38 | 39 | let w_ag = ctx.variable(W) 40 | let x_ag = ctx.variable(X) 41 | 42 | let C = w_ag * x_ag 43 | 44 | C.backprop 45 | 46 | let grad_C = ones[float32](2,3) 47 | check: w_ag.grad == grad_C * X.transpose 48 | check: x_ag.grad == W.transpose * grad_C 49 | -------------------------------------------------------------------------------- /src/arraymancer/init_cpu_deprecated_0_2_0.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | proc newTensor*(shape: openarray[int], T: typedesc): Tensor[T] {.noSideEffect, inline, deprecated.} = 17 | ## Creates a new Tensor on Cpu backend 18 | ## Input: 19 | ## - Shape of the Tensor 20 | ## - Type of its elements 21 | ## Result: 22 | ## - A Tensor of the proper shape initialized with 23 | ## the default type value (0 for numeric types) on Cpu backend 24 | tensorCpu(shape, result) 25 | result.data = newSeq[T](result.size) 26 | 27 | proc zeros*[T: SomeNumber](shape: openarray[int], typ: typedesc[T]): Tensor[T] {.noSideEffect, inline, deprecated.} = 28 | ## Creates a new Tensor filled with 0 29 | ## 30 | ## Input: 31 | ## - Shape of the Tensor 32 | ## - Type of its elements 33 | ## Result: 34 | ## - A zero-ed Tensor of the input shape on backend Cpu 35 | tensorCpu(shape, result) 36 | result.data = newSeq[T](result.size) 37 | 38 | 39 | proc ones*[T: SomeNumber](shape: openarray[int], typ: typedesc[T]): Tensor[T] {.noSideEffect,inline, deprecated.} = 40 | ## Creates a new Tensor filled with 1 41 | ## Input: 42 | ## - Shape of the Tensor 43 | ## - Type of its elements 44 | ## Result: 45 | ## - A one-ed Tensor of the same shape 46 | tensorCpu(shape, result) 47 | result.data = newSeqWith(result.size, 1.T) -------------------------------------------------------------------------------- /src/arraymancer/utils/nested_containers.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Tools to manipulate deep nested containers 17 | 18 | proc shape[T: not char](s: openarray[T], parent_shape: seq[int] = @[]): seq[int] {.noSideEffect.}= 19 | ## Helper function to get the shape of nested arrays/sequences 20 | ## C convention. Last index is the fastest changing (columns in 2D, depth in 3D) - Rows (slowest), Columns, Depth (fastest) 21 | ## The second argument "shape" is used for recursive call on nested arrays/sequences 22 | # Dimension check is using only the first nested element so further checking 23 | # must be one to confirm that the total number of elements match the shape. 24 | result = parent_shape & s.len 25 | when (T is seq|array): 26 | result = shape(s[0], result) 27 | 28 | iterator flatIter(s: string): string {.noSideEffect.} = 29 | yield s 30 | 31 | iterator flatIter[T: not char](s: openarray[T]): auto {.noSideEffect.}= 32 | ## Inline iterator on any-depth seq or array 33 | ## Returns values in order 34 | for item in s: 35 | when item is array|seq: 36 | for subitem in flatIter(item): 37 | yield subitem 38 | else: 39 | yield item 40 | 41 | 42 | proc shape(s: string|seq[char], parent_shape: seq[int] = @[]): seq[int] {.noSideEffect.}= 43 | ## Handle char / string 44 | if parent_shape == @[]: 45 | return @[1] 46 | else: return parent_shape 47 | 48 | -------------------------------------------------------------------------------- /src/nn/loss/sigmoid_cross_entropy.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils 16 | import ../../arraymancer_nn_primitives 17 | 18 | import ./loss 19 | 20 | type SigmoidCrossEntropyLoss* {.final.} [TT] = ref object of Loss[TT] 21 | cache: Variable[TT] 22 | # arity, from Gate 23 | # target, from Loss 24 | 25 | method forward*[TT](self: SigmoidCrossEntropyLoss[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}= 26 | new result 27 | 28 | result.tape = a.tape 29 | # We expect a in shape @[features, batch_size] 30 | result.value = [sigmoid_cross_entropy(a.value, target)].toTensor 31 | 32 | result.grad = zeros[getSubType(TT)](1) 33 | 34 | 35 | method backward*[TT](self: SigmoidCrossEntropyLoss[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 36 | result[0] = sigmoid_cross_entropy_backward(gradient, self.cache.value, self.target) 37 | 38 | proc sigmoid_cross_entropy*[TT](a: Variable[TT], target: TT): Variable[TT] = 39 | # Gate 40 | var gate: SigmoidCrossEntropyLoss[TT] 41 | new gate 42 | gate.arity = 1 43 | gate.cache = a 44 | gate.target = target 45 | 46 | # Node 47 | var node: Node[TT] 48 | new node 49 | 50 | node.gate = gate 51 | node.parents[0] = a 52 | 53 | a.tape.push(node) 54 | 55 | # Resulting var 56 | result = gate.forward(a, target) 57 | result.ancestor = node 58 | node.child = result -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copied from https://github.com/nim-lang/Nim/wiki/TravisCI 2 | language: c 3 | 4 | cache: ccache 5 | 6 | matrix: 7 | include: 8 | # Build and test against the master (stable) and devel branches of Nim 9 | # Build and test using both gcc and clang 10 | - os: linux 11 | env: CHANNEL=stable 12 | compiler: gcc 13 | 14 | - os: linux 15 | env: CHANNEL=devel 16 | compiler: gcc 17 | 18 | # For faster testing we don't test clang on linux, only on macOS 19 | # - os: linux 20 | # env: CHANNEL=stable 21 | # compiler: clang 22 | # 23 | # - os: linux 24 | # env: CHANNEL=devel 25 | # compiler: clang 26 | 27 | # On OSX we only test against clang (gcc is mapped to clang by default) 28 | # Note: for OpenMP, Homebrew will build flame/blis with GCC-5 29 | - os: osx 30 | env: CHANNEL=stable BLIS=true 31 | compiler: clang 32 | 33 | # For faster testing, we only test BLIS = true 34 | # - os: osx 35 | # env: CHANNEL=stable BLIS=false 36 | # compiler: clang 37 | 38 | allow_failures: 39 | # Ignore failures when building against the devel Nim branch 40 | - env: CHANNEL=devel 41 | fast_finish: true 42 | 43 | addons: 44 | apt: 45 | packages: 46 | # On Linux we need OpenBLAS, on OSX Apple Accelerate is present by default 47 | - libopenblas-dev 48 | 49 | before_install: 50 | # On MacOS flame/blis can be tested as it is an homebrew package 51 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update ; fi 52 | - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install homebrew/science/blis; fi 53 | 54 | install: 55 | - export CHOOSENIM_NO_ANALYTICS=1 56 | - curl https://nim-lang.org/choosenim/init.sh -sSf > init.sh 57 | - sh init.sh -y 58 | - export PATH=~/.nimble/bin:$PATH 59 | - echo "export PATH=~/.nimble/bin:$PATH" >> ~/.profile 60 | - choosenim $CHANNEL 61 | 62 | script: 63 | - nimble refresh 64 | - nimble test 65 | 66 | branches: 67 | except: 68 | - gh-pages 69 | -------------------------------------------------------------------------------- /src/autograd/gates_reduce.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./autograd, ../arraymancer, ./utils, sequtils 16 | 17 | type MeanGate* {.final.} [TT] = ref object of Gate[TT] 18 | ## TODO: generalize to C <- alpha AB + C 19 | a_shape: seq[int] 20 | 21 | method forward*[TT](self: MeanGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}= 22 | new result 23 | 24 | result.tape = a.tape 25 | result.value = [a.value.mean].toTensor 26 | 27 | result.grad = zeros[getSubType(TT)](1) 28 | 29 | 30 | method backward*[TT](self: MeanGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 31 | result[0] = gradient / getSubType(TT)(self.a_shape.product) # Conversion to subtype T, oh Higher kinded-types ... 32 | 33 | let z_shape = newSeqWith(self.a_shape.len, 1) # We create a shape of 1 dimension that we will expand with broadcast 34 | result[0] = result[0].unsafeReshape(z_shape).unsafeBroadcast(self.a_shape) 35 | 36 | proc mean*[TT](a: Variable[TT]): Variable[TT] = 37 | when compileOption("boundChecks"): 38 | check_ctx(a, b) 39 | 40 | # Gate 41 | var gate: MeanGate[TT] 42 | new gate 43 | gate.arity = 1 44 | gate.a_shape = a.value.shape # TODO use ref to avoid copy 45 | 46 | # Node 47 | var node: Node[TT] 48 | new node 49 | 50 | node.gate = gate 51 | node.parents[0] = a 52 | 53 | a.tape.push(node) 54 | 55 | # Resulting var 56 | result = gate.forward(a) 57 | result.ancestor = node 58 | node.child = result -------------------------------------------------------------------------------- /src/arraymancer/global_config.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | 17 | 18 | # This configures the maximum number of dimensions supported by Arraymancer 19 | # It should improve performance on Cuda and for iterator by storing temporary shape/strides 20 | # that will be used extensively in the loop on the stack. 21 | # For now this is only partly implemented and only on Cuda temporary shape/strides arrays. 22 | const MAXRANK = 8 # 8 because it's a nice number, more is possible upon request. 23 | 24 | 25 | const CUDA_HOF_TPB {.used.}: cint = 32 * 32 # TODO, benchmark and move that to cuda global config 26 | # Pascal GTX 1070+ have 1024 threads max 27 | const CUDA_HOF_BPG {.used.}: cint = 256 # should be (grid-stride+threadsPerBlock-1) div threadsPerBlock ? 28 | # From https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/ 29 | # Lower allows threads re-use and limit overhead of thread creation/destruction 30 | 31 | 32 | const OMP_FOR_THRESHOLD = 1000 # Tensor number of elements threshold before using OpenMP multithreading 33 | 34 | # Full procesor optimization (AVX, AVX2, ARM neon, ... if applicable) 35 | when defined(native): 36 | {.passC: "-march=native".} 37 | 38 | # Note: Following https://github.com/mratsim/Arraymancer/issues/61 and 39 | # https://github.com/mratsim/Arraymancer/issues/43 40 | # Arraymancer export '_' for slicing (type is SteppedSlice) 41 | # '_' is configured in accessors_slicer -------------------------------------------------------------------------------- /benchmarks/ex01_xor.nim: -------------------------------------------------------------------------------- 1 | import ../src/arraymancer_nn, ../src/arraymancer_ag, ../src/arraymancer 2 | 3 | let ctx = newContext Tensor[float32] 4 | 5 | let bsz = 32 #batch size 6 | 7 | # We will create a tensor of size 3200 --> 100 batch sizes of 32 8 | # We create it as int between [0, 2[ (2 excluded) and convert to bool 9 | let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) 10 | 11 | # Let's build or truth labels. We need to apply xor between the 2 columns of the tensors 12 | proc xor_alt[T](x,y: T): T = 13 | ## xor is builtin and cannot be passed to map as is 14 | x xor y 15 | 16 | let y_bool = map2(x_train_bool[_,0], xor_alt, x_train_bool[_,1]) 17 | 18 | 19 | # Convert to float and transpose so batch_size is last 20 | let x_train = ctx.variable(x_train_bool.astype(float32).transpose) 21 | let y = y_bool.astype(float32).transpose 22 | 23 | # First hidden layer of 3 neurons, with 2 features in 24 | # We initialize with random weights between -1 and 1 25 | let layer_3neurons = ctx.variable( 26 | randomTensor(3, 2, 2.0f) .- 1.0f 27 | ) 28 | 29 | # Classifier layer with 1 neuron per feature. (In our case only one neuron overall) 30 | # We initialize with random weights between -1 and 1 31 | let classifier_layer = ctx.variable( 32 | randomTensor(1, 3, 2.0f) .- 1.0f 33 | ) 34 | 35 | # Stochastic Gradient Descent 36 | let optim = newSGD[float32]( 37 | layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate 38 | ) 39 | 40 | for epoch in 0..100: 41 | 42 | for batch_id in 0..<100: 43 | 44 | # offset in the Tensor (Remember, batch size is last) 45 | let offset = batch_id * 32 46 | let x = x_train[_, offset ..< offset + 32] 47 | let target = y[_, offset ..< offset + 32] 48 | 49 | # Building the network 50 | let n1 = linear(x, layer_3neurons) 51 | let n1_act = n1.relu 52 | let n2 = linear(n1_act, classifier_layer) 53 | let loss = sigmoid_cross_entropy(n2, target) 54 | 55 | # Compute the gradient (i.e. contribution of each parameter to the loss) 56 | loss.backprop() 57 | 58 | # Correct the weights now that we have the gradient information 59 | optim.update() -------------------------------------------------------------------------------- /src/autograd/gates_basic.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # By convention a is the LHS (left-hand side) 16 | # b is the rhs (right-hand side) 17 | 18 | import ./autograd, ../arraymancer, ./utils 19 | 20 | type AddGate* {.final.} [TT] = ref object of Gate[TT] 21 | ab_shape: seq[int] 22 | 23 | method forward*[TT](self: AddGate[TT], a, b: Variable[TT]): Variable[TT] {.inline, locks:0.}= 24 | new result 25 | 26 | result.tape = a.tape 27 | result.value = a.value + b.value 28 | 29 | ## Unfortunately using broadcasts to save memory doesn't work 30 | # let z_shape = newSeqWith(result.value.rank, 1) # We create a shape of 1 dimension that we will expand with broadcast 31 | # let z = zeros[getSubType(TT)](z_shape) 32 | # result.grad = z.unsafeBroadcast(result.value.shape) # to save memory, we allocate as low as possible 33 | 34 | result.grad = zeros[getSubType(TT)](result.value.shape) 35 | 36 | method backward*[TT](self: AddGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 37 | result[0] = gradient 38 | result[1] = gradient 39 | 40 | proc `+`*[TT](a, b: Variable[TT]): Variable[TT] = 41 | when compileOption("boundChecks"): 42 | check_ctx(a, b) 43 | 44 | # Gate 45 | var gate: AddGate[TT] 46 | new gate 47 | gate.arity = 2 48 | gate.ab_shape = a.value.shape # Shape equality will be checked in the forward proc 49 | 50 | # Node 51 | var node: Node[TT] 52 | new node 53 | 54 | node.gate = gate 55 | node.parents[0] = a 56 | node.parents[1] = b 57 | 58 | a.tape.push(node) 59 | 60 | # Resulting var 61 | result = gate.forward(a, b) 62 | result.ancestor = node 63 | node.child = result 64 | 65 | 66 | -------------------------------------------------------------------------------- /src/autograd/gates_blas.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ./autograd, ../arraymancer, ./utils 16 | 17 | type MatMulGate* {.final.} [TT] = ref object of Gate[TT] 18 | ## TODO: generalize to C <- alpha AB + C 19 | a: Variable[TT] 20 | b: Variable[TT] 21 | 22 | method forward*[TT](self: MatMulGate[TT], a, b: Variable[TT]): Variable[TT] {.inline, locks:0.}= 23 | new result 24 | 25 | result.tape = a.tape 26 | result.value = a.value * b.value 27 | 28 | ## Unfortunately using broadcasts to save memory doesn't work 29 | # let z_shape = newSeqWith(result.value.rank, 1) # We create a shape of 1 dimension that we will expand with broadcast 30 | # let z = zeros[getSubType(TT)](z_shape) 31 | # result.grad = z.unsafeBroadcast(result.value.shape) # to save memory, we allocate as low as possible 32 | 33 | result.grad = zeros[getSubType(TT)](result.value.shape) 34 | 35 | method backward*[TT](self: MatMulGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 36 | result[0] = gradient * self.b.value.unsafeTranspose 37 | result[1] = self.a.value.unsafeTranspose * gradient 38 | 39 | proc `*`*[TT](a, b: Variable[TT]): Variable[TT] = 40 | when compileOption("boundChecks"): 41 | check_ctx(a, b) 42 | 43 | # Gate 44 | var gate: MatMulGate[TT] 45 | new gate 46 | gate.arity = 2 47 | gate.a = a # TODO use ref to avoid copy 48 | gate.b = b 49 | 50 | # Node 51 | var node: Node[TT] 52 | new node 53 | 54 | node.gate = gate 55 | node.parents[0] = a 56 | node.parents[1] = b 57 | 58 | a.tape.push(node) 59 | 60 | # Resulting var 61 | result = gate.forward(a, b) 62 | result.ancestor = node 63 | node.child = result -------------------------------------------------------------------------------- /src/arraymancer/backend/openmp.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | when defined(openmp): 16 | {.passC: "-fopenmp".} 17 | {.passL: "-fopenmp".} 18 | 19 | {. pragma: omp, header:"omp.h" .} 20 | 21 | proc omp_set_num_threads*(x: cint) {.omp.} 22 | proc omp_get_num_threads*(): cint {.omp.} 23 | proc omp_get_max_threads*(): cint {.omp.} 24 | proc omp_get_thread_num*(): cint {.omp.} 25 | 26 | else: 27 | template omp_set_num_threads*(x: cint) = discard 28 | template omp_get_num_threads*(): cint = 1 29 | template omp_get_max_threads*(): cint = 1 30 | template omp_get_thread_num*(): cint = 0 31 | 32 | const OMP_FOR_ANNOTATION = "if(ompsize > " & $OMP_FOR_THRESHOLD & ")" 33 | 34 | template omp_parallel_countup*(i: untyped, size: Natural, body: untyped): untyped = 35 | let ompsize = size 36 | for i in `||`(0, ompsize, OMP_FOR_ANNOTATION): 37 | body 38 | 39 | template omp_parallel_forup*(i: untyped, start, size: Natural, body: untyped): untyped = 40 | let ompsize = size 41 | for i in `||`(start, ompsize, OMP_FOR_ANNOTATION): 42 | body 43 | 44 | template omp_parallel_blocks*(block_offset, block_size: untyped, size: Natural, body: untyped): untyped = 45 | block ompblocks: 46 | when defined(openmp): 47 | if size >= OMP_FOR_THRESHOLD: 48 | let omp_num_threads = omp_get_max_threads() 49 | if size >= omp_num_threads: 50 | let bsize = size div omp_num_threads 51 | for j in 0||(omp_num_threads-1): 52 | let block_offset = bsize*j 53 | let block_size = if j < omp_num_threads-1: bsize else: size - block_offset 54 | block: 55 | body 56 | break ompblocks 57 | let block_offset = 0 58 | let block_size = size 59 | block: 60 | body 61 | -------------------------------------------------------------------------------- /tests/tensors/test_ufunc_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import math, unittest 17 | 18 | suite "Universal functions": 19 | test "Common math functions are exported": 20 | let a = @[@[1.0,2,3],@[4.0,5,6]] 21 | let b = @[@[7.0, 8],@[9.0, 10],@[11.0, 12]] 22 | 23 | let ta = a.toTensor(Cpu) 24 | let tb = b.toTensor(Cpu) 25 | 26 | let expected_a = @[@[cos(1'f64),cos(2'f64),cos(3'f64)],@[cos(4'f64),cos(5'f64),cos(6'f64)]] 27 | let expected_b = @[@[ln(7'f64), ln(8'f64)],@[ln(9'f64), ln(10'f64)],@[ln(11'f64), ln(12'f64)]] 28 | 29 | check: cos(ta) == expected_a.toTensor(Cpu) 30 | check: ln(tb) == expected_b.toTensor(Cpu) 31 | 32 | test "Creating custom universal functions is supported": 33 | proc square_plus_one(x: int): int = x ^ 2 + 1 34 | makeUniversalLocal(square_plus_one) 35 | 36 | let c = @[@[2,4,8],@[3,9,27]] 37 | let tc = c.toTensor(Cpu) 38 | 39 | let expected_c = @[@[5, 17, 65],@[10, 82, 730]] 40 | 41 | check: square_plus_one(tc) == expected_c.toTensor(Cpu) 42 | 43 | ## MakeUniversal cannot change Tensor[B,T] to Tensor[B,U] for now 44 | ## fmap must be used instead 45 | test "Universal functions that change types are supported": 46 | let d = @[@[2,4,8],@[3,9,27]] 47 | let e = @[@["2","4","8"],@["3","9","27"]] 48 | 49 | proc stringify(n: int): string = $n 50 | # makeUniversalLocal(stringify) 51 | 52 | let td = d.toTensor(Cpu) 53 | let te = e.toTensor(Cpu) 54 | 55 | when compiles (td == te): check: false 56 | 57 | check: td.fmap(stringify) == te 58 | check: td.fmap(stringify)[0,1] == "4" 59 | 60 | when compileOption("boundChecks"): 61 | expect(IndexError): 62 | discard td.fmap(stringify)[1,3] 63 | else: 64 | echo "Bound-checking is disabled. The incorrect seq shape test has been skipped." 65 | -------------------------------------------------------------------------------- /src/arraymancer/fallback/blas_l3_gemm_macro_kernel.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc gemm_macro_kernel[T](mc, nc, kc: int, 16 | alpha: T, 17 | beta: T, 18 | C: var seq[T], offC: int, 19 | incRowC, incColC: int, 20 | buffer_A: var ref array[MCKC, T], 21 | buffer_B: var ref array[KCNC, T], 22 | buffer_C: var ref array[MRNR, T]) 23 | {.noSideEffect.} = 24 | let mp = (mc+MR-1) div MR 25 | let np = (nc+NR-1) div NR 26 | 27 | let mod_mr = mc mod MR 28 | let mod_nr = nc mod NR 29 | 30 | var mr: int 31 | var nr: int 32 | 33 | for j in 0 ..< np: 34 | nr = if (j != np-1 or mod_nr == 0): NR 35 | else: mod_nr 36 | for i in 0 ..< mp: 37 | mr = if (i != mp-1 or mod_mr == 0): MR 38 | else: mod_mr 39 | 40 | if (mr==MR and nr==NR): 41 | gemm_micro_kernel(kc, alpha, 42 | buffer_A, i*kc*MR, 43 | buffer_B, j*kc*NR, 44 | beta, 45 | C, i*MR*incRowC+j*NR*incColC + offC, 46 | incRowC, incColC) 47 | else: 48 | gemm_micro_kernel(kc, alpha, 49 | buffer_A, i*kc*MR, 50 | buffer_B, j*kc*NR, 51 | 0.T, 52 | buffer_C, 0, 53 | 1, MR) 54 | gescal( mr, nr, beta, 55 | C, i*MR*incRowC+j*NR*incColC + offC, 56 | incRowC, incColC) 57 | geaxpy( mr, nr, 58 | 1.T, 59 | buffer_C, 60 | 1, MR, 61 | C, i*MR*incRowC+j*NR*incColC + offC, 62 | incRowC, incColC) 63 | -------------------------------------------------------------------------------- /src/arraymancer/fallback/blas_l3_gemm_packing.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc pack_panel[T, N](k: int, 16 | M: seq[T], offset: int, # Tensor data + offset 17 | lsm, ssm: int, # Leading and secondary (dimension) stride of M, Leading: incColA/incRowB. 18 | LR: static[int], # Leading block dimension, MR for A (MxK), NR for B (KxN) 19 | buffer: var ref array[N, T], # N = MCKC for A, KCNC for B 20 | offBuf: var int) {.noSideEffect.} = 21 | ## Pack blocks of size LR of the matrices in the corresponding buffer 22 | var offM = offset 23 | for s in 0 ..< k: # Loop along the leaing dimension 24 | for lead in 0 ..< LR: 25 | buffer[lead + offBuf] = M[lead*lsm + offM] 26 | offBuf += LR 27 | offM += ssm 28 | 29 | proc pack_dim[T, N](lc, kc: int, # lc = mc for A (MxK matrix) and lc = nc for B (KxN matrix) 30 | M: seq[T], offset: int, # Tensor data + offset 31 | lsm, ssm: int, # Leading and secondary (dimension) stride of M, Leading: incColA/incRowB. 32 | LR: static[int], # Leading block dimension, MR for A (MxK), NR for B (KxN) 33 | buffer: var ref array[N, T]) # N = MCKC for A, KCNC for B 34 | {.noSideEffect.} = 35 | 36 | let lp = lc div LR # Number of whole blocks along leading dim 37 | let lr = lc mod LR # Reminder of leading dim 38 | 39 | var offBuf = 0 40 | var offM = offset 41 | 42 | for lead in 0.. 0: 47 | for s in 0 ..< kc: 48 | for lead in 0 ..< lr: 49 | buffer[lead + offBuf] = M[lead * lsm + offM] 50 | for lead in lr ..< LR: 51 | buffer[lead + offBuf] = 0.T 52 | offBuf += LR 53 | offM += ssm -------------------------------------------------------------------------------- /tests/tensors/test_init_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math, sequtils 17 | 18 | suite "Creating a new Tensor": 19 | test "Creating from sequence": 20 | let t1 = @[1,2,3].toTensor(Cpu) 21 | check: t1.shape == @[3] 22 | check: t1.rank == 1 23 | 24 | const 25 | a = @[1, 2, 3, 4, 5] 26 | b = @[1, 2, 3, 4, 5] 27 | 28 | var 29 | vandermonde: seq[seq[int]] 30 | row: seq[int] 31 | 32 | vandermonde = newSeq[seq[int]]() 33 | 34 | for i, aa in a: 35 | row = newSeq[int]() 36 | vandermonde.add(row) 37 | for j, bb in b: 38 | vandermonde[i].add(aa^bb) 39 | 40 | let t2 = vandermonde.toTensor(Cpu) 41 | check: t2.rank == 2 42 | check: t2.shape == @[5, 5] 43 | 44 | let nest3 = @[ 45 | @[ 46 | @[1,2,3], 47 | @[1,2,3] 48 | ], 49 | @[ 50 | @[3,2,1], 51 | @[3,2,1] 52 | ], 53 | @[ 54 | @[4,4,5], 55 | @[4,4,4] 56 | ], 57 | @[ 58 | @[6,6,6], 59 | @[6,6,6] 60 | ] 61 | ] 62 | 63 | let t3 = nest3.toTensor(Cpu) 64 | check: t3.rank == 3 65 | check: t3.shape == @[4, 2, 3] # 4 rows, 2 cols, 3 depth. depth indices moves the fastest. Same scheme as Numpy. 66 | 67 | let u = @[@[1.0, -1, 2],@[0.0, -1]] 68 | expect(IndexError): 69 | discard u.toTensor(Cpu) 70 | 71 | test "Check that Tensor shape is in row-by-column order": 72 | let s = @[@[1,2,3],@[3,2,1]] 73 | let t = s.toTensor(Cpu) 74 | 75 | check: t.shape == @[2,3] 76 | 77 | let u = newTensor(@[2,3], int, Cpu) 78 | check: u.shape == @[2,3] 79 | 80 | check: u.shape == t.shape 81 | 82 | # TODO add tests for zeros, ones and randomTensor -------------------------------------------------------------------------------- /src/arraymancer/utils/functional.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Functional programming and iterator tooling 16 | 17 | template scanr[T](s: seq[T], operation: untyped): untyped = 18 | ## Template to scan a sequence from right to left, returning the accumulation and intermediate values. 19 | ## This is a foldr with intermediate steps returned 20 | 21 | ## @[2, 2, 3, 4].scanr(a * b) = @[48, 24, 12, 4] 22 | let len = s.len 23 | 24 | assert len > 0, "Can't scan empty sequences" 25 | var result = newSeq[T](len) 26 | 27 | result[result.high] = s[s.high] 28 | for i in countdown(len - 1, 1): 29 | let 30 | a {.inject.} = s[i-1] 31 | b {.inject.} = result[i] 32 | result[i-1] = operation 33 | result 34 | 35 | template scanl[T](s: seq[T], operation: untyped): untyped = 36 | ## Template to scan a sequence from left to right, returning the accumulation and intermediate values. 37 | ## This is a foldl with intermediate steps returned 38 | 39 | ## @[2, 2, 3, 4].scanl(a * b) = @[2, 4, 12, 48] 40 | let len = s.len 41 | 42 | assert len > 0, "Can't scan empty sequences" 43 | var result = newSeq[T](len) 44 | 45 | result[0] = s[0] 46 | for i in 1..s.high: 47 | let 48 | a {.inject.} = s[i] 49 | b {.inject.} = result[i-1] 50 | result[i] = operation 51 | result 52 | 53 | iterator zip[T1, T2](a: openarray[T1], b: openarray[T2]): (T1,T2) {.noSideEffect.} = 54 | ## Transform two lists in a list of tuples. 55 | ## Length of result will be the length of the smallest list, items from the longest will be discarded. 56 | let len = min(a.len, b.len) 57 | 58 | for i in 0.. offset and update 19 | # the data at this offset. 20 | # 21 | # For this we need: 22 | # - to store strides and offset on the cuda device to avoid copies 23 | # - a way to convert element #10 of the tensor to the real offset (column major), 24 | # the kernels won't use tensor[2,5] as an index 25 | 26 | 27 | proc getIndexOfElementID[T](t: Tensor[T], element_id: int): int {.noSideEffect,used.} = 28 | ## Convert "Give me element 10" to the real index/memory offset. 29 | ## Reference Nim CPU version 30 | ## This is not meant to be used on serial architecture due to the division overhead. 31 | ## On GPU however it will allow threads to address the real memory addresses independantly. 32 | 33 | when compileOption("boundChecks"): 34 | assert element_id < t.size 35 | 36 | result = t.offset 37 | var currentOffset = element_id 38 | var dimIdx: int 39 | 40 | for k in countdown(t.rank - 1,0): 41 | ## hopefully the compiler doesn't do division twice ... 42 | dimIdx = currentOffset mod t.shape[k] 43 | currentOffset = currentOffset div t.shape[k] 44 | 45 | # cf atIndex proc to compute real_idx 46 | result += dimIdx * t.strides[k] 47 | 48 | # Note we don't bound-checks the CUDA implementation 49 | {.emit:[""" 50 | static inline __device__ int cuda_getIndexOfElementID( 51 | const int rank, 52 | const int * __restrict__ shape, 53 | const int * __restrict__ strides, 54 | const int offset, 55 | const int element_id) { 56 | 57 | int real_idx = offset; 58 | int currentOffset = element_id; 59 | int dimIdx = 0; 60 | 61 | for (int k = rank - 1; k >= 0; --k) { 62 | dimIdx = currentOffset % shape[k]; 63 | currentOffset /= shape[k]; 64 | 65 | real_idx += dimIdx * strides[k]; 66 | } 67 | 68 | return real_idx; 69 | } 70 | """].} -------------------------------------------------------------------------------- /tests/tensors/test_higherorder.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math, future, sequtils 17 | 18 | suite "Testing higher-order functions": 19 | let t = [[0, 1, 2], 20 | [3, 4, 5], 21 | [6, 7, 8], 22 | [9, 10, 11]].toTensor() 23 | 24 | proc customAdd[T: SomeNumber](x, y: T): T = x + y 25 | 26 | test "Map function": 27 | 28 | let t2 = [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121].toTensor.reshape([4,3]) 29 | 30 | check: t.map(x => x*x) == t2 31 | 32 | test "Apply functions - with in-place and out of place closure": 33 | var t = toSeq(0..11).toTensor().reshape([4,3]) 34 | let t2 = toSeq(1..12).toTensor().reshape([4,3]) 35 | 36 | var tmp1 = t 37 | tmp1.apply(x => x+1) # out of place 38 | check: tmp1 == t2 39 | 40 | var tmp2 = t[_,2] 41 | 42 | proc plus_one[T](x: var T) = x += 1 43 | tmp2.apply(plus_one) # in-place 44 | check: tmp2 == t2[_,2] 45 | 46 | test "Reduce function": 47 | check: t.reduce(customAdd) == 66 48 | 49 | proc customConcat(x, y: string): string = x & y 50 | 51 | check: t.map(x => $x).reduce(customConcat) == "01234567891011" 52 | 53 | test "Reduce over an axis": 54 | proc customMin[T: SomeNumber](x,y: Tensor[T]): Tensor[T] = x - y 55 | 56 | check: t.reduce(customMin, axis = 0) == [-18, -20, -22].toTensor.reshape([1,3]) 57 | 58 | test "Fold with different in and result types": 59 | proc isEven(n: int): bool = 60 | return n mod 2 == 0 61 | 62 | # Check if all even 63 | check: t.fold(true, proc(x: bool,y: int): bool = x and y.isEven) == false 64 | 65 | check: (t * 2).fold(true, proc(x: bool,y: int): bool = x and y.isEven) == true 66 | 67 | test "Fold over axis": 68 | let col_sum_plus_1010 = [[4], 69 | [12], 70 | [22], 71 | [30]].toTensor() 72 | 73 | let initval = [1,0,1,0].toTensor.reshape([4,1]) 74 | 75 | check: t.fold(initval, `+`, axis = 1) == col_sum_plus_1010 76 | -------------------------------------------------------------------------------- /src/arraymancer/term_rewriting.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | template toTensorReshapeT(oa: typed, shape: varargs[int]): untyped = 16 | let data = toSeq(flatIter(oa)) 17 | let seq_shape = @shape 18 | 19 | when compileOption("boundChecks"): check_nested_elements(seq_shape, data.len) 20 | 21 | var t: Tensor[type(data[0])] 22 | tensorCpu(seq_shape, t) 23 | shallowCopy(t.data, data) 24 | return t 25 | 26 | proc toTensorReshape(oa: string, shape: varargs[int]): auto {.noSideEffect.}= 27 | ## Fuse toTensor and reshape in one operation. 28 | ## 29 | ## Deal specifically with strings/seq[char] 30 | 31 | toTensorReshapeT(oa, shape) 32 | 33 | proc toTensorReshape(oa: openarray, shape: varargs[int], dummy_bugfix: static[int] = 0): auto {.noSideEffect.}= 34 | ## Fuse toTensor and reshape in one operation 35 | ## 36 | # Dummy_bugfix param is necessary due to: https://github.com/nim-lang/Nim/issues/6343 37 | # TODO: remove 'dummy_bugfix' 38 | toTensorReshapeT(oa, shape) 39 | 40 | template rewriteToTensorReshape*{reshape(toTensor(oa, dummy_bugfix), shape)}( 41 | oa: openarray, 42 | shape: varargs[int], 43 | dummy_bugfix: static[int]): auto = 44 | ## Fuse ``sequence.toTensor.reshape(new_shape)`` into a single operation. 45 | ## 46 | ## Operation fusion leverage the Nim compiler and should not be called explicitly. 47 | toTensorReshape(oa, shape, dummy_bugfix) 48 | 49 | proc unsafeToTensorReshape*[T](data: seq[T], shape: varargs[int]): Tensor[T] {.noSideEffect.} = 50 | ## Fuse unsafeToTensor and unsafeReshape in one operation 51 | 52 | when compileOption("boundChecks"): check_nested_elements(@shape, data.len) 53 | 54 | tensorCpu(shape, result) 55 | shallowCopy(result.data, data) 56 | 57 | template rewriteUnsafeToTensorReshape*{unsafeReshape(unsafeToTensor(s), shape)}( 58 | s: seq, 59 | shape: varargs[int]): auto = 60 | ## Fuse ``sequence.unsafeToTensor().unsafeReshape(new_shape)`` into a single operation. 61 | ## 62 | ## Operation fusion leverage the Nim compiler and should not be called explicitly. 63 | unsafeToTensorReshape(s, shape, dummy_bugfix) 64 | -------------------------------------------------------------------------------- /benchmarks/implementation/stable_sigmoid_bench.nim: -------------------------------------------------------------------------------- 1 | import times, ../../src/arraymancer, math 2 | 3 | # The goal is to test the speed of various sigmoid implementation 4 | # Some are numericall stable for positive, negative or both value 5 | 6 | # We create a random tensor with randomly positive and negative value 7 | let a = randomTensor(1000, 1000, 100.0f) .- 50.0f 8 | 9 | proc sigmoid1[T: SomeReal](t: Tensor[T]): Tensor[T] = 10 | # Instable for negative 11 | proc sigmoid1_closure(x: T): T = 1.T / (1 + exp(-x)) 12 | return t.map(sigmoid1_closure) 13 | 14 | proc sigmoid2[T: SomeReal](t: Tensor[T]): Tensor[T] = 15 | # Instable for positive 16 | proc sigmoid2_closure(x: T): T = 17 | let z = exp(x) 18 | return z / (1.T + z) 19 | return t.map(sigmoid2_closure) 20 | 21 | proc sigmoid3[T: SomeReal](t: Tensor[T]): Tensor[T] = 22 | # Stable but branching in a loop 23 | proc sigmoid3_closure(x: T): T = 24 | if x >= 0: 25 | return 1.T / (1 + exp(-x)) 26 | let z = exp(x) 27 | return z / (1 + z) 28 | return t.map(sigmoid3_closure) 29 | 30 | proc sigmoid4*[T: SomeReal](t: Tensor[T]): Tensor[T] = 31 | # Stable but expensive tanh 32 | proc sigmoid4_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T) 33 | return t.map(sigmoid4_closure) 34 | 35 | proc sigmoid5*[T: SomeReal](t: Tensor[T]): Tensor[T] = 36 | # Stable and probably fastest 37 | proc sigmoid5_closure(x: T): T = 38 | let clip_x = max(-500, -x) 39 | return 1.T / (1 + exp(clip_x)) 40 | return t.map(sigmoid5_closure) 41 | 42 | ## Warmup for ondemand CPU 43 | for i in 0..<1000: 44 | discard a.sigmoid1 45 | 46 | var start = cpuTime() 47 | for i in 0..<1000: 48 | discard a.sigmoid1 49 | echo " Sigmoid1: 1 / (1 + exp(-x)) ", cpuTime() - start 50 | 51 | 52 | start = cpuTime() 53 | for i in 0..<1000: 54 | discard a.sigmoid2 55 | echo " Sigmoid2: exp(x) / (1 + exp(x)) ", cpuTime() - start 56 | 57 | start = cpuTime() 58 | for i in 0..<1000: 59 | discard a.sigmoid3 60 | echo " Sigmoid3: branching ", cpuTime() - start 61 | 62 | start = cpuTime() 63 | for i in 0..<1000: 64 | discard a.sigmoid4 65 | echo " Sigmoid4: 0.5 * (tanh(0.5 * x) + 1) ", cpuTime() - start 66 | 67 | start = cpuTime() 68 | for i in 0..<1000: 69 | discard a.sigmoid5 70 | echo " Sigmoid5: 1 / (1 + exp(max(-500,-x)) ", cpuTime() - start 71 | 72 | 73 | # Results with -d:release on i5-5257U (dual-core mobile 2.7GHz, turbo 3.1) 74 | # Note: results vary strongly depending on your number of cores due to cpuTime methodology 75 | # Sigmoid1: 1 / (1 + exp(-x)) 8.265147999999998 76 | # Sigmoid2: exp(x) / (1 + exp(x)) 7.757116 77 | # Sigmoid3: branching 12.477108 78 | # Sigmoid4: 0.5 * (tanh(0.5 * x) + 1) 11.162277 79 | # Sigmoid5: 1 / (1 + exp(max(-500,-x)) 10.050294 -------------------------------------------------------------------------------- /tests/tensors/test_aggregate_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math 17 | 18 | suite "Testing aggregation functions": 19 | let t = [[0, 1, 2], 20 | [3, 4, 5], 21 | [6, 7, 8], 22 | [9, 10, 11]].toTensor(Cpu) 23 | 24 | test "Sum all elements": 25 | check: t.sum == 66 26 | 27 | test "Sum over axis": 28 | let row_sum = [[18, 22, 26]].toTensor(Cpu) 29 | let col_sum = [[3], 30 | [12], 31 | [21], 32 | [30]].toTensor(Cpu) 33 | check: t.sum(axis=0) == row_sum 34 | check: t.sum(axis=1) == col_sum 35 | 36 | ## TODO: 3D axis sum 37 | test "Mean of all elements": 38 | check: t.astype(float).mean == 5.5 # Note: may fail due to float rounding 39 | 40 | test "Mean over axis": 41 | let row_mean = [[4.5, 5.5, 6.5]].toTensor(Cpu) 42 | let col_mean = [[1.0], 43 | [4.0], 44 | [7.0], 45 | [10.0]].toTensor(Cpu) 46 | check: t.astype(float).mean(axis=0) == row_mean 47 | check: t.astype(float).mean(axis=1) == col_mean 48 | 49 | test "Generic aggregate functions": 50 | # We can't pass built-ins to procvar 51 | proc addition[T](a, b: T): T= 52 | return a+b 53 | proc addition_inplace[T](a: var T, b: T)= 54 | a+=b 55 | 56 | check: t.agg(addition, start_val=0) == 66 57 | 58 | var z = 0 59 | z.agg_inplace(addition_inplace, t) 60 | check: z == 66 61 | 62 | #### Axis - `+`, `+=` for tensors are not "built-ins" 63 | let row_sum = [[18, 22, 26]].toTensor(Cpu) 64 | let col_sum = [[3], 65 | [12], 66 | [21], 67 | [30]].toTensor(Cpu) 68 | 69 | var z1 = zeros([1,3], int, Cpu) 70 | var z2 = zeros([4,1], int, Cpu) 71 | 72 | # Start with non-inplace proc 73 | check: t.agg(`+`, axis=0, start_val = z1 ) == row_sum 74 | check: t.agg(`+`, axis=1, start_val = z2 ) == col_sum 75 | 76 | # Inplace proc 77 | # z1.agg_inplace(`+=`, t, axis=0) 78 | # z2.agg_inplace(`+=`, t, axis=1) 79 | 80 | # check: z1 == row_sum 81 | # check: z2 == col_sum -------------------------------------------------------------------------------- /tests/tensors/test_ufunc.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import math, unittest 17 | 18 | suite "Universal functions": 19 | test "As type with slicing": 20 | let a = [1, 2, 3, 4].toTensor() 21 | let b = a[1..2].astype(float) 22 | check b == [2.0'f64,3.0'f64].toTensor() 23 | 24 | test "Common math functions are exported": 25 | let a = @[@[1.0,2,3],@[4.0,5,6]] 26 | let b = @[@[7.0, 8],@[9.0, 10],@[11.0, 12]] 27 | 28 | let ta = a.toTensor() 29 | let tb = b.toTensor() 30 | 31 | let expected_a = @[@[cos(1'f64),cos(2'f64),cos(3'f64)],@[cos(4'f64),cos(5'f64),cos(6'f64)]] 32 | let expected_b = @[@[ln(7'f64), ln(8'f64)],@[ln(9'f64), ln(10'f64)],@[ln(11'f64), ln(12'f64)]] 33 | 34 | check: cos(ta) == expected_a.toTensor() 35 | check: ln(tb) == expected_b.toTensor() 36 | 37 | test "Creating custom universal functions is supported": 38 | proc square_plus_one(x: int): int = x ^ 2 + 1 39 | makeUniversalLocal(square_plus_one) 40 | 41 | let c = @[@[2,4,8],@[3,9,27]] 42 | let tc = c.toTensor() 43 | 44 | let expected_c = @[@[5, 17, 65],@[10, 82, 730]] 45 | 46 | check: square_plus_one(tc) == expected_c.toTensor() 47 | 48 | ## MakeUniversal cannot change Tensor[B,T] to Tensor[B,U] for now 49 | ## map must be used instead 50 | test "Universal functions that change types are supported": 51 | let d = @[@[2,4,8],@[3,9,27]] 52 | let e = @[@["2","4","8"],@["3","9","27"]] 53 | 54 | proc stringify(n: int): string = $n 55 | # makeUniversalLocal(stringify) 56 | 57 | let td = d.toTensor() 58 | let te = e.toTensor() 59 | 60 | when compiles(td == te): check: false 61 | 62 | check: td.map(stringify) == te 63 | check: td.map(stringify)[0,1] == "4" 64 | 65 | when compileOption("boundChecks"): 66 | expect(IndexError): 67 | discard td.map(stringify)[1,3] 68 | else: 69 | echo "Bound-checking is disabled. The incorrect seq shape test has been skipped." 70 | 71 | 72 | test "Abs": 73 | let a = [-2,-1,0,1,2].toTensor() 74 | check abs(a) == [2,1,0,1,2].toTensor() 75 | let b = [-2.0,-1,0,1,2].toTensor() 76 | check abs(b) == [2.0,1,0,1,2].toTensor() -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Arraymancer v0.2.0 Sept. 24, 2017 "The Color of Magic" 4 | =========================================== 5 | 6 | I am very excited to announce the second release of Arraymancer which includes numerous improvements `blablabla` ... 7 | 8 | Without further ado: 9 | - Communauty 10 | - There is a Gitter room! 11 | - Breaking 12 | - `shallowCopy` is now `unsafeView` and accepts `let` arguments 13 | - Element-wise multiplication is now `.*` instead of `|*|` 14 | - vector dot product is now `dot` instead of `.*` 15 | - Deprecated 16 | - All tensor initialization proc have their `Backend` parameter deprecated. 17 | - `fmap` is now `map` 18 | - `agg` and `agg_in_place` are now `fold` and nothing (too bad!) 19 | 20 | - Initial support for Cuda !!! 21 | - All linear algebra operations are supported 22 | - Slicing (read-only) is supported 23 | - Transforming a slice to a new contiguous Tensor is supported 24 | - Tensors 25 | - Introduction of `unsafe` operations that works without copy: `unsafeTranspose`, `unsafeReshape`, `unsafebroadcast`, `unsafeBroadcast2`, `unsafeContiguous`, 26 | - Implicit broadcasting via `.+, .*, ./, .-` and their in-place equivalent `.+=, .-=, .*=, ./=` 27 | - Several shapeshifting operations: `squeeze`, `at` and their `unsafe` version. 28 | - New property: `size` 29 | - Exporting: `export_tensor` and `toRawSeq` 30 | - Reduce and reduce on axis 31 | - Ecosystem: 32 | - I express my deep thanks to @edubart for testing Arraymancer, contributing new functions, and improving its overall performance. He built [arraymancer-demos](https://github.com/edubart/arraymancer-demos) and [arraymancer-vision](https://github.com/edubart/arraymancer-vision),check those out you can load images in Tensor and do logistic regression on those! 33 | 34 | Also thanks to the Nim communauty on IRC/Gitter, they are a tremendous help (yes Varriount, Yardanico, Zachary, Krux). 35 | I probably would have struggled a lot more without the guidance of Andrea's code for Cuda in his [neo](https://github.com/unicredit/neo) and [nimcuda](https://github.com/unicredit/nimcuda) library. And obviously Araq and Dom for Nim which is an amazing language for performance, productivity, safety and metaprogramming. 36 | 37 | 38 | Minor revisions v0.1.1 to v0.1.3 39 | ================================ 40 | 41 | Arraymancer v0.1.0. July 12, 2017 "Magician Apprentice" 42 | =========================================== 43 | 44 | First public release. 45 | 46 | Include: 47 | 48 | - converting from deep nested proc or array 49 | - Slicing, and slice mutation 50 | - basic linear algebra operations, 51 | - reshaping, broadcasting, concatenating, 52 | - universal functions 53 | - iterators (in-place, axis, inline and closure versions) 54 | - BLAS and BLIS support for fast linear algebra 55 | -------------------------------------------------------------------------------- /src/arraymancer/fallback/blas_l3_gemm_micro_kernel.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | template gemm_micro_kernelT[T]( 16 | kc: int, 17 | alpha: T, 18 | A: typed, offA: int, 19 | B: typed, offB: int, 20 | beta: T, 21 | C: typed, 22 | offC: int, 23 | incRowC, incColC: int): untyped = 24 | 25 | {.pragma: align16, codegenDecl: "$# $# __attribute__((aligned(16)))".} 26 | var AB{.align16.}: array[MR*NR, T] 27 | var voffA = offA 28 | var voffB = offB 29 | 30 | ## Compute A*B 31 | for _ in 0 ..< kc: 32 | for j in 0 ..< NR: 33 | for i in 0 .. < MR: 34 | AB[i + j*MR] += A[i + voffA] * B[j + voffB] 35 | voffA += MR 36 | voffB += NR 37 | 38 | ## C <- beta * C 39 | if beta == 0.T: 40 | for j in 0 ..< NR: 41 | for i in 0 ..< MR: 42 | C[i*incRowC + j*incColC + offC] = 0.T 43 | elif beta != 1.T: 44 | for j in 0 ..< NR: 45 | for i in 0 ..< MR: 46 | C[i*incRowC + j*incColC + offC] *= beta 47 | 48 | ## C <- C + alpha*AB, alpha !=0 49 | if alpha == 1.T: 50 | for j in 0 ..< NR: 51 | for i in 0 ..< MR: 52 | C[i*incRowC + j*incColC + offC] += AB[i + j*MR] 53 | else: 54 | for j in 0 ..< NR: 55 | for i in 0 ..< MR: 56 | C[i*incRowC + j*incColC + offC] += alpha*AB[i + j*MR] 57 | 58 | proc gemm_micro_kernel[T](kc: int, 59 | alpha: T, 60 | A: ref array[MCKC, T], offA: int, 61 | B: ref array[KCNC, T], offB: int, 62 | beta: T, 63 | C: var ref array[MRNR, T], 64 | offC: int, 65 | incRowC, incColC: int) 66 | {.noSideEffect.} = 67 | gemm_micro_kernelT(kc, alpha, A, offA, B, offB, beta, C, offC, incRowC, incColc) 68 | 69 | proc gemm_micro_kernel[T](kc: int, 70 | alpha: T, 71 | A: ref array[MCKC, T], offA: int, 72 | B: ref array[KCNC, T], offB: int, 73 | beta: T, 74 | C: var seq[T], 75 | offC: int, 76 | incRowC, incColC: int) 77 | {.noSideEffect.} = 78 | gemm_micro_kernelT(kc, alpha, A, offA, B, offB, beta, C, offC, incRowC, incColc) -------------------------------------------------------------------------------- /tests/tensors/test_shapeshifting_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, future, sequtils 17 | 18 | suite "CUDA: Shapeshifting": 19 | ## Note: by default (momentarily), CudaTensors are column-major 20 | test "Contiguous conversion": 21 | let a = [7, 4, 3, 1, 8, 6, 22 | 8, 1, 6, 2, 6, 6, 23 | 2, 0, 4, 3, 2, 0].toTensor.reshape([3,6]).astype(float).cuda 24 | 25 | # Tensor of shape 3x6 of type "int" on backend "Cpu" 26 | # |7 4 3 1 8 6| 27 | # |8 1 6 2 6 6| 28 | # |2 0 4 3 2 0| 29 | 30 | let b = a.unsafeContiguous() 31 | check: b.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0] 32 | 33 | # a is already contiguous, even if wrong layout. 34 | # Nothing should be done 35 | let c = a.unsafeContiguous(colMajor) 36 | check: c.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0] 37 | 38 | # force parameter has been used. 39 | # Layout will change even if a was contiguous 40 | let d = a.unsafeContiguous(colMajor, force = true) 41 | check: d.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0] 42 | 43 | 44 | # Now test with a non-contiguous tensor 45 | let u = a[_,0..1] 46 | check: u.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0] 47 | check: u.cpu == [7.0,4,8,1,2,0].toTensor.reshape([3,2]) 48 | 49 | check: u.unsafeContiguous(rowMajor, force=true).cpu.toRawSeq == @[7.0,4,8,1,2,0] 50 | 51 | test "Unsafe reshape": 52 | block: 53 | let a = toSeq(1..4).toTensor().astype(float).cuda 54 | var a_view = a.unsafeReshape(2,2) 55 | check: a_view.cpu == [[1.0,2],[3.0,4]].toTensor() 56 | 57 | # TODO 58 | # a_view[_, _] = 0.0 59 | # check: a.cpu == [0.0,0,0,0].toTensor() 60 | 61 | # on slices 62 | block: 63 | # not that 'a' here a let variable, however 64 | # unsafeView and unsafeReshape allow us to 65 | # modify its elements value 66 | let a = toSeq(1..4).toTensor().astype(float).cuda 67 | var a_view = a[1..2].unsafeReshape(1,2) # a[1..2] == a.unsafeSlice(1..2) for CudaTensors 68 | check: a_view.cpu == [[2.0,3]].toTensor() 69 | 70 | # TODO: pending slice assignation 71 | # a_view[_, _].cpu = 0 72 | # check: a.cpu == [1.0,0,0,4].toTensor() -------------------------------------------------------------------------------- /src/nn/layers/linear.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils 16 | import ./layer 17 | 18 | type LinearGate* {.final.} [TT] = ref object of Gate[TT] 19 | ## TODO: use fused AddMatMul gate: C <- alpha AB + beta C 20 | x, W, b: Variable[TT] 21 | 22 | method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}= 23 | new result 24 | 25 | result.tape = a.tape 26 | result.value = self.W.value * a.value 27 | if not self.b.isNil: 28 | result.value .+= self.b.value # Bias is broadcasted other the whole batch size 29 | result.grad = zeros[getSubType(TT)](result.value.shape) 30 | 31 | method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}= 32 | result[0] = self.W.value.unsafeTranspose * gradient 33 | result[1] = gradient * self.x.value.unsafeTranspose 34 | 35 | if not self.b.isNil: 36 | result[2] = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html 37 | 38 | proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] = 39 | ## Input: 40 | ## - A x Variable of shape @[in_features, batch_size] 41 | ## - A weight Variable of shape @[out_features, in_features] 42 | ## - Optionally a bias Variable of shape @[out_features, 1] 43 | ## 44 | ## Return: Weight * x + bias 45 | 46 | when compileOption("boundChecks"): 47 | if x.value.rank > 2: 48 | raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)") 49 | 50 | check_ctx(x, weight) 51 | if not bias.isNil: 52 | check_ctx(x, bias) 53 | 54 | # weight has shape: Out_features * In_features 55 | # bias must have shape: Out_features * 1 56 | if not bias.isNil and not (bias.value.shape == @[weight.value.shape[0], 1]): 57 | raise newException(ValueError, "Incompatible shape: bias must be a vector of shape @[out_features, 1]") 58 | 59 | # Gate 60 | var gate: LinearGate[TT] 61 | new gate 62 | gate.arity = if bias.isNil: 2 else: 3 63 | gate.x = x 64 | gate.W = weight 65 | gate.b = bias 66 | 67 | # Node 68 | var node: Node[TT] 69 | new node 70 | 71 | node.gate = gate 72 | node.parents[0] = x 73 | node.parents[1] = weight 74 | if not bias.isNil: 75 | node.parents[2] = bias 76 | 77 | x.tape.push(node) 78 | 79 | # Resulting var 80 | result = gate.forward(x) 81 | result.ancestor = node 82 | node.child = result -------------------------------------------------------------------------------- /src/arraymancer/ufunc.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc astype*[T, U](t: Tensor[T], typ: typedesc[U]): Tensor[U] = 16 | ## Apply type conversion on the whole tensor 17 | result = t.map(x => x.U) 18 | 19 | 20 | # Built-in nim functions that doesn't work with makeUniversal 21 | proc abs*[T](t: Tensor[T]): Tensor[T] = 22 | t.mapT(abs(x)) 23 | 24 | 25 | # ############################################################# 26 | # Autogen universal functions 27 | 28 | # Note, the makeUniversal/Local documentation gets duplicated in docs at each template call 29 | # And shouldn't use ## 30 | template makeUniversal*(func_name: untyped) = 31 | # Lift an unary function into an exported universal function. 32 | # 33 | # Universal functions apply element-wise. 34 | # 35 | # ``makeUniversal`` does not work when the internal type of the Tensor changes, 36 | # for example, a function "isEven: int -> bool". 37 | # Use ``map`` in this case instead instead 38 | proc func_name*(t: Tensor): Tensor = 39 | ## Auto-generated universal version of the function. 40 | ## 41 | ## The function can be used directly on tensors and will work element-wise. 42 | t.mapT(func_name(x)) 43 | export func_name 44 | 45 | template makeUniversalLocal*(func_name: untyped) = 46 | # Lift an unary function into a non-exported universal function. 47 | # 48 | # Universal functions apply element-wise. 49 | # 50 | # ``makeUniversalLocal`` does not work when the internal type of the Tensor changes, 51 | # for example, a function "isEven: int -> bool". 52 | # Use ``map`` in this case instead instead 53 | proc func_name(t: Tensor): Tensor = 54 | t.mapT(func_name(x)) 55 | 56 | # Unary functions from Nim math library 57 | 58 | makeUniversal(fac) 59 | #makeUniversal(classify) 60 | #makeUniversal(isPowerOfTwo) 61 | #makeUniversal(nextPowerOfTwo) 62 | #makeUniversal(countBits32) 63 | #makeUniversal(sum) 64 | makeUniversal(sqrt) 65 | makeUniversal(cbrt) 66 | makeUniversal(ln) 67 | makeUniversal(log10) 68 | makeUniversal(log2) 69 | makeUniversal(exp) 70 | makeUniversal(arccos) 71 | makeUniversal(arcsin) 72 | makeUniversal(arctan) 73 | makeUniversal(cos) 74 | makeUniversal(cosh) 75 | makeUniversal(sinh) 76 | makeUniversal(sin) 77 | makeUniversal(tan) 78 | makeUniversal(tanh) 79 | makeUniversal(erf) 80 | makeUniversal(erfc) 81 | makeUniversal(lgamma) 82 | makeUniversal(tgamma) 83 | makeUniversal(floor) 84 | makeUniversal(ceil) 85 | makeUniversal(trunc) 86 | makeUniversal(round) 87 | #makeUniversal(splitDecimal) 88 | makeUniversal(degToRad) 89 | makeUniversal(radToDeg) 90 | -------------------------------------------------------------------------------- /src/nn_primitives/activation_primitives.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../arraymancer, math 16 | 17 | # Neural net activation functions that works directly on Tensors 18 | 19 | 20 | # Note: 21 | # 1. Canonical sigmoid "f(x) = 1 / (1 + exp(-x))" is unstable 22 | # for negative values < 709 (for float64) 23 | # 24 | # 2. Alternative expression stable for negative but unstable for positive is 25 | # "f(x) = exp(x) / (1 + exp(x))" 26 | # 27 | # 3. Introducing branching would be very costly. 28 | # 29 | # 4. Using tanh as 0.5 * (tanh(0.5 * x) + 1) is better than branching 30 | # but slow as well 31 | # 32 | # 5. Another alternative would be to clip x to max (-500, x) to avoid this instability 33 | # 34 | # Benchmarks available in the benchmark folder 35 | # 36 | 37 | proc sigmoid*[T: SomeReal](t: Tensor[T]): Tensor[T] {.inline.}= 38 | ## Logistic sigmoid activation function, :math:`f(x) = 1 / (1 + \exp(-x))` 39 | ## Note: Canonical sigmoid is not stable for large negative value 40 | 41 | proc sigmoid_closure(x: T): T = 1.T / (1.T + exp(-x)) 42 | 43 | # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T) 44 | 45 | return t.map(sigmoid_closure) 46 | 47 | proc msigmoid*[T: SomeReal](t: var Tensor[T]): Tensor[T] {.inline.}= 48 | ## Logistic sigmoid activation function, :math:`f(x) = 1 / (1 + \exp(-x))` 49 | ## Note: Canonical sigmoid is not stable for large negative value 50 | 51 | proc sigmoid_closure(x: T): T = 1.T / (1.T + exp(-x)) 52 | 53 | # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T) 54 | 55 | return t.map(sigmoid_closure) 56 | 57 | proc relu*[T](t: Tensor[T]): Tensor[T] {.inline.}= 58 | proc relu_closure(x: T): T = 59 | max(0.T, x) 60 | t.map(relu_closure) 61 | 62 | proc mrelu*[T](t: var Tensor[T]): Tensor[T] {.inline.}= 63 | proc relu_closure(x: T): T = 64 | max(0.T, x) 65 | t.apply(relu_closure) 66 | 67 | 68 | proc relu_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.inline.}= 69 | proc relu_backward_closure[T](x: T): T = 70 | if x <= 0.T: 71 | return 0.T 72 | return 1.T 73 | 74 | result = cached_tensor.map(relu_backward_closure) 75 | result .*= gradient 76 | 77 | proc sigmoid_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.inline.}= 78 | proc sigmoid_backward_closure[T](x: T): T = 79 | ## We suppose the input was already passed through the logistic sigmoid. 80 | ## Derivative is f' = f * (1 - f) 81 | x * (1 - x) 82 | 83 | result = cached_tensor.map(sigmoid_backward_closure) 84 | result .*= gradient -------------------------------------------------------------------------------- /src/arraymancer.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | when defined(doc): 16 | include ../docs/autogen_nim_API 17 | 18 | import sequtils, strutils, future, algorithm, nimblas, math, typetraits, macros, random 19 | 20 | # Export OrderType (rowMajor, colMajor) from nimblas 21 | export OrderType 22 | 23 | # include ../docs/autogen_nim_API 24 | include arraymancer/utils/functional, 25 | arraymancer/utils/nested_containers, 26 | arraymancer/utils/ast_utils, 27 | arraymancer/global_config, 28 | arraymancer/backend/blis, 29 | arraymancer/backend/openmp, 30 | arraymancer/data_structure, 31 | arraymancer/data_structure_helpers, 32 | arraymancer/init_cpu, 33 | arraymancer/init_deprecated_0_1_0, 34 | arraymancer/init_cpu_deprecated_0_2_0, # source of deprecation spam https://github.com/nim-lang/Nim/issues/6436 35 | arraymancer/accessors, 36 | arraymancer/accessors_macros_syntax, 37 | arraymancer/accessors_macros_desugar, 38 | arraymancer/accessors_macros_read, 39 | arraymancer/accessors_macros_write, 40 | arraymancer/comparison, 41 | arraymancer/higher_order, 42 | arraymancer/higher_order_deprecated, 43 | arraymancer/shapeshifting, 44 | arraymancer/display, 45 | arraymancer/ufunc, 46 | arraymancer/operators_blas_l1, 47 | arraymancer/fallback/blas_l3_gemm, 48 | arraymancer/fallback/naive_l2_gemv, 49 | arraymancer/operators_blas_l2l3, 50 | arraymancer/operators_broadcasted, 51 | arraymancer/math_functions, 52 | arraymancer/filling_data, 53 | arraymancer/aggregate, 54 | arraymancer/term_rewriting, 55 | arraymancer/shortcuts, 56 | arraymancer/exporting 57 | 58 | 59 | when defined(cuda): 60 | # Nimcuda poses issues with Nim docgen 61 | import nimcuda/[cuda_runtime_api, driver_types, cublas_api, cublas_v2, nimcuda] 62 | 63 | when defined(cuda) or defined(doc): 64 | include ./arraymancer/backend/cuda_global_state, 65 | ./arraymancer/backend/cuda, 66 | ./arraymancer/backend/cublas, 67 | # ./arraymancer/backend/cublas_helper_proc, 68 | ./arraymancer/init_cuda, 69 | ./arraymancer/accessors_cuda, 70 | ./arraymancer/display_cuda, 71 | ./arraymancer/elementwise_cuda.nim, 72 | ./arraymancer/elementwise_glue_cuda.nim, 73 | ./arraymancer/higher_order_cuda, 74 | ./arraymancer/operators_blas_l1_cuda, 75 | ./arraymancer/operators_blas_l2l3_cuda, 76 | ./arraymancer/shapeshifting_cuda -------------------------------------------------------------------------------- /tests/tensors/test_accessors.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math 17 | 18 | 19 | suite "Accessing and setting tensor values": 20 | test "Accessing and setting a single value": 21 | var a = zeros[int](@[2,3,4]) 22 | a[1,2,2] = 122 23 | check: a[1,2,2] == 122 24 | 25 | var b = zeros[int](@[3,4]) 26 | b[1,2] = 12 27 | check: b[1,2] == 12 28 | b[0,0] = 999 29 | check: b[0,0] == 999 30 | b[2,3] = 111 31 | check: b[2,3] == 111 32 | 33 | 34 | when compileOption("boundChecks"): 35 | test "Out of bounds checking": 36 | var a = newTensor(@[2,3,4], int, Backend.Cpu) 37 | expect(IndexError): 38 | a[2,0,0] = 200 39 | var b = newTensor(@[3,4], int, Backend.Cpu) 40 | expect(IndexError): 41 | b[3,4] = 999 42 | expect(IndexError): 43 | discard b[-1,0] 44 | expect(IndexError): 45 | discard b[0,-2] 46 | else: 47 | echo "Bound-checking is disabled. The out-of-bounds checking test has been skipped." 48 | 49 | test "Iterators": 50 | const 51 | a = @[1, 2, 3, 4, 5] 52 | b = @[1, 2, 3] 53 | var 54 | vd: seq[seq[int]] 55 | row: seq[int] 56 | vd = newSeq[seq[int]]() 57 | for i, aa in a: 58 | row = newSeq[int]() 59 | vd.add(row) 60 | for j, bb in b: 61 | vd[i].add(aa^bb) 62 | 63 | let nda_vd = vd.toTensor() 64 | 65 | let expected_seq = @[1,1,1,2,4,8,3,9,27,4,16,64,5,25,125] 66 | 67 | var seq_val: seq[int] = @[] 68 | for i in nda_vd: 69 | seq_val.add(i) 70 | 71 | check: seq_val == expected_seq 72 | 73 | var seq_validx: seq[tuple[idx: seq[int], val: int]] = @[] 74 | for i,j in nda_vd: 75 | seq_validx.add((i,j)) 76 | 77 | check: seq_validx[0] == (@[0,0], 1) 78 | check: seq_validx[10] == (@[3,1], 16) 79 | 80 | let t_nda = transpose(nda_vd) 81 | 82 | var seq_transpose: seq[tuple[idx: seq[int], val: int]] = @[] 83 | for i,j in t_nda: 84 | seq_transpose.add((i,j)) 85 | 86 | check: seq_transpose[0] == (@[0,0], 1) 87 | check: seq_transpose[8] == (@[1,3], 16) 88 | 89 | test "indexing + in-place operator": 90 | var a = newTensor[int]([3,3]) 91 | 92 | a[1,1] += 10 93 | 94 | a[1,1] *= 20 95 | 96 | check: a == [[0,0,0],[0,200,0],[0,0,0]].toTensor 97 | 98 | test "Zipping two tensors": 99 | let a = [[1,2],[3,4]].toTensor() 100 | let b = [[5,6],[7,8]].toTensor() 101 | 102 | var res = 0 103 | for ai, bi in zip(a, b): 104 | res += ai + bi 105 | check: res == 36 -------------------------------------------------------------------------------- /src/arraymancer/operators_blas_l2l3_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc cudaMV_y_eq_aAx_p_by[T: SomeReal]( 16 | alpha: T, a, x: CudaTensor[T], 17 | beta: T, y: var CudaTensor[T]) = 18 | # Matrix-Vector: y = alpha A matvecmul x + beta y 19 | 20 | # TODO: remove this contiguous layout constraint 21 | if not a.isContiguous: 22 | raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") 23 | 24 | let 25 | a_is_colMajor = a.is_F_contiguous 26 | 27 | transpose_A = if a_is_colMajor: CUBLAS_OP_N 28 | else: CUBLAS_OP_T 29 | ld_A = if a_is_colMajor: a.strides[1] 30 | else: a.strides[0] 31 | 32 | cublas_gemv( 33 | transpose_A, a.shape[0], a.shape[1], 34 | alpha, a.get_data_ptr, ld_A, 35 | x.get_data_ptr, x.strides[0], 36 | beta, y.get_data_ptr, y.strides[0]) 37 | 38 | proc cudaMM_C_eq_aAB_p_bC[T: SomeReal]( 39 | alpha: T, a, b: CudaTensor[T], 40 | beta: T, c: var CudaTensor[T]) = 41 | # Matrix: C = alpha A matmul B + beta C 42 | 43 | # TODO: remove this contiguous layout constraint 44 | if not (a.isContiguous and b.isContiguous): 45 | raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") 46 | 47 | let 48 | a_is_colMajor = a.is_F_contiguous 49 | b_is_colMajor = b.is_F_contiguous 50 | 51 | transpose_A = if a_is_colMajor: CUBLAS_OP_N 52 | else: CUBLAS_OP_T 53 | ld_A = if a_is_colMajor: a.strides[1] 54 | else: a.strides[0] 55 | 56 | transpose_B = if b_is_colMajor: CUBLAS_OP_N 57 | else: CUBLAS_OP_T 58 | ld_B = if b_is_colMajor: b.strides[1] 59 | else: b.strides[0] 60 | 61 | ld_C = c.strides[1] # C is always F contiguous (TODO test) 62 | 63 | cublas_gemm(transpose_A, transpose_B, 64 | a.shape[0], b.shape[1], a.shape[1], 65 | alpha, a.get_data_ptr, ld_A, 66 | b.get_data_ptr, ld_B, 67 | beta, c.get_data_ptr, ld_C) 68 | 69 | proc `*`*[T: SomeReal](a, b: CudaTensor[T]): CudaTensor[T] = 70 | ## Matrix multiplication (Matrix-Matrix and Matrix-Vector) on CUDA 71 | 72 | if a.rank == 2 and b.rank == 2: 73 | when compileOption("boundChecks"): check_matmat(a,b) 74 | result = newCudaTensor[T]([a.shape[0], b.shape[1]]) 75 | cudaMM_C_eq_aAB_p_bC(1.T, a, b, 0.T, result) 76 | elif a.rank == 2 and b.rank == 1: 77 | when compileOption("boundChecks"): check_matvec(a,b) 78 | result = newCudaTensor[T]([a.shape[0]]) 79 | cudaMV_y_eq_aAx_p_by(1.T,a, b, 0.T, result) 80 | else: raise newException(ValueError, "Matrix-Matrix or Matrix-Vector multiplication valid only if first Tensor is a Matrix and second is a Matrix or Vector") -------------------------------------------------------------------------------- /tests/tensors/test_init.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../../src/arraymancer 16 | import unittest, math, sequtils 17 | 18 | suite "Creating a new Tensor": 19 | test "Creating from sequence": 20 | let t1 = @[1,2,3].toTensor() 21 | check: t1.shape == @[3] 22 | check: t1.rank == 1 23 | 24 | const 25 | a = @[1, 2, 3, 4, 5] 26 | b = @[1, 2, 3, 4, 5] 27 | 28 | var 29 | vandermonde: seq[seq[int]] 30 | row: seq[int] 31 | 32 | vandermonde = newSeq[seq[int]]() 33 | 34 | for i, aa in a: 35 | row = newSeq[int]() 36 | vandermonde.add(row) 37 | for j, bb in b: 38 | vandermonde[i].add(aa^bb) 39 | 40 | let t2 = vandermonde.toTensor() 41 | check: t2.rank == 2 42 | check: t2.shape == @[5, 5] 43 | 44 | let nest3 = @[ 45 | @[ 46 | @[1,2,3], 47 | @[1,2,3] 48 | ], 49 | @[ 50 | @[3,2,1], 51 | @[3,2,1] 52 | ], 53 | @[ 54 | @[4,4,5], 55 | @[4,4,4] 56 | ], 57 | @[ 58 | @[6,6,6], 59 | @[6,6,6] 60 | ] 61 | ] 62 | 63 | let t3 = nest3.toTensor() 64 | check: t3.rank == 3 65 | check: t3.shape == @[4, 2, 3] # 4 rows, 2 cols, 3 depth. depth indices moves the fastest. Same scheme as Numpy. 66 | 67 | let u = @[@[1.0, -1, 2],@[0.0, -1]] 68 | 69 | when compileOption("boundChecks"): 70 | expect(IndexError): 71 | discard u.toTensor() 72 | else: 73 | echo "Bound-checking is disabled. The incorrect seq shape test has been skipped." 74 | 75 | test "Check that Tensor shape is in row-by-column order": 76 | let s = @[@[1,2,3],@[3,2,1]] 77 | let t = s.toTensor() 78 | 79 | check: t.shape == @[2,3] 80 | 81 | let u = newTensor[int](@[2,3]) 82 | check: u.shape == @[2,3] 83 | 84 | check: u.shape == t.shape 85 | 86 | test "Zeros": 87 | block: 88 | let t = zeros[float]([4,4,4]) 89 | for v in t.items: 90 | check v == 0.0f 91 | block: 92 | let t = zeros[int]([4,4,4]) 93 | for v in t.items: 94 | check v == 0 95 | 96 | test "Ones": 97 | block: 98 | let t = ones[float]([4,4,4]) 99 | for v in t.items: 100 | check v == 1.0f 101 | block: 102 | let t = ones[int]([4,4,4]) 103 | for v in t.items: 104 | check v == 1 105 | 106 | test "Filled new tensor": 107 | block: 108 | let t = newTensorWith([4,4,4], 2.0f) 109 | for v in t.items: 110 | check v == 2.0f 111 | block: 112 | let t = newTensorWith([4,4,4], 2) 113 | for v in t.items: 114 | check v == 2 115 | 116 | test "Random tensor": 117 | block: 118 | # Check that randomTensor doesn't silently convert float32 to float64 119 | let a = randomTensor([3, 4], 100'f32) 120 | 121 | check: a[0,0] is float32 122 | # TODO add tests for randomTensor 123 | 124 | -------------------------------------------------------------------------------- /src/arraymancer/operators_blas_l1.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Bounds checking functions 17 | proc check_dot_prod(a, b:AnyTensor) {.noSideEffect.}= 18 | if a.rank != 1 or b.rank != 1: raise newException(ValueError, "Dot product is only supported for vectors (tensors of rank 1)") 19 | if a.shape != b.shape: raise newException(ValueError, "Vector should be the same length") 20 | 21 | # #################################################################### 22 | # BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix) 23 | 24 | # FIXME: Can't use built-in proc `+` in map: https://github.com/nim-lang/Nim/issues/5702 25 | # map2(a, `+`, b) 26 | 27 | proc dot*[T: SomeReal](a, b: Tensor[T]): T {.noSideEffect.} = 28 | ## Vector to Vector dot (scalar) product 29 | when compileOption("boundChecks"): check_dot_prod(a,b) 30 | return dot(a.shape[0], a.get_data_ptr, a.strides[0], b.get_data_ptr, b.strides[0]) 31 | 32 | proc dot*[T: SomeInteger](a, b: Tensor[T]): T {.noSideEffect.} = 33 | ## Vector to Vector dot (scalar) product 34 | # Fallback for non-floats 35 | when compileOption("boundChecks"): check_dot_prod(a,b) 36 | for ai, bi in zip(a, b): 37 | result += ai * bi 38 | 39 | # ######################################################### 40 | # # Tensor-Tensor linear algebra 41 | # # shape checks are done in map2 proc 42 | 43 | proc `+`*[T: SomeNumber](a, b: Tensor[T]): Tensor[T] = 44 | ## Tensor addition 45 | map2T(a, b, x + y) 46 | 47 | proc `-`*[T: SomeNumber](a, b: Tensor[T]): Tensor[T] = 48 | ## Tensor substraction 49 | map2T(a, b, x - y) 50 | 51 | # ######################################################### 52 | # # Tensor-Tensor in-place linear algebra 53 | 54 | proc `+=`*[T: SomeNumber](a: var Tensor[T], b: Tensor[T]) = 55 | ## Tensor in-place addition 56 | a.apply2T(b, x + y) 57 | 58 | proc `-=`*[T: SomeNumber](a: var Tensor[T], b: Tensor[T]) = 59 | ## Tensor in-place substraction 60 | a.apply2T(b, x - y) 61 | 62 | # ######################################################### 63 | # # Tensor-scalar linear algebra 64 | 65 | proc `*`*[T: SomeNumber](a: T, t: Tensor[T]): Tensor[T] = 66 | ## Element-wise multiplication by a scalar 67 | t.mapT(x * a) 68 | 69 | proc `*`*[T: SomeNumber](t: Tensor[T], a: T): Tensor[T] = 70 | ## Element-wise multiplication by a scalar 71 | a * t 72 | 73 | proc `/`*[T: SomeReal](t: Tensor[T], a: T): Tensor[T] = 74 | ## Element-wise division by a float scalar 75 | t.mapT(x / a) 76 | 77 | proc `div`*[T: SomeInteger](t: Tensor[T], a: T): Tensor[T] = 78 | ## Element-wise division by an integer 79 | t.mapT(x div a) 80 | 81 | # ######################################################### 82 | # # Tensor-scalar in-place linear algebra 83 | 84 | proc `*=`*[T: SomeNumber](t: var Tensor[T], a: T) = 85 | ## Element-wise multiplication by a scalar (in-place) 86 | t.applyT(x * a) 87 | 88 | proc `/=`*[T: SomeReal](t: var Tensor[T], a: T) = 89 | ## Element-wise division by a scalar (in-place) 90 | t.applyT(x / a) 91 | 92 | proc `/=`*[T: SomeInteger](t: var Tensor[T], a: T) = 93 | ## Element-wise division by a scalar (in-place) 94 | t.applyT(x div a) 95 | -------------------------------------------------------------------------------- /src/arraymancer/backend/cublas_helper_proc.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Cublas helper procs for L1 BLAS 17 | # With custom kernels they shouldn't be needed anymore 18 | # They however have a nice interface to call for fused aX + Y or aA + Bb 19 | 20 | ################################################# 21 | ## In-place 22 | 23 | proc cudaVV_A_eq_A_p_bB[T: SomeReal]( 24 | a: var CudaTensor[T], beta: T, b: CudaTensor[T]) {.inline, deprecated.}= 25 | # Vector: A = A + beta B 26 | 27 | cublas_axpy(a.shape[0], 28 | beta, 29 | b.get_data_ptr, b.strides[0], 30 | a.get_data_ptr, a.strides[0]) 31 | 32 | proc cudaMM_A_eq_aA_p_bB[T: SomeReal]( 33 | alpha: T, a: var CudaTensor[T], 34 | beta: T, b: CudaTensor[T]) {.deprecated.}= 35 | # Matrix: A = alpha A + beta B 36 | 37 | # TODO: remove this contiguous layout constraint (via conversion or custom kernel) 38 | if not (isContiguous(a) and isContiguous(b)): 39 | raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") 40 | 41 | if not is_F_contiguous(a): 42 | raise newException(ValueError, "NotImplemented: the modified tensor must have a column-major layout") 43 | 44 | let 45 | b_is_colMajor = b.is_F_contiguous 46 | 47 | transpose_B = if b_is_colMajor: CUBLAS_OP_N 48 | else: CUBLAS_OP_T 49 | 50 | ld_B = if b_is_colMajor: b.strides[1] 51 | else: b.strides[0] 52 | 53 | cublas_geam(CUBLAS_OP_N, transpose_B, 54 | a.shape[0], a.shape[1], 55 | alpha, 56 | a.get_data_ptr, a.strides[1], 57 | beta, 58 | b.get_data_ptr, ld_B, 59 | a.get_data_ptr, a.strides[1]) 60 | # In column-majour layout a.shape[0] == a.strides[1] 61 | 62 | ############################################################# 63 | ## Out-of-place 64 | 65 | proc cudaVV_C_eq_A_p_bB[T: SomeReal]( a: CudaTensor[T], 66 | beta: T, b: CudaTensor[T], 67 | result: var CudaTensor[T]) {.inline, deprecated.}= 68 | # Vector: C = A + beta B 69 | result = newCudaTensor[T](a.shape) 70 | 71 | cublas_copy(a.len, a.get_data_ptr, a.strides[0], 72 | result.get_data_ptr, result.strides[0]) 73 | 74 | cudaVV_A_eq_A_p_bB(result, beta, b) 75 | 76 | proc cudaMM_C_eq_aA_p_aB[T: SomeReal](alpha: T, a: CudaTensor[T], 77 | beta: T, b: CudaTensor[T], 78 | result: var CudaTensor[T]) {.deprecated.}= 79 | # TODO: remove this contiguous layout constraint (via conversion or custom kernel) 80 | if not (isContiguous(a) and isContiguous(b)): 81 | raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous") 82 | 83 | result = newCudaTensor[T](a.shape) # result is colMajor 84 | 85 | let 86 | a_is_colMajor = a.is_F_contiguous 87 | b_is_colMajor = b.is_F_contiguous 88 | 89 | transpose_A = if a_is_colMajor: CUBLAS_OP_N 90 | else: CUBLAS_OP_T 91 | ld_A = if a_is_colMajor: a.strides[1] 92 | else: a.strides[0] 93 | 94 | transpose_B = if b_is_colMajor: CUBLAS_OP_N 95 | else: CUBLAS_OP_T 96 | ld_B = if b_is_colMajor: b.strides[1] 97 | else: b.strides[0] 98 | 99 | cublas_geam(transpose_A, transpose_B, 100 | a.shape[0], a.shape[1], 101 | alpha, 102 | a.get_data_ptr, ld_A, 103 | beta, 104 | b.get_data_ptr, ld_B, 105 | result.get_data_ptr, result.strides[1]) -------------------------------------------------------------------------------- /src/arraymancer/operators_blas_l1_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # #################################################################### 16 | # BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix) 17 | 18 | proc dot*[T: SomeReal](a, b: CudaTensor[T]): T {.inline.}= 19 | ## Vector to Vector dot (scalar) product 20 | when compileOption("boundChecks"): check_dot_prod(a,b) 21 | cublas_dot( a.shape[0], 22 | a.get_data_ptr, a.strides[0], 23 | b.get_data_ptr, b.strides[0], 24 | addr result) 25 | 26 | proc cuda_inPlaceAdd = discard # This is a hack so that the symbol is open 27 | cuda_assign_glue(cuda_inPlaceAdd, "InPlaceAddOp") 28 | 29 | proc `+=`*[T: SomeReal](a: var CudaTensor[T], b: CudaTensor[T]) = 30 | ## CudaTensor in-place addition 31 | 32 | when compileOption("boundChecks"): 33 | check_elementwise(a,b) 34 | 35 | cuda_assign_call(cuda_inPlaceAdd, a, b) 36 | 37 | # TODO: if a and b share the same location, TEST 38 | 39 | proc cuda_Add = discard # This is a hack so that the symbol is open 40 | cuda_binary_glue(cuda_Add, "AddOp") 41 | 42 | proc `+`*[T: SomeReal](a,b: CudaTensor[T]): CudaTensor[T] = 43 | ## CudaTensor addition 44 | 45 | when compileOption("boundChecks"): 46 | check_elementwise(a,b) 47 | 48 | result = newCudaTensor[T](a.shape) 49 | cuda_binary_call(cuda_Add, result, a, b) 50 | 51 | proc cuda_inPlaceSub = discard # This is a hack so that the symbol is open 52 | cuda_assign_glue(cuda_inPlaceSub, "InPlaceSubOp") 53 | 54 | proc `-=`*[T: SomeReal](a: var CudaTensor[T], b: CudaTensor[T]) = 55 | ## CudaTensor in-place substraction 56 | 57 | when compileOption("boundChecks"): check_elementwise(a,b) 58 | 59 | cuda_assign_call(cuda_inPlaceSub, a, b) 60 | 61 | # TODO: if a and b share the same location, TEST 62 | 63 | 64 | proc cuda_Sub = discard # This is a hack so that the symbol is open 65 | cuda_binary_glue(cuda_Sub, "SubOp") 66 | 67 | proc `-`*[T: SomeReal](a,b: CudaTensor[T]): CudaTensor[T] = 68 | ## CudaTensor substraction 69 | 70 | when compileOption("boundChecks"): check_elementwise(a,b) 71 | 72 | result = newCudaTensor[T](a.shape) 73 | cuda_binary_call(cuda_Sub, result, a, b) 74 | 75 | proc `*=`*[T:SomeReal](t: var CudaTensor[T]; a: T) {.inline.}= 76 | ## CudaTensor inplace multiplication by a scalar 77 | 78 | # We multiply all elements of the CudaTensor regardless of shape/strides 79 | # So this operation can be applied to tensors of all ranks. 80 | # Hence we use the whole allocated length and a stride of 1 81 | cublas_scal(t.data.len, a, t.get_data_ptr, 1) 82 | 83 | proc `*`*[T:SomeReal](a: T, t: CudaTensor[T]): CudaTensor[T] {.inline.}= 84 | ## CudaTensor multiplication by a scalar 85 | 86 | # TODO replace by a custom kernel 87 | # Instead of a full clone we keep only the useful which is advantageous if t was a slice 88 | # It also makes it contiguous 89 | result = t.clone() 90 | result *= a 91 | 92 | proc `*`*[T:SomeReal](t: CudaTensor[T], a: T): CudaTensor[T] {.inline.}= 93 | ## CudaTensor multiplication by a scalar 94 | a * t 95 | 96 | proc `/=`*[T:SomeReal](t: var CudaTensor[T]; a: T) {.inline.}= 97 | ## CudaTensor in-place division by a scalar 98 | t *= (1/a) 99 | 100 | proc `/`*[T:SomeReal](t: CudaTensor[T], a: T): CudaTensor[T] {.inline.}= 101 | ## CudaTensor division by a scalar 102 | 103 | # TODO replace by a custom kernel 104 | # Instead of a full clone we keep only the useful which is advantageous if t was a slice 105 | # It also makes it contiguous 106 | # Furthermore doing t[i]/a instead of 1/a * t[i] will be much better for speed and numerical stability 107 | (1/a) * t 108 | 109 | proc `/`*[T:SomeReal](a: T, t: CudaTensor[T]): CudaTensor[T] {.inline.}= 110 | ## CudaTensor division by a scalar 111 | (1/a) * t -------------------------------------------------------------------------------- /src/arraymancer/higher_order_deprecated.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | proc fmap*[T, U](t: Tensor[T], f: T -> U): Tensor[U] 17 | {.deprecated, inline.}= 18 | ## DEPRECATED 19 | ## 20 | ## Replace by map2 21 | t.map(f) 22 | 23 | proc fmap2*[T, U, V](t1: Tensor[T], t2: Tensor[U], f: (T,U) -> V): Tensor[V] 24 | {.deprecated, inline.}= 25 | ## DEPRECATED 26 | ## 27 | ## Replaced by map2 28 | ## 29 | ## Note the new argument order of map2 to accomodate for 30 | ## t1.map2(`op`, t2) where op is an infix operator. 31 | t1.map2(f, t2) 32 | 33 | 34 | # # Compute aggregate/reduction/folds over tensors 35 | 36 | # ### Elementwise generic aggregate functions 37 | # Note: You can't pass builtins like `+` or `+=` due to Nim limitations 38 | # https://github.com/nim-lang/Nim/issues/2172 39 | 40 | proc agg*[T: SomeNumber](t: Tensor[T], 41 | f:(T, T)-> T, 42 | start_val: T 43 | ): T 44 | {.noSideEffect, inline, deprecated.}= 45 | ## DEPRECATED, use fold instead. 46 | ## 47 | ## Note: order between function f and start_val has changed 48 | ## 49 | ## Compute the aggregate 50 | ## Input: 51 | ## - A tensor to aggregate on 52 | ## - The aggregation function. It is applied this way: new_aggregate = f(old_aggregate, current_value) 53 | ## - The starting value 54 | ## - The axis 55 | t.fold(start_val, f) 56 | 57 | proc agg_inplace*[T: SomeNumber]( 58 | accum_val: var T, 59 | f: proc(x:var T, y:T), # We can't use the nice future syntax here for unknown reason 60 | t: Tensor[T], 61 | ) 62 | {.noSideEffect, inline, deprecated.}= 63 | ## DEPRECATED, use fold instead. 64 | ## 65 | ## You will have to switch to a non-inplace function. 66 | ## 67 | ## Compute the aggregate 68 | ## Input: 69 | ## - The accumulating value which will be modified in-place 70 | ## - The aggregation in-place function. It is applied this way: f(var old_aggregate, current_value) 71 | ## - A tensor to aggregate from 72 | ## - The axis 73 | for val in t: 74 | f(accum_val, val) 75 | 76 | 77 | # ### Axis generic functions 78 | # `+`, `+=` for tensors are not "built-ins" 79 | 80 | proc agg*[T: SomeNumber](t: Tensor[T], 81 | f:(Tensor[T], Tensor[T])-> Tensor[T], 82 | start_val: Tensor[T], 83 | axis: int 84 | ): Tensor[T] 85 | {.noSideEffect, inline, deprecated.}= 86 | ## DEPRECATED, use fold instead. 87 | ## 88 | ## Note: order between function f and start_val has changed 89 | ## 90 | ## Input: 91 | ## - A tensor to aggregate on 92 | ## - The aggregation function. It is applied this way: new_aggregate = f(old_aggregate, current_value) 93 | ## - The starting value 94 | ## - The axis 95 | 96 | t.fold(start_val, f, axis) 97 | 98 | proc agg_inplace*[T: SomeNumber]( 99 | accum_val: var Tensor[T], 100 | f: proc(x:var Tensor[T], y:Tensor[T]), # We can't use the nice future syntax here for unknown reason 101 | t: Tensor[T], 102 | axis: int 103 | ) 104 | {.noSideEffect, inline, deprecated.}= 105 | ## DEPRECATED, use fold instead. 106 | ## 107 | ## You will have to switch to a non-inplace function. 108 | ## 109 | ## Input: 110 | ## - The accumulating value which will be modified in-place 111 | ## - A tensor to aggregate from 112 | ## - The aggregation in-place function. It is applied this way: f(var old_aggregate, current_value) 113 | ## - The axis 114 | 115 | for val in t.axis(axis): 116 | f(accum_val, val) 117 | -------------------------------------------------------------------------------- /src/arraymancer/elementwise_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Collection of cuda basic element-wise operations 17 | # to be use by higher-order functions. 18 | # The end-goal is to have a macro/template that can auto-generate these from: 19 | # 20 | # elementwise: 21 | # C = (A + B*sin(D))/exp(-X) 22 | # 23 | # __ldg is a cuda intrinsics to load read-only data 24 | # from a special cache 25 | 26 | # Assignment op 27 | # Does element-wise A[i] `op=` B[i] 28 | template cuda_assign_op(op_name, op_symbol: string)= 29 | {.emit: [""" 30 | template 31 | struct """,op_name,"""{ 32 | __device__ __forceinline__ void operator()( 33 | T * __restrict__ dst, 34 | const T * __restrict__ src){ 35 | *dst """,op_symbol,""" __ldg(src); 36 | } 37 | }; 38 | """].} 39 | 40 | # Assignment with scalars 41 | template cuda_assignscal_op(op_name, op_symbol: string)= 42 | {.emit: [""" 43 | template 44 | struct """,op_name,"""{ 45 | __device__ __forceinline__ void operator()( 46 | T * __restrict__ dst, 47 | const T * __restrict__ scal){ 48 | *dst """,op_symbol,""" scal; 49 | } 50 | }; 51 | """].} 52 | 53 | # Binary op 54 | # Does C[i] = A[i] `op` B[i] 55 | template cuda_binary_op(op_name, op_symbol: string)= 56 | {.emit:[""" 57 | template 58 | struct """,op_name,"""{ 59 | __device__ __forceinline__ void operator()( 60 | T * __restrict__ dst, 61 | const T * __restrict__ A, 62 | const T * __restrict__ B){ 63 | *dst = __ldg(A)""", op_symbol, """ __ldg(B); 64 | } 65 | }; 66 | """].} 67 | 68 | # Binary op with scalar on the left 69 | # Does C[i] = a `op` B[i] 70 | template cuda_lscal_op(op_name, op_symbol: string)= 71 | {.emit:[""" 72 | template 73 | struct """,op_name,"""{ 74 | __device__ __forceinline__ void operator()( 75 | T * __restrict__ dst, 76 | const T alpha, 77 | const T * __restrict__ B){ 78 | *dst = alpha""", op_symbol, """ __ldg(B); 79 | } 80 | }; 81 | """].} 82 | 83 | # Binary op with scalar on the right 84 | # Does C[i] = A[i] `op` beta 85 | template cuda_rscal_op(op_name, op_symbol: string)= 86 | {.emit:[""" 87 | template 88 | struct """,op_name,"""{ 89 | __device__ __forceinline__ void operator()( 90 | T * __restrict__ dst, 91 | const T * __restrict__ A, 92 | const T beta){ 93 | *dst = __ldg(A)""", op_symbol, """ beta; 94 | } 95 | }; 96 | """].} 97 | 98 | # Unary op 99 | # Does C[i] = op(A[i]) 100 | template cuda_unary_op(op_name, op_symbol: string)= 101 | {.emit:[""" 102 | template 103 | struct """,op_name,"""{ 104 | __device__ __forceinline__ void operator()( 105 | T * __restrict__ dst, 106 | const T * __restrict__ src){ 107 | *dst = """, op_symbol, """(__ldg(src)); 108 | } 109 | }; 110 | """].} 111 | 112 | cuda_assign_op("CopyOp", "=") 113 | cuda_assign_op("InPlaceAddOp", "+=") 114 | cuda_assign_op("InPlaceSubOp", "-=") 115 | cuda_assign_op("InPlaceMulOp", "*=") 116 | cuda_assign_op("InPlaceDivOp", "/=") 117 | 118 | cuda_assignscal_op("CopyScalOp", "=") 119 | cuda_assignscal_op("InPscalAddOp", "+=") 120 | cuda_assignscal_op("InPscalSubOp", "-=") 121 | cuda_assignscal_op("InPscalMulOp", "*=") 122 | cuda_assignscal_op("InPscalDivOp", "/=") 123 | 124 | cuda_binary_op("AddOp", "+") 125 | cuda_binary_op("SubOp", "-") 126 | cuda_binary_op("MulOp", "*") 127 | cuda_binary_op("DivOp", "/") 128 | 129 | cuda_lscal_op("LscalMul","*") 130 | cuda_lscal_op("LscalDiv","/") 131 | cuda_lscal_op("LscalSub","-") 132 | 133 | cuda_rscal_op("RscalDiv","/") 134 | cuda_rscal_op("RscalSub","-") 135 | cuda_rscal_op("RscalAdd","+") 136 | 137 | cuda_unary_op("NegOp","-") 138 | cuda_unary_op("ExpOp","exp") 139 | cuda_unary_op("SinOp","sin") 140 | cuda_unary_op("CosOp","cos") 141 | cuda_unary_op("TanOp","tan") 142 | cuda_unary_op("TanhOp","tanh") 143 | -------------------------------------------------------------------------------- /src/arraymancer/shapeshifting_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc unsafeTranspose*(t: CudaTensor): CudaTensor {.noSideEffect.}= 16 | ## Transpose a Tensor. 17 | ## 18 | ## For N-d Tensor with shape (0, 1, 2 ... n-1) the resulting tensor will have shape (n-1, ... 2, 1, 0) 19 | ## 20 | ## Warning ⚠ CudaTensor temporary default: 21 | ## This is a no-copy operation, data is shared with the input. 22 | ## This proc does not guarantee that a ``let`` value is immutable. 23 | 24 | result.shape = t.shape.reversed 25 | result.strides = t.strides.reversed 26 | result.offset = t.offset 27 | result.data = t.data 28 | 29 | proc cuda_unsafeContiguous = discard # This is a hack so that the symbol is open 30 | cuda_assign_glue(cuda_unsafeContiguous, "CopyOp") 31 | 32 | proc unsafeContiguous*[T: SomeReal](t: CudaTensor[T], layout: OrderType = colMajor, force: bool = false): 33 | CudaTensor[T] {.noSideEffect.}= 34 | ## Transform a tensor with general striding to a Tensor with contiguous layout. 35 | ## 36 | ## By default CudaTensor will be colMajor (contrary to a cpu tensor). 37 | ## 38 | ## By default nothing is done if the tensor is already contiguous (C Major or F major) 39 | ## The "force" parameter can force re-ordering to a specific layout 40 | ## 41 | ## Warning ⚠ CudaTensor temporary default: 42 | ## If the CudaTensor is contiguous, this is a no-copy operation, data is shared with the input. 43 | ## This proc does not guarantee that a ``let`` value is immutable. 44 | 45 | if t.isContiguous and not force: 46 | return t 47 | elif t.is_F_contiguous and layout == colMajor: 48 | return t 49 | elif t.is_C_contiguous and layout == rowMajor: 50 | return t 51 | 52 | result = newCudaTensor[T](t.shape, layout) 53 | 54 | cuda_assign_call(cuda_unsafeContiguous, result, t) 55 | 56 | 57 | proc unsafeReshape*(t: CudaTensor, new_shape: varargs[int]): CudaTensor = 58 | ## Reshape a CudaTensor without copy. 59 | ## 60 | ## ⚠ Reshaping without copy is only possible on contiguous Tensors 61 | ## 62 | ## Warning ⚠: 63 | ## This is a no-copy operation, data is shared with the input. 64 | ## This proc does not guarantee that a ``let`` value is immutable. 65 | 66 | t.reshape_no_copy(new_shape) 67 | result.data = t.data 68 | 69 | proc unsafeBroadcast*(t: CudaTensor, shape: varargs[int]): CudaTensor {.noSideEffect.}= 70 | ## Explicitly broadcast a CudaTensor to the specified shape. 71 | ## The returned broadcasted CudaTensor share the underlying data with the input. 72 | ## 73 | ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating 74 | ## values along that dimension. 75 | ## 76 | ## Warning ⚠: 77 | ## This is a no-copy operation, data is shared with the input. 78 | ## This proc does not guarantee that a ``let`` value is immutable. 79 | ## A broadcasted tensor should not be modified and only used for computation. 80 | result = t 81 | result.broadcastT(shape) 82 | 83 | proc unsafeSqueeze*(t: CudaTensor, axis: int): CudaTensor {.noSideEffect.}= 84 | ## Collapse the given axis, if the dimension is not 1; it does nothing 85 | ## Input: 86 | ## - a CudaTensor 87 | ## - an axis (dimension) 88 | ## Returns: 89 | ## - a CudaTensor with singleton dimensions collapsed 90 | ## Warning ⚠: 91 | ## This is a no-copy operation, data is shared with the input. 92 | ## This proc does not guarantee that a ``let`` value is immutable. 93 | result = t 94 | result.squeezeT(axis) 95 | 96 | proc unsafeUnsqueeze*(t: CudaTensor, axis: int): CudaTensor {.noSideEffect.}= 97 | ## Insert a new axis just before the given axis, increasing the CudaTensor 98 | ## dimension (rank) by 1 99 | ## - a tensor with that new axis 100 | ## Warning ⚠: 101 | ## This is a no-copy operation, data is shared with the input. 102 | ## This proc does not guarantee that a ``let`` value is immutable. 103 | result = t 104 | result.unsqueezeT(axis) -------------------------------------------------------------------------------- /src/arraymancer/backend/cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Data structures to ease interfacing with Cuda and kernels 16 | 17 | proc cudaMalloc[T](size: int): ptr T {.noSideEffect, inline.}= 18 | ## Internal proc. 19 | ## Wrap CudaMAlloc(var pointer, size) -> Error_code 20 | let s = size * sizeof(T) 21 | check cudaMalloc(cast[ptr pointer](addr result), s) 22 | 23 | proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}= 24 | if not p[].isNil: 25 | check cudaFree(p[]) 26 | 27 | 28 | # ############################################################## 29 | # # Base CudaSeq type 30 | # # End goal is for it to have value semantics like Nim seq 31 | 32 | proc newCudaSeq[T: SomeReal](length: int): CudaSeq[T] {.noSideEffect.}= 33 | result.len = length 34 | new(result.data, deallocCuda) 35 | result.data[] = cast[ptr UncheckedArray[T]](cudaMalloc[T](result.len)) 36 | 37 | # ######################################################### 38 | # # Sending tensor layout to Cuda Kernel 39 | 40 | # # So that layout->strides can be used in Cuda kernel, it's easier if everything is declared from cpp 41 | # # pending https://github.com/nim-lang/Nim/issues/6415 42 | # 43 | # template create_CudaTensorLayout(N: static[int]) = 44 | # ## This Layout in C++ will be overriden by a CudaMemCpy from the Nim data structure 45 | # {. emit:[ """ 46 | # 47 | # template 48 | # struct CudaTensorLayout { 49 | # int rank; 50 | # int shape[""", N,"""]; 51 | # int strides[""", N,"""]; 52 | # int offset; 53 | # T * __restrict__ data; 54 | # }; 55 | # 56 | # 57 | # """].} 58 | # 59 | # create_CudaTensorLayout(MAXRANK) 60 | 61 | type 62 | # CudaLayoutArray = array[MAXRANK, cint] 63 | # This will replace the current ref[ptr T] for shape and strides in the future 64 | ## Using arrays instead of seq avoids having to indicate __restrict__ everywhere to indicate no-aliasing 65 | ## We also prefer stack allocated array sice the data will be used at every single loop iteration to compute elements position. 66 | ## Ultimately it avoids worrying about deallocation too 67 | CudaLayoutArray = ref[ptr cint] 68 | 69 | 70 | CudaTensorLayout [T: SomeReal] = object 71 | ## Mimicks CudaTensor 72 | ## This will be stored on GPU in the end 73 | ## Goal is to avoids clumbering proc with cudaMemcpyshape, strides, offset, data, rank, len 74 | ## 75 | ## Check https://github.com/mratsim/Arraymancer/issues/26 (Optimizing Host <-> Cuda transfer) 76 | ## on why I don't (yet?) use Unified Memory and choose to manage it manually. 77 | 78 | rank: cint # Number of dimension of the tensor 79 | shape: CudaLayoutArray 80 | strides: CudaLayoutArray 81 | offset: cint 82 | data: ptr T # Data on Cuda device 83 | len: cint # Number of elements allocated in memory 84 | 85 | proc layoutOnDevice*[T:SomeReal](t: CudaTensor[T]): CudaTensorLayout[T] {.noSideEffect.}= 86 | ## Store a CudaTensor shape, strides, etc information on the GPU 87 | # 88 | # TODO: instead of storing pointers to shape/stride/etc that are passed to each kernel 89 | # pass the layout object directly and call it with layout->shape, layout->rank 90 | 91 | result.rank = t.rank.cint 92 | 93 | result.offset = t.offset.cint 94 | result.data = t.get_data_ptr 95 | result.len = t.size.cint 96 | 97 | new result.shape, deallocCuda 98 | new result.strides, deallocCuda 99 | 100 | result.shape[] = cudaMalloc[cint](MAXRANK) 101 | result.strides[] = cudaMalloc[cint](MAXRANK) 102 | 103 | var 104 | tmp_shape: array[MAXRANK, cint] # CudaLayoutArray 105 | tmp_strides: array[MAXRANK, cint] # CudaLayoutArray 106 | 107 | for i in 0.. Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss 28 | 29 | # Let's go 30 | 31 | # First create a context that will store backpropagation information 32 | let ctx = newContext Tensor[float32] 33 | 34 | # We will pass batches of 32 samples 35 | let bsz = 32 #batch size 36 | 37 | # We will create a tensor of size 3200 --> 100 batch sizes of 32 38 | # We create it as int between [0, 2[ (2 excluded) and convert to bool 39 | let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination 40 | 41 | # Let's check the first 32 42 | echo x_train_bool[0..<32, _] 43 | # Tensor of shape 32x2 of type "bool" on backend "Cpu" 44 | # |true false| 45 | # |true true| 46 | # |false false| 47 | # |false true| 48 | # |false false| 49 | # |false false| 50 | # |false false| 51 | # ... 52 | 53 | # Let's build or truth labels. We need to apply xor between the 2 columns of the tensors 54 | 55 | proc xor_alt[T](x,y: T): T = 56 | ## xor is builtin and cannot be passed to map as is 57 | x xor y 58 | 59 | # We map or new xor function to matching elements of the subtensors 60 | let y_bool = map2(x_train_bool[_,0], xor_alt, x_train_bool[_,1]) 61 | echo y_bool[0..<32, _] 62 | # Tensor of shape 32x1 of type "bool" on backend "Cpu" 63 | # true| 64 | # false| 65 | # false| 66 | # true| 67 | # false| 68 | # false| 69 | # false| 70 | # true| 71 | # false| 72 | # ... 73 | 74 | # Convert to float, 75 | # Important: To improve perf, Arraymancer expects batch size to be last 76 | # so we transpose 77 | let x_train = ctx.variable(x_train_bool.astype(float32).transpose) 78 | let y = y_bool.astype(float32).transpose 79 | 80 | # Now we create layer of neurons W that we will train to reproduce the xor function. 81 | # Weights are of this shape: [W: out_features, in_features] 82 | 83 | # First hidden layer of 3 neurons, with 2 features in 84 | # We initialize with random weights between -1 and 1 85 | let layer_3neurons = ctx.variable( 86 | randomTensor(3, 2, 2.0f) .- 1.0f 87 | ) 88 | 89 | # Classifier layer with 1 neuron per feature. (In our case only one neuron overall) 90 | # We initialize with random weights between -1 and 1 91 | let classifier_layer = ctx.variable( 92 | randomTensor(1, 3, 2.0f) .- 1.0f 93 | ) 94 | # We use Stochastic Gradient Descent as optimizer 95 | # With gradient descent the weigth are updated as follows: 96 | # W -= learning_rate * dW 97 | let optim = newSGD[float32]( 98 | layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate 99 | ) 100 | 101 | # Now let's setup the training loops. 102 | # First loop is passing the mini-batch, bacpropagating, updating the gradients. 103 | # We do it until the whole x_train tensor has been passed through. 104 | # This is one "epoch". 105 | 106 | # Usually after each epoch we "validate" with a test set that the network was never trained on 107 | # how the network generalized. In this example we won't go there to keep it short. 108 | 109 | # We will do 5 epochs, passing the 32*100 minibatches 110 | for epoch in 0..5: 111 | 112 | for batch_id in 0..<100: 113 | 114 | # offset in the Tensor (Remember, batch size is last) 115 | let offset = batch_id * 32 116 | let x = x_train[_, offset ..< offset + 32] 117 | let target = y[_, offset ..< offset + 32] 118 | 119 | # Building the network 120 | let n1 = linear(x, layer_3neurons) 121 | let n1_act = n1.relu 122 | let n2 = linear(n1_act, classifier_layer) 123 | let loss = sigmoid_cross_entropy(n2, target) 124 | 125 | echo "Epoch is:" & $epoch 126 | echo "Batch id:" & $batch_id 127 | 128 | echo "Loss is:" & $loss.value.data[0] 129 | 130 | # Compute the gradient (i.e. contribution of each parameter to the loss) 131 | loss.backprop() 132 | 133 | # Correct the weights now that we have the gradient information 134 | optim.update() -------------------------------------------------------------------------------- /arraymancer.nimble: -------------------------------------------------------------------------------- 1 | ### Package 2 | version = "0.2.0" 3 | author = "Mamy André-Ratsimbazafy" 4 | description = "A n-dimensional tensor (ndarray) library" 5 | license = "Apache License 2.0" 6 | 7 | ### Dependencies 8 | requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4" 9 | 10 | ## Install files 11 | srcDir = "src" 12 | 13 | ######################################################## 14 | # External libs configuration 15 | 16 | ### BLAS support 17 | ## OSX 18 | # switch("define","openblas") 19 | # switch("clibdir", "/usr/local/opt/openblas/lib") 20 | # switch("cincludes", "/usr/local/opt/openblas/include") 21 | 22 | ### BLIS support 23 | # switch("define","blis") 24 | 25 | ### MKL support 26 | # Check the mkl switches in the test file for single-threaded and openp version 27 | 28 | ### Cuda configuration 29 | ## Pass -d:cuda to build arraymancer with cuda support 30 | ## Use the cuda switches below 31 | ## Replace /opt/cuda by your own path 32 | ## TODO: auto detection or at least check in common directories 33 | ## Note: It is import to gate compiler flags like -march=native behind Xcompiler "-Xcompiler -march=native" 34 | 35 | template cudaSwitches() = 36 | switch("cincludes", "/opt/cuda/include") 37 | switch("cc", "gcc") # We trick Nim about nvcc being gcc, pending https://github.com/nim-lang/Nim/issues/6372 38 | switch("gcc.exe", "/opt/cuda/bin/nvcc") 39 | switch("gcc.linkerexe", "/opt/cuda/bin/nvcc") 40 | switch("gcc.cpp.exe", "/opt/cuda/bin/nvcc") 41 | switch("gcc.cpp.linkerexe", "/opt/cuda/bin/nvcc") 42 | # Due to the __ldg intrinsics in kernels 43 | # we only support compute capabilities 3.5+ 44 | # See here: http://docs.nvidia.com/cuda/pascal-compatibility-guide/index.html 45 | # And wikipedia for GPU capabilities: https://en.wikipedia.org/wiki/CUDA 46 | switch("gcc.options.always", "-arch=sm_61 --x cu") # Interpret .c files as .cu 47 | switch("gcc.cpp.options.always", "-arch=sm_61 --x cu -Xcompiler -fpermissive") # Interpret .c files as .cu, gate fpermissive behind Xcompiler 48 | 49 | when defined(cuda): 50 | cudaSwitches 51 | 52 | ######################################################## 53 | # Optimization 54 | 55 | # Multithreading 56 | # use the -d:openmp switch 57 | # which passC: -fopenmp to the compiler 58 | 59 | # Native processor optimization 60 | # use the -d:native 61 | # which passC: -march=native to the compiler 62 | 63 | 64 | ########################################################################## 65 | ## Testing tasks 66 | 67 | proc test(name: string, lang: string = "c") = 68 | if not dirExists "bin": 69 | mkDir "bin" 70 | if not dirExists "nimcache": 71 | mkDir "nimcache" 72 | --run 73 | --nimcache: "nimcache" 74 | switch("out", ("./bin/" & name)) 75 | setCommand lang, "tests/" & name & ".nim" 76 | 77 | task test, "Run all tests - Default BLAS": 78 | test "all_tests" 79 | 80 | task test_cuda, "Run all tests - Cuda backend with CUBLAS": 81 | switch("define","cuda") 82 | cudaSwitches # Unfortunately the "switch" line doesn't also trigger 83 | # the "when defined(cuda)" part of this nimble file 84 | # hence the need to call cudaSwitches explicitly 85 | test "all_tests_cuda", "cpp" 86 | 87 | task test_deprecated, "Run all tests on deprecated static[Backend] procs": 88 | test "all_tests_deprecated" 89 | 90 | task test_openblas, "Run all tests - OpenBLAS": 91 | ## Should work but somehow Nim doesn't find libopenblas.dylib on MacOS 92 | when defined(macosx): 93 | switch("define","blas=openblas") 94 | switch("clibdir", "/usr/local/opt/openblas/lib") 95 | switch("cincludes", "/usr/local/opt/openblas/include") 96 | test "all_tests" 97 | 98 | task test_blis, "Run all tests - BLIS": 99 | switch("define","blis") 100 | test "all_tests" 101 | 102 | task test_native, "Run all tests - march=native": 103 | switch("define","native") 104 | test "all_tests" 105 | 106 | task test_openmp, "Run all tests - OpenMP": 107 | switch("define","openmp") 108 | test "all_tests" 109 | 110 | task test_mkl, "Run all tests - Intel MKL - single threaded": 111 | switch("define","blas=mkl_intel_lp64") 112 | switch("clibdir", "/opt/intel/mkl/lib/intel64") 113 | switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a") 114 | switch("passl", "-lmkl_core") 115 | switch("passl", "-lmkl_sequential") 116 | switch("dynlibOverride","mkl_intel_lp64") 117 | test "all_tests" 118 | 119 | task test_mkl_omp, "Run all tests - Intel MKL + OpenMP": 120 | switch("define","openmp") 121 | switch("define","blas=mkl_intel_lp64") 122 | switch("clibdir", "/opt/intel/mkl/lib/intel64") 123 | switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a") 124 | switch("passl", "-lmkl_core") 125 | switch("passl", "-lmkl_gnu_thread") 126 | switch("passl", "-lgomp") 127 | switch("dynlibOverride","mkl_intel_lp64") 128 | test "all_tests" 129 | 130 | task test_release, "Run all tests - Release mode": 131 | switch("define","release") 132 | test "all_tests" 133 | 134 | task gen_doc, "Generate Arraymancer documentation": 135 | switch("define", "doc") 136 | exec "nim doc2 src/arraymancer" 137 | -------------------------------------------------------------------------------- /src/arraymancer/fallback/blas_l3_gemm.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # The following code is heavily inspired by ulmBLAS (http://apfel.mathematik.uni-ulm.de/~lehn/ulmBLAS/) 16 | # which is heavily inspired by BLIS (https://github.com/flame/blis) 17 | # A big difference (for now?) is instead of passing (const) pointers I pass the (var) array and a var offset. 18 | 19 | # # Reading 20 | # C++ version: https://stackoverflow.com/questions/35620853/how-to-write-a-matrix-matrix-product-that-can-compete-with-eigen 21 | # uBLAS C++: http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session1/page01.html 22 | # Blaze C++: http://www.mathematik.uni-ulm.de/~lehn/test_blaze/session1/page01.html 23 | # Rust BLIS inspired: https://github.com/bluss/matrixmultiply 24 | 25 | # ### TODO: 26 | # - OpenMP parallelization 27 | # {.passl: "-fopenmp".} # Issue: Clang OSX does not support openmp 28 | # {.passc: "-fopenmp".} # and the default GCC is actually a link to Clang 29 | 30 | # - Loop unrolling # Currently Nim `unroll` pragma exists but is ignored. 31 | # - Pass `-march=native` to the compiler 32 | # - Align memory # should be automatic 33 | # - Is there a way to get L1/L2 cache size at compile-time 34 | # - Is there a way to get number of registers at compile-time 35 | 36 | # Best numbers depend on 37 | # L1, L2, L3 cache and register size 38 | 39 | # L1 cache: 32 KB data + 32 KB instructions since Nehalem (per proc) 40 | # L2 cache: 256KB since Nehalem 41 | # X86-64 Register size: 16 registers 128-bit (16 Bytes) wide (SSE2), 256-bit with AVX 42 | # Loading int in AVX registers needs AVX2 support in CPU. 43 | # Everything must be aligned in memory for faster loading in registers. 44 | 45 | # Int/float64 takes 4B 46 | # float32 takes 2B 47 | # --> use "when" to parametrize size at compile-time? 48 | 49 | const MC = 96 50 | const KC = 256 51 | const NC = 4096 52 | 53 | # The following should be bigger (4x8) but somehow it hurts my performance 54 | # It might be because the compiler is not using the large AVX registers by default. 55 | const MR = 2 56 | const NR = 2 57 | 58 | # Panels of B of size KC * NR resides in L1 cache 59 | const MCKC = MC*KC # A resides in L2 cache 60 | const KCNC = KC*NC # B resides in L3 cache 61 | const MRNR = MR*NR # Work area: Fit in registers 62 | 63 | 64 | include ./blas_l3_gemm_packing 65 | include ./blas_l3_gemm_aux 66 | include ./blas_l3_gemm_micro_kernel 67 | include ./blas_l3_gemm_macro_kernel 68 | 69 | proc newBufferArray[T: SomeNumber](N: static[int], typ: typedesc[T]): ref array[N, T] {.noSideEffect.} = 70 | new result 71 | for i in 0 ..< N: 72 | result[i] = 0.T 73 | 74 | proc gemm_nn_fallback[T](m, n, k: int, 75 | alpha: T, 76 | A: seq[T], offA: int, 77 | incRowA, incColA: int, 78 | B: seq[T], offB: int, 79 | incRowB, incColB: int, 80 | beta: T, 81 | C: var seq[T], offC: int, 82 | incRowC, incColc: int) {.noSideEffect.} = 83 | 84 | let 85 | mb = (m + MC - 1) div MC 86 | nb = (n + NC - 1) div NC 87 | kb = (k + KC - 1) div KC 88 | 89 | mod_mc = m mod MC 90 | mod_nc = n mod NC 91 | mod_kc = k mod KC 92 | 93 | var mc, nc, kc: int 94 | var tmp_beta: T 95 | 96 | {.pragma: align16, codegenDecl: "$# $# __attribute__((aligned(16)))".} 97 | var buffer_A{.align16.} = newBufferArray(MCKC, T) 98 | var buffer_B{.align16.} = newBufferArray(KCNC, T) 99 | var buffer_C{.align16.} = newBufferArray(MRNR, T) 100 | 101 | if alpha == 0.T or k == 0: 102 | gescal(m, n, beta, C, offC, incRowC, incColC) 103 | return 104 | 105 | for j in 0 ..< nb: 106 | nc = if (j != nb-1 or mod_nc == 0): NC 107 | else: mod_nc 108 | 109 | for k in 0 ..< kb: 110 | kc = if (k != kb-1 or mod_kc == 0): KC 111 | else: mod_kc 112 | tmp_beta = if k == 0: beta 113 | else: 1.T 114 | 115 | pack_dim( nc, kc, 116 | B, k*KC*incRowB + j*NC*incColB + offB, 117 | incColB, incRowB, NR, 118 | buffer_B) 119 | 120 | for i in 0 ..< mb: 121 | mc = if (i != mb-1 or mod_mc == 0): MC 122 | else: mod_mc 123 | 124 | pack_dim( mc, kc, 125 | A, i*MC*incRowA + k*KC*incColA + offA, 126 | incRowA, incColA, MR, 127 | buffer_A) 128 | 129 | gemm_macro_kernel(mc, nc, kc, 130 | alpha, tmp_beta, 131 | C, i*MC*incRowC + j*NC*incColC + offC, 132 | incRowC, incColC, buffer_A, buffer_B, buffer_C) -------------------------------------------------------------------------------- /src/arraymancer/higher_order_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # Note: Maximum number of threads per block is 17 | # 1024 on Pascal GPU, i.e. 32 warps of 32 threads 18 | 19 | 20 | # Important CUDA optimization 21 | # To loop over each element of an array with arbitrary length 22 | # use grid-strides for loop: https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/ 23 | # 24 | # Avoid branching in the same warp (32 threads), otherwise it reverts to serial execution. 25 | # "idx < length" can be converted to "idx = max( idx, 0); idx = min( idx, length);" 26 | # for example. (Beware of aliasing) 27 | 28 | # TODO, use an on-device struct to store, shape, strides, offset 29 | # And pass arguments via a struct pointer to limite register pressure 30 | 31 | {.emit:[""" 32 | template 33 | __global__ void cuda_apply2(const int rank, 34 | const int len, 35 | const int * __restrict__ dst_shape, 36 | const int * __restrict__ dst_strides, 37 | const int dst_offset, 38 | T * __restrict__ dst_data, 39 | Op f, 40 | const int * __restrict__ src_shape, 41 | const int * __restrict__ src_strides, 42 | const int src_offset, 43 | const T * __restrict__ src_data){ 44 | 45 | for (int elemID = blockIdx.x * blockDim.x + threadIdx.x; 46 | elemID < len; 47 | elemID += blockDim.x * gridDim.x) { 48 | 49 | // ## we can't instantiate the variable outside the loop 50 | // ## each threads will store its own in parallel 51 | const int dst_real_idx = cuda_getIndexOfElementID( 52 | rank, 53 | dst_shape, 54 | dst_strides, 55 | dst_offset, 56 | elemID); 57 | 58 | const int src_real_idx = cuda_getIndexOfElementID( 59 | rank, 60 | src_shape, 61 | src_strides, 62 | src_offset, 63 | elemID); 64 | 65 | f(&dst_data[dst_real_idx], &src_data[src_real_idx]); 66 | } 67 | } 68 | """].} 69 | 70 | 71 | {.emit:[""" 72 | template 73 | __global__ void cuda_apply3(const int rank, 74 | const int len, 75 | const int * __restrict__ dst_shape, 76 | const int * __restrict__ dst_strides, 77 | const int dst_offset, 78 | T * __restrict__ dst_data, 79 | const int * __restrict__ A_shape, 80 | const int * __restrict__ A_strides, 81 | const int A_offset, 82 | const T * __restrict__ A_data, 83 | Op f, 84 | const int * __restrict__ B_shape, 85 | const int * __restrict__ B_strides, 86 | const int B_offset, 87 | const T * __restrict__ B_data){ 88 | 89 | for (int elemID = blockIdx.x * blockDim.x + threadIdx.x; 90 | elemID < len; 91 | elemID += blockDim.x * gridDim.x) { 92 | 93 | // ## we can't instantiate the variable outside the loop 94 | // ## each threads will store its own in parallel 95 | const int dst_real_idx = cuda_getIndexOfElementID( 96 | rank, 97 | dst_shape, 98 | dst_strides, 99 | dst_offset, 100 | elemID); 101 | 102 | const int A_real_idx = cuda_getIndexOfElementID( 103 | rank, 104 | A_shape, 105 | A_strides, 106 | A_offset, 107 | elemID); 108 | 109 | const int B_real_idx = cuda_getIndexOfElementID( 110 | rank, 111 | B_shape, 112 | B_strides, 113 | B_offset, 114 | elemID); 115 | 116 | f(&dst_data[dst_real_idx], &A_data[A_real_idx], &B_data[B_real_idx]); 117 | } 118 | } 119 | """].} -------------------------------------------------------------------------------- /src/nn_primitives/sigmoid_cross_entropy_primitives.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import ../arraymancer 16 | import math 17 | 18 | # Sigmoid cross-entropy function that works directly on Tensors 19 | # and provide control without autograd 20 | 21 | proc check_input_target[T](input, target: Tensor[T]) {.inline.}= 22 | if input.shape != target.shape: 23 | raise newException(ValueError, "Input shape " & $input.shape & 24 | " and target shape " & $target.shape & " should be the same") 25 | 26 | proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T {.inline.} = 27 | ## Sigmoid function + Cross-Entropy loss fused in one layer. 28 | ## This leverage the log-sum-exp trick for improved numerical stability 29 | ## It is also faster than calling both separately 30 | ## 31 | ## Input: 32 | ## - A Tensor 33 | ## - The target values 34 | ## Returns: 35 | ## - Apply and sigmoid activation and returns the cross-entropy loss. 36 | ## Shape: 37 | ## - Both the cache and target shape should be @[features, batchsize] i.e. number of samples as last dimension 38 | 39 | 40 | # TODO: term rewriting macro for auto fusion 41 | 42 | when compileOption("boundChecks"): 43 | check_input_target(input, target) 44 | 45 | result = 0.T 46 | for xi, ti in zip(input, target): 47 | result += (-ti * xi + max(xi,0) + ln(1 + exp(-abs(xi))) ) / T(input.shape[1]) #input.shape[1] is the batch size 48 | 49 | 50 | proc sigmoid_cross_entropy_backward*[T]( 51 | gradient: Tensor[T] or T, 52 | cached_tensor: Tensor[T], 53 | target: Tensor[T] 54 | ): Tensor[T] {.inline.} = 55 | ## Derivatives of sigmoid_cross_entropy 56 | ## Input: 57 | ## - The input gradient as a scalar or a Tensor 58 | ## - A cache tensor that contains data from before the forward pass 59 | ## - The target values 60 | ## Shape: 61 | ## - Both the cache and target shape should be @[features, batchsize] i.e. number of samples as last dimension 62 | let batch_size = cached_tensor.shape[^1] 63 | 64 | # Deal with scalar and tensor gradient 65 | when gradient is T: 66 | let grad = gradient 67 | elif gradient is Tensor: 68 | let grad = gradient.data[gradient.offset] 69 | 70 | proc sigmoid_cross_entropy_backward_closure[T](xi, ti: T): T = 71 | grad * ( 1.T / (1.T + exp(-xi)) - ti) / T(batch_size) 72 | 73 | return map2(cached_tensor, sigmoid_cross_entropy_backward_closure, target) 74 | 75 | # ################################################ 76 | # Explanation of sigmoid cross-entropy algorithms: 77 | 78 | # ############ 79 | # Forward pass 80 | 81 | # Cross-entropy has the following form for a single sample 82 | # CEi(yi, yi') = − ( ti ln(yi) + (1−ti) ln(1−yi) ) 83 | 84 | # Since we pass a minibatch of several samples we should average by minibatch size (1/batchsize) 85 | # to keep the gradient magnitude/weight updates on the same scale as a single sample pass 86 | # CE(y, y') = − 1/n ∑i( ti ln(yi) + (1−ti) ln(1−yi) ) 87 | 88 | # yi = ln(sigmoid(xi)) = ln(1/(1+e^-xi)) = ln(e^xi/( 1 + e^xi )) 89 | # yi = x - ln(1 + e^xi) 90 | 91 | # 1 - yi = ln(1 - sigmoid(xi)) = ln(1 + e^xi - e^xi) / (1 + e^xi)) 92 | # 1 - yi = - ln(1 + e^xi) 93 | 94 | # Replacing Sigmoid Cross Entropy 95 | # SCE(x, y') = − 1/n ∑i(ti * (xi - ln(1 + e^xi)) + (1−ti) * -ln(1 + e^xi) ) 96 | # = − 1/n ∑i(ti * xi - ti * ln(1 + e^xi) -ln(1 + e^xi) + ti * ln(1 + e^xi) ) 97 | # = − 1/n ∑i(ti * xi - ln(1 + e^xi) ) 98 | # = − 1/n ∑i(ti * xi - ln(e^0 + e^xi) ) 99 | # 100 | # Using the logsumexp trick with factorize by a constant 101 | # c = max(xi, 0) 102 | # 103 | # SCE(x, y') = − 1/n ∑i(ti * xi - ln(e^c *( e^(0-c) + e^(xi-c)) 104 | # = − 1/n ∑i(ti * xi - ln(e^c *( e^(0-c) + e^(xi-c)) 105 | # = − 1/n ∑i(ti * xi - c - ln(e^-c + e^(xi-c)) 106 | # 107 | # If c = xi (xi > 0), ln(e^-c + e^(xi-c)) becomes ln(e^-xi + 1) 108 | # else c = 0 (xi < 0 ), ln(e^-c + e^(xi-c)) becomes ln(1 + e^xi) 109 | # Both cases are covered by ln(1 + e^-|xi|) 110 | # 111 | # Finally 112 | # SCE(x, y') = − 1/n ∑i(ti * xi - max(xi,0) - ln(1 + e^-|xi|) 113 | # 114 | # 115 | # 116 | # Other idea: streaming maximum (http://www.nowozin.net/sebastian/blog/streaming-log-sum-exp-computation.html) 117 | # 118 | 119 | # ############# 120 | # Backward pass 121 | 122 | # Derivative of Sigmoid-CE: 123 | # We start from this formula: SCE(x, y') = − 1/n ∑i(ti * xi - ln(1 + e^xi) ) 124 | # = 1/n ∑i(-ti * xi + ln(1 + e^xi) ) 125 | # 126 | # On a single sample: 127 | # dSCE/dxi = d/dxi (-ti * xi + ln(1 + e^xi)) 128 | # = -ti + e^xi * 1/(1 + e^xi)) 129 | # = -ti * sigmoid(xi) 130 | # 131 | # For a vector of samples 132 | # dSCE/dx = 1/n ∑i( sigmoid(xi) - ti ) 133 | -------------------------------------------------------------------------------- /src/arraymancer/init_cuda.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc unsafeView*[T](t: CudaTensor[T]): CudaTensor[T] {.inline,noSideEffect.}= 16 | ## Input: 17 | ## - A CudaTensor 18 | ## Returns: 19 | ## - A shallow copy. 20 | ## 21 | ## Warning ⚠ 22 | ## Both tensors shares the same memory. Data modification on one will be reflected on the other. 23 | ## However modifying the shape, strides or offset will not affect the other. 24 | 25 | # shape and strides fields have value semantics by default 26 | # CudaSeq has ref semantics 27 | system.`=`(result, t) 28 | 29 | proc clone*[T](t: CudaTensor[T]): CudaTensor[T] = 30 | ## Clone (deep copy) a CudaTensor. 31 | ## Copy will not share its data with the original. 32 | ## 33 | ## Tensor is copied as is. For example it will not be made contiguous. 34 | ## Use `unsafeContiguous` for this case 35 | 36 | # Note: due to modifying the defaultStream global var for async memcopy 37 | # proc cannot be tagged noSideEffect 38 | 39 | result.shape = t.shape 40 | result.strides = t.strides 41 | result.offset = t.offset 42 | result.data = newCudaSeq[T](t.data.len) 43 | let size = t.data.len * sizeof(T) 44 | 45 | check cudaMemCpyAsync(result.get_data_ptr, 46 | t.get_data_ptr, 47 | size, 48 | cudaMemcpyDeviceToDevice, 49 | defaultStream) # defaultStream is a cudaStream_t global var 50 | 51 | # ########################################################### 52 | # Implement value semantics for CudaTensor 53 | # Pending https://github.com/nim-lang/Nim/issues/6348 54 | # Tracked in https://github.com/mratsim/Arraymancer/issues/19 55 | # 56 | # proc `=`*[T](dest: var CudaTensor[T]; src: CudaTensor[T]) = 57 | # ## Overloading the assignment operator 58 | # ## It will have value semantics by default 59 | # dest.shape = src.shape 60 | # dest.strides = src.strides 61 | # dest.offset = src.offset 62 | # dest.data = newCudaSeq(src.data.len) 63 | # 64 | # let size = dest.size * sizeof(T) 65 | # 66 | # check cudaMemCpy(dest.get_data_ptr, 67 | # src.get_data_ptr, 68 | # size, 69 | # cudaMemcpyDeviceToDevice) 70 | # echo "Value copied" 71 | # 72 | # proc `=`*[T](dest: var CudaTensor[T]; src: CudaTensor[T]{call}) {.inline.}= 73 | # ## Overloading the assignment operator 74 | # ## Optimized version that knows that 75 | # ## the source CudaTensor is unique and thus don't need to be copied 76 | # system.`=`(result, t) 77 | # echo "Value moved" 78 | 79 | proc newCudaTensor[T: SomeReal](shape: varargs[int], layout: OrderType = colMajor): CudaTensor[T] {.noSideEffect.}= 80 | ## Internal proc 81 | ## Allocate a CudaTensor 82 | ## WARNING: The Cuda memory is not initialized to 0 83 | 84 | # TODO: default to RowMajor. Pending https://github.com/mratsim/Arraymancer/issues/22 85 | # As mentionned in design doc, an element-wise kernel will avoid relying on CuBLAS 86 | # for inplace operation that requires column major layout. 87 | 88 | result.shape = @shape 89 | result.strides = shape_to_strides(result.shape, layout) 90 | result.offset = 0 91 | result.data = newCudaSeq[T](result.size) 92 | 93 | proc cuda*[T:SomeReal](t: Tensor[T]): CudaTensor[T] = 94 | ## Convert a tensor on Cpu to a tensor on a Cuda device. 95 | # Note: due to modifying the defaultStream global var for async copy 96 | # proc cannot be tagged noSideEffect 97 | 98 | result = newCudaTensor[T](t.shape) 99 | 100 | # TODO: avoid reordering rowMajor tensors. This is only needed for inplace operation in CUBLAS. 101 | let contig_t = t.unsafeContiguous(colMajor, force = true) 102 | let size = result.size * sizeof(T) 103 | 104 | # For host to device we use non-blocking copy 105 | # Host can proceed with computation. 106 | # On CUDA device, next operations will be batch in the stream queue. 107 | check cudaMemCpyAsync(result.get_data_ptr, 108 | contig_t.get_data_ptr, 109 | size, 110 | cudaMemcpyHostToDevice, 111 | defaultStream) # defaultStream is a cudaStream_t global var 112 | 113 | proc cpu*[T:SomeReal](t: CudaTensor[T]): Tensor[T] {.noSideEffect.}= 114 | ## Convert a tensor on a Cuda device to a tensor on Cpu. 115 | # We use blocking copy in this case to make sure 116 | # all data is available for future computation 117 | 118 | result.shape = t.shape 119 | result.strides = t.strides 120 | result.offset = t.offset 121 | result.data = newSeqUninit[T](t.data.len) # We copy over all the memory allocated 122 | 123 | let size = t.data.len * sizeof(T) 124 | 125 | check cudaMemCpy(result.get_data_ptr, 126 | t.get_data_ptr, 127 | size, 128 | cudaMemcpyDeviceToHost) -------------------------------------------------------------------------------- /src/arraymancer/display.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Mamy André-Ratsimbazafy 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | proc bounds_display(t: Tensor, 16 | idx_data: tuple[val: string, idx: int] 17 | ): string {.noSideEffect.}= 18 | ## Internal routine, compare an index with the strides of a Tensor 19 | ## to check beginning and end of lines 20 | ## Add the delimiter "|" and line breaks at beginning and end of lines 21 | ## TODO: improve 3+D-tensors display 22 | let (val,idx) = idx_data 23 | let s = t.shape.reversed 24 | 25 | if val == "|": 26 | return " | " 27 | 28 | for i,j in s[0 .. ^2]: # We don't take the last element (the row in C convention) 29 | if idx mod j == 0: 30 | return "\t" & $val & "|\n" 31 | if idx mod j == 1: 32 | return "|" & $val 33 | return "\t" & $val 34 | 35 | # TODO: Create a generic n-dimensional display function using nested tables. 36 | # Example code in hTensor: https://github.com/albertoruiz/hTensor/blob/b36c3748b211c7f41c9af9d486c6ef320e2b7585/lib/Numeric/LinearAlgebra/Array/Display.hs#L92 37 | 38 | # Last dim always in column (except vector) 39 | # If rank is odd, first dim is along columns 40 | # if rank is even, first dim is along row 41 | 42 | # Expected for 2x3x3 43 | # 0 1 44 | # --------------------------------------- 45 | # 0,0 # 0,0,0 0,0,1 0,0,2 0,0,3 | 1,0 # 1,0,0 1,0,1 1,0,2 1,0,3 46 | # 0,1 # 0,1,0 0,1,1 0,1,2 0,1,3 | 1,1 # 1,1,0 1,1,1 1,1,2 1,1,3 47 | # 0,2 # 0,2,0 0,2,1 0,2,2 0,2,3 | 1,2 # 1,2,0 1,2,1 1,2,2 1,2,3 48 | # --------------------------------------- 49 | 50 | # Expected for 2x3x3x4 51 | # 1 2 3 4| 13 14 15 16 | 25 26 27 28 52 | # 5 6 7 8| 17 18 19 20 | 29 30 31 32 53 | # 9 10 11 12| 21 22 23 24 | 33 34 35 36 54 | # ---------------------------------------------- 55 | # 37 38 39 40| 49 59 51 52 | 61 62 63 64 56 | # 41 42 43 44| 53 54 55 56 | 65 66 67 68 57 | # 45 46 47 48| 57 58 59 60 | 69 70 71 72 58 | 59 | # Test with 60 | # let a = toSeq(1..24).toTensor(Cpu).reshape(2,3,4) 61 | # echo a 62 | # let b = toSeq(1..72).toTensor(Cpu).reshape(2,3,3,4) 63 | # echo b 64 | 65 | proc disp2d(t: Tensor): string {.noSideEffect.} = 66 | ## Display a 2D-tensor 67 | 68 | # Add a position index to each value in the Tensor. 69 | var indexed_data: seq[(string,int)] = @[] 70 | for i, value in t.enumerate: 71 | indexed_data.add(($value,i)) 72 | 73 | # Create a closure to apply the boundaries transformation for the specific input 74 | proc curry_bounds(tup: (string,int)): string {.noSideEffect.}= t.bounds_display(tup) 75 | 76 | return indexed_data.concatMap(curry_bounds) 77 | 78 | proc disp3d(t: Tensor): string = 79 | ## Display a 3D-tensor 80 | 81 | let sep: seq[string] = @["|"] 82 | let empty: seq[string] = @[] 83 | 84 | var buffer = empty.repeat(t.shape[1]).toTensor() 85 | 86 | for t0 in t.axis(0): 87 | buffer = buffer.concat( 88 | sep.repeat(t0.shape[1]).toTensor().reshape(t.shape[1],1), 89 | t0.map(x => $x).reshape(t.shape[1], t.shape[2]), 90 | axis = 1 91 | ) 92 | 93 | return buffer.disp2d 94 | 95 | proc disp4d(t: Tensor): string = 96 | ## Display a 4D-tensor 97 | 98 | let sep: seq[string] = @["|"] 99 | let sepv: seq[string] = @["-"] 100 | let empty: seq[string] = @[] 101 | 102 | # First create seq of tensor to concat horizontally 103 | var hbuffer = newSeqWith(t.shape[0], empty.repeat(t.shape[2]).toTensor()) 104 | 105 | var i = 0 106 | for s0 in t.axis(0): 107 | let s0r = s0.reshape(t.shape[1],t.shape[2],t.shape[3]) 108 | for s1 in s0r.axis(0): 109 | hbuffer[i] = hbuffer[i].concat( 110 | sep.repeat(t.shape[2]).toTensor().reshape(t.shape[2],1), 111 | s1.reshape(t.shape[2], t.shape[3]).map(x => $x), 112 | axis = 1 113 | ) 114 | inc i 115 | 116 | # Then concat vertically 117 | var vbuffer = empty.repeat(hbuffer[0].shape[1]).toTensor().reshape(0, hbuffer[0].shape[1]) 118 | 119 | for h in hbuffer: 120 | vbuffer = vbuffer.concat( 121 | sepv.repeat(hbuffer[0].shape[1]).toTensor().reshape(1, hbuffer[0].shape[1]), 122 | h.map(x => $x).reshape(hbuffer[0].shape[0], hbuffer[0].shape[1]), 123 | axis = 0 124 | ) 125 | return vbuffer.disp2d 126 | 127 | proc `$`*[T](t: Tensor[T]): string = 128 | ## Pretty-print a tensor (when using ``echo`` for example) 129 | let desc = "Tensor of shape " & t.shape.join("x") & " of type \"" & T.name & "\" on backend \"" & "Cpu" & "\"" 130 | if t.rank <= 2: 131 | return desc & "\n" & t.disp2d 132 | elif t.rank == 3: 133 | return desc & "\n" & t.disp3d 134 | elif t.rank == 4: 135 | return desc & "\n" & t.disp4d 136 | else: 137 | return desc & "\n" & " -- NotImplemented: Display not implemented for tensors of rank > 4" -------------------------------------------------------------------------------- /src/autograd/autograd.nim: -------------------------------------------------------------------------------- 1 | # Copyright 2017 the Arraymancer contributors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import typetraits 16 | 17 | const MAX_ARITY = 3 # Max arity/number of input of autograd operations 18 | 19 | type 20 | Gate*[TT] = ref object {.inheritable.} 21 | arity*: int 22 | # Base operator or layer 23 | # Inherit from it and add a forward and backward method. 24 | # Each operations should set its arity (number of input) 25 | # Additional fields like weights, cache for bprop should be added too. 26 | 27 | Node*[TT] = ref NodeObj[TT] 28 | Parents*[TT] = array[MAX_ARITY, Variable[TT]] 29 | SmallDiffs*[TT] = array[MAX_ARITY, TT] #TODO: how not to export that 30 | 31 | NodeObj[TT] = object 32 | # Store an operator/layer + its parent 33 | gate*: Gate[TT] #TODO: how not to export that 34 | parents*: Parents[TT] #TODO: how not to export that 35 | child*: Variable[TT] # Todo: avoid reference to child and add {.acyclic.} 36 | 37 | Context*[TT] = ref object 38 | ## Tape / Wengert list. Contains the list of applied operations or layers 39 | nodes: seq[Node[TT]] 40 | 41 | ## Considerations 42 | ## A variable can be used in 2 different computations, in that case both gate will point to it 43 | ## It can only have one ancestor 44 | 45 | Variable*[TT] = ref object 46 | ## Wrapper for values 47 | tape*: Context[TT] #TODO: how not to export that 48 | ancestor*: Node[TT] # Absence of ancestor will be represented by the nil value. TODO: Option type with no overhead: https://forum.nim-lang.org/t/3082 49 | value*: TT # TT should be a Tensor[T] or CudaTensor[T] or a scalar 50 | grad*: TT # gradient wrt to the last back propagation done 51 | # TODO make the grad initialization optional to optimize memory use 52 | 53 | 54 | # Somehow if you declare forward before backward, you get invalid declaration order 55 | # https://github.com/nim-lang/Nim/issues/5325 56 | method backward*[TT](self: Gate[TT], gradient: TT): SmallDiffs[TT] {.base, inline.} = 57 | raise newException(ValueError, "backward method is not implemented for " & $self.type.name) 58 | 59 | method forward*[TT](self: Gate[TT], a, b: Variable[TT]): Variable[TT] {.base, inline.} = 60 | # Binary forward 61 | raise newException(ValueError, "forward method is not implemented for " & $self.type.name) 62 | 63 | method forward*[TT](self: Gate[TT], a: Variable[TT]): Variable[TT] {.base, inline.}= 64 | # Unary forward 65 | raise newException(ValueError, "forward method is not implemented for " & $self.type.name) 66 | 67 | proc newContext*(TT: typedesc): Context[TT] {.inline, noSideEffect.} = 68 | ## Initialize a context (Tape / Wengert list) 69 | new result 70 | result.nodes = newSeq[Node[TT]]() 71 | 72 | proc variable*[TT](ctx: Context[TT], value: TT): Variable[TT] {.inline, noSideEffect.} = 73 | ## Wrap a variable to the context 74 | ## T is a Tensour[T, CudaTensor[T] or scalar T 75 | # TODO make the grad initialization optional to optimize memory use 76 | return Variable[TT](tape: ctx, ancestor: nil, value: value, grad: value.zeros_like) 77 | 78 | template len[TT](t: Context[TT]): int = 79 | ## Returns the number of operations applied in the context 80 | t.nodes.len() 81 | 82 | template push*[TT](t: Context[TT], node: Node[TT]) = #TODO: how not to export that 83 | ## Append a new operation to the context 84 | t.nodes.add(node) #Appending in Nim is add not push 85 | 86 | template value*[TT](v: Variable[TT]): TT = 87 | ## Unwrap the value from its context 88 | v.value 89 | 90 | proc check_ctx*(a, b: Variable) {.inline.} = 91 | if a.tape[].unsafeAddr != b.tape[].unsafeAddr: # compare pointer adress directly (avoid deep comparison) 92 | raise newException(ValueError, "You cannot combine variable from different contexts") 93 | 94 | proc backprop*[TT](v: Variable[TT]) = 95 | ## Differentiate the chain of operations w.r.t to this variable. 96 | ## Context will be reset 97 | 98 | # We initialize the Variable we want to backpropagate on with a Tensor of ones. 99 | # TODO, restrict to scalar backprop? 100 | v.grad = v.value.ones_like 101 | 102 | # We pop the context until we find the gate that produced our Variable 103 | while v.tape.len > 0 and v.tape.nodes[^1] != v.ancestor: 104 | discard v.tape.nodes.pop 105 | 106 | # Now, until the context is been all backpropagated through we update 107 | # each intermediate variables with its accumulated gradient and then pop the node 108 | # TODO: Count Toward Zero memory optimization: 109 | # https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation and https://github.com/Rufflewind/revad/blob/de509269fe878bc9d564775abc25c4fa663d8a5e/src/chain.rs 110 | 111 | while v.tape.len > 0: 112 | let curNode = v.tape.nodes.pop 113 | let curVar = curNode.child 114 | 115 | let diffs = curNode.gate.backward(curVar.grad) 116 | 117 | for i in 0 ..< curNode.gate.arity: 118 | curNode.parents[i].grad += diffs[i] --------------------------------------------------------------------------------