├── .gitignore
├── Contributors.md
├── benchmarks
    ├── xtime.rb
    ├── kostya_matmul.nim
    ├── kostya_matmul_lc_reshape.nim
    ├── kostya_matmul_hadamard.nim
    ├── kostya_matmul_mitems.nim
    ├── implementation
    │   ├── proc_method_closure_bench.nim
    │   └── stable_sigmoid_bench.nim
    └── ex01_xor.nim
├── src
    ├── autograd
    │   ├── utils.nim
    │   ├── accessors.nim
    │   ├── gates_reduce.nim
    │   ├── gates_basic.nim
    │   ├── gates_blas.nim
    │   └── autograd.nim
    ├── arraymancer_nn_primitives.nim
    ├── nn
    │   ├── layers
    │   │   ├── layer.nim
    │   │   └── linear.nim
    │   ├── loss
    │   │   ├── loss.nim
    │   │   └── sigmoid_cross_entropy.nim
    │   ├── activation
    │   │   ├── relu.nim
    │   │   └── sigmoid.nim
    │   └── optimizers
    │   │   └── optimizers.nim
    ├── arraymancer
    │   ├── comparison.nim
    │   ├── display_cuda.nim
    │   ├── backend
    │   │   ├── blis.nim
    │   │   ├── cuda_global_state.nim
    │   │   ├── openmp.nim
    │   │   ├── cublas_helper_proc.nim
    │   │   └── cuda.nim
    │   ├── filling_data.nim
    │   ├── math_functions.nim
    │   ├── aggregate.nim
    │   ├── exporting.nim
    │   ├── utils
    │   │   ├── ast_utils.nim
    │   │   ├── nested_containers.nim
    │   │   └── functional.nim
    │   ├── data_structure_helpers.nim
    │   ├── fallback
    │   │   ├── blas_l3_gemm_aux.nim
    │   │   ├── naive_l2_gemv.nim
    │   │   ├── blas_l3_gemm_macro_kernel.nim
    │   │   ├── blas_l3_gemm_packing.nim
    │   │   ├── blas_l3_gemm_micro_kernel.nim
    │   │   └── blas_l3_gemm.nim
    │   ├── shortcuts.nim
    │   ├── init_cpu_deprecated_0_2_0.nim
    │   ├── global_config.nim
    │   ├── accessors_cuda.nim
    │   ├── term_rewriting.nim
    │   ├── ufunc.nim
    │   ├── operators_blas_l2l3_cuda.nim
    │   ├── operators_blas_l1.nim
    │   ├── operators_blas_l1_cuda.nim
    │   ├── higher_order_deprecated.nim
    │   ├── elementwise_cuda.nim
    │   ├── shapeshifting_cuda.nim
    │   ├── higher_order_cuda.nim
    │   ├── init_cuda.nim
    │   └── display.nim
    ├── arraymancer_ag.nim
    ├── arraymancer_nn.nim
    ├── nn_primitives
    │   ├── linear_primitives.nim
    │   ├── activation_primitives.nim
    │   └── sigmoid_cross_entropy_primitives.nim
    └── arraymancer.nim
├── tests
    ├── all_tests_cuda.nim
    ├── tensors
    │   ├── test_optimization.nim
    │   ├── test_filling_data.nim
    │   ├── test_shapeshifting_deprecated.nim
    │   ├── test_bugtracker.nim
    │   ├── test_aggregate.nim
    │   ├── test_display.nim
    │   ├── test_comparison.nim
    │   ├── test_display_deprecated.nim
    │   ├── test_comparison_deprecated.nim
    │   ├── test_math_functions.nim
    │   ├── test_ufunc_deprecated.nim
    │   ├── test_init_deprecated.nim
    │   ├── test_accessors_deprecated.nim
    │   ├── test_higherorder.nim
    │   ├── test_aggregate_deprecated.nim
    │   ├── test_ufunc.nim
    │   ├── test_shapeshifting_cuda.nim
    │   ├── test_accessors.nim
    │   └── test_init.nim
    ├── manual_checks
    │   ├── autograd_mean_arraymancer.nim
    │   └── autograd_mean_pytorch.py
    ├── all_tests_deprecated.nim
    ├── all_tests.nim
    └── autograd
    │   └── test_gate_blas.nim
├── docs
    └── Linear algebra notation comparison.md
├── .appveyor.yml
├── .travis.yml
├── changelog.md
├── examples
    └── ex01_xor_perceptron_from_scratch.nim
└── arraymancer.nimble


/.gitignore:
--------------------------------------------------------------------------------
1 | nimcache/
2 | bin/
3 | .DS_Store


--------------------------------------------------------------------------------
/Contributors.md:
--------------------------------------------------------------------------------
 1 | Arraymancer contributors (sorted alphabetically)
 2 | 
 3 | ### Eduardo Bart
 4 |   - OpenMP
 5 |   - Several performance optimizations and fix including
 6 |     - Strided iterators
 7 |     - Uninitialized seq
 8 |   - Shapeshifting procs
 9 |   - Developing the ecosystem with [arraymancer-vision](https://github.com/edubart/arraymancer-vision) and [arraymancer-demos](https://github.com/edubart/arraymancer-demos)
10 | 
11 | ### Mamy Ratsimbazafy
12 |   - Lead dev


--------------------------------------------------------------------------------
/benchmarks/xtime.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | # Copyright (c) 2014 'Konstantin Makarchev'
 4 | # 
 5 | # MIT License
 6 | #
 7 | # https://github.com/kostya/benchmarks/
 8 | 
 9 | def mem(pid); `ps p #{pid} -o rss`.split.last.to_i; end
10 | t = Time.now
11 | pid = Process.spawn(*ARGV.to_a)
12 | mm = 0
13 | 
14 | Thread.new do
15 |   mm = mem(pid)
16 |   while true
17 |     sleep 0.1
18 |     m = mem(pid)
19 |     mm = m if m > mm
20 |   end
21 | end
22 | 
23 | Process.waitall
24 | STDERR.puts "%.2fs, %.1fMb" % [Time.now - t, mm / 1024.0]


--------------------------------------------------------------------------------
/benchmarks/kostya_matmul.nim:
--------------------------------------------------------------------------------
 1 | # From: https://github.com/kostya/benchmarks
 2 | 
 3 | import os, strutils
 4 | import ../arraymancer
 5 | 
 6 | proc matgen(n: int): auto =
 7 |     result = newTensor(@[n,n],float64,Backend.Cpu)
 8 |     let tmp = 1.0 / (n*n).float64
 9 |     for i in 0 .. <n:
10 |         for j in 0 .. <n:
11 |             result[i,j] = tmp * (i - j).float64 * (i + j).float64
12 | 
13 | var n = 100
14 | if paramCount()>0:
15 |     n = parseInt(paramStr(1))
16 | n = n div 2 * 2
17 | 
18 | let a, b = matgen n
19 | let c = a * b
20 | 
21 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8)
22 | 
23 | # run with kostya_matmul 1500


--------------------------------------------------------------------------------
/benchmarks/kostya_matmul_lc_reshape.nim:
--------------------------------------------------------------------------------
 1 | # From: https://github.com/kostya/benchmarks
 2 | 
 3 | import os, strutils, future
 4 | import ../arraymancer
 5 | 
 6 | proc matgen(n: int): auto =
 7 |     result = newTensor(@[n,n],float64,Backend.Cpu)
 8 |     let tmp = 1.0 / (n*n).float64
 9 |     return lc[tmp * (i - j).float64 * (i + j).float64 | (i <- 0..<n, j<- 0..<n), float64].toTensor(Cpu).reshape(n, n)
10 | 
11 | 
12 | var n = 100
13 | if paramCount()>0:
14 |     n = parseInt(paramStr(1))
15 | n = n div 2 * 2
16 | 
17 | let a, b = matgen n
18 | let c = a * b
19 | 
20 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8)
21 | 
22 | # run with kostya_matmul 1500


--------------------------------------------------------------------------------
/src/autograd/utils.nim:
--------------------------------------------------------------------------------
 1 | import macros, sequtils
 2 | 
 3 | 
 4 | macro getSubType*(TT: typedesc): untyped =
 5 |   # Get the subtype T of an AnyTensor[T] input
 6 |   getTypeInst(TT)[1][1]
 7 | 
 8 | 
 9 | ## The following should not be useful, if ops is possible in fprop
10 | ## Shape should be matching in bprop and we shoudln't need special scalar treatment
11 | 
12 | # proc isScalar(t: AnyTensor): bool {.inline.}=
13 | #   for dim in t.shape:
14 | #     if dim != 1 and dim != 0:
15 | #       return false
16 | #   return true
17 | 
18 | 
19 | template product*[T: SomeNumber](s: seq[T]): T =
20 |   ## Get the product of all numbers in a sequence or array
21 |   s.foldl(a*b)


--------------------------------------------------------------------------------
/benchmarks/kostya_matmul_hadamard.nim:
--------------------------------------------------------------------------------
 1 | # From: https://github.com/kostya/benchmarks
 2 | 
 3 | import os, strutils, sequtils
 4 | import ../arraymancer
 5 | 
 6 | proc matgen(n: int): auto =
 7 |     result = newTensor(@[n,n],float64,Backend.Cpu)
 8 |     let tmp = 1.0 / (n*n).float64
 9 |     let j_idx = @[toSeq(0..<n)].toTensor(Cpu).astype(float64).broadcast([n,n])
10 |     let i_idx = j_idx.transpose
11 |     ## TODO: +, -, .* are very slow
12 |     return (i_idx - j_idx) .* (i_idx + j_idx) * tmp
13 | 
14 | var n = 100
15 | if paramCount()>0:
16 |     n = parseInt(paramStr(1))
17 | n = n div 2 * 2
18 | 
19 | let a, b = matgen n
20 | let c = a * b
21 | 
22 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8)
23 | 
24 | # run with kostya_matmul 1500


--------------------------------------------------------------------------------
/benchmarks/kostya_matmul_mitems.nim:
--------------------------------------------------------------------------------
 1 | # From: https://github.com/kostya/benchmarks
 2 | 
 3 | import os, strutils
 4 | import ../arraymancer
 5 | 
 6 | proc divmod[T: SomeInteger](n: T, b: T): (T, T) =
 7 |     ## return (n div base, n mod base)
 8 |     return (n div b, n mod b)
 9 | 
10 | proc matgen(n: int): auto =
11 |     result = newTensor(@[n,n],float64,Backend.Cpu)
12 |     let tmp = 1.0 / (n*n).float64
13 |     var counter = 0
14 |     for val in result.mitems:
15 |         let (i, j) = counter.divmod(n)
16 |         val = (i - j).float64 * (i + j).float64 * tmp
17 |         inc counter
18 | 
19 | var n = 100
20 | if paramCount()>0:
21 |     n = parseInt(paramStr(1))
22 | n = n div 2 * 2
23 | 
24 | let a, b = matgen n
25 | let c = a * b
26 | 
27 | echo formatFloat(c[n div 2, n div 2], ffDefault, 8)
28 | 
29 | # run with kostya_matmul 1500


--------------------------------------------------------------------------------
/tests/all_tests_cuda.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Please compile with -d:cuda switch
16 | import ../src/arraymancer,
17 |         ./tensors/test_operators_blas_cuda,
18 |         ./tensors/test_accessors_slicer_cuda,
19 |         ./tensors/test_shapeshifting_cuda


--------------------------------------------------------------------------------
/src/arraymancer_nn_primitives.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./arraymancer
16 | 
17 | import ./nn_primitives/[activation_primitives, linear_primitives, sigmoid_cross_entropy_primitives]
18 | 
19 | export activation_primitives, linear_primitives, sigmoid_cross_entropy_primitives


--------------------------------------------------------------------------------
/src/nn/layers/layer.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer
16 | 
17 | type Layer*[TT] = ref object of Gate[TT]
18 |   ## Inherits from Gate (arity field)
19 |   ## Add required fields for gradient descent
20 |   weight*: TT
21 |   bias*: TT
22 |   dW*: TT
23 |   dB*: TT


--------------------------------------------------------------------------------
/tests/tensors/test_optimization.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, sequtils
17 | 
18 | suite "Optimization":
19 |   test "Test if contiguous slices are detected as contiguous":
20 |     let a = [[1, 2, 3, 4, 5],
21 |             [6, 7, 8, 9, 10]].toTensor
22 | 
23 |     check: a[1, 2..3].isContiguous == true


--------------------------------------------------------------------------------
/src/arraymancer/comparison.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc `==`*[T](a,b: Tensor[T]): bool {.noSideEffect.}=
16 |   ## Tensor comparison
17 |   if a.shape != b.shape: return false
18 | 
19 |   for a, b in zip(a,b):
20 |     ## Iterate through the tensors using stride-aware iterators
21 |     ## Returns early if false
22 |     if a != b: return false
23 |   return true


--------------------------------------------------------------------------------
/src/arraymancer_ag.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./autograd/utils,
16 |         ./autograd/autograd,
17 |         ./autograd/gates_basic,
18 |         ./autograd/gates_blas,
19 |         ./autograd/gates_reduce,
20 |         ./autograd/accessors
21 | 
22 | export autograd,
23 |        gates_basic,
24 |        gates_blas,
25 |        gates_reduce,
26 |        accessors


--------------------------------------------------------------------------------
/src/arraymancer_nn.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./arraymancer, ./arraymancer_ag, ./arraymancer_nn_primitives
16 | 
17 | import  nn/activation/[sigmoid, relu],
18 |         nn/layers/linear,
19 |         nn/loss/sigmoid_cross_entropy,
20 |         nn/optimizers/optimizers
21 | 
22 | 
23 | export sigmoid, relu
24 | export linear, sigmoid_cross_entropy
25 | export optimizers


--------------------------------------------------------------------------------
/tests/tensors/test_filling_data.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | import ../../src/arraymancer
17 | import unittest, math, future, sequtils
18 | 
19 | 
20 | suite "Testing miscellaneous data functions":
21 |   test "Copy data from source":
22 |     let a = [[1,2],[3,4]].toTensor.reshape(2,2)
23 | 
24 |     var b = ones[int](4,1)
25 | 
26 |     b.copy_from(a)
27 | 
28 |     check: b == [[1],[2], [3], [4]].toTensor


--------------------------------------------------------------------------------
/src/nn/loss/loss.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, typetraits
16 | 
17 | 
18 | type Loss* [TT] = ref object of Gate[TT]
19 |   batch_size*: seq[int]
20 |   target*: TT
21 | 
22 | 
23 | method forward*[TT](self: Loss[TT], a: Variable[TT], target: TT): Variable[TT] {.base, inline.}=
24 |   # Forward for loss layers
25 |   raise newException(ValueError, "forward method is not implemented for " & $self.type.name)


--------------------------------------------------------------------------------
/src/autograd/accessors.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./autograd, ../arraymancer
16 | 
17 | template `[]`*[TT](v: Variable[TT], args: varargs[untyped]): Variable[TT] =
18 |   var result: type(v)
19 |   new result
20 | 
21 |   result.tape = v.tape
22 |   result.ancestor = v.ancestor
23 |   result.value = v.value.unsafeSlice(args)
24 |   result.grad = v.grad.unsafeSlice(args)
25 | 
26 |   result
27 | 
28 |   # TODO: tests for slicing correspondance


--------------------------------------------------------------------------------
/tests/manual_checks/autograd_mean_arraymancer.nim:
--------------------------------------------------------------------------------
 1 | 
 2 | import ../src/arraymancer, ../src/arraymancer_ag
 3 | import sequtils
 4 | 
 5 | let ctx = newContext Tensor[float32]
 6 | 
 7 | let
 8 |     a = ctx.variable(toSeq(1..12).toTensor.reshape(3,4).astype(float32))
 9 |     b = ctx.variable(toSeq(2..13).toTensor.reshape(3,4).astype(float32))
10 |     c = ctx.variable(toSeq(3..11).toTensor.reshape(3,3).astype(float32))
11 |     x = ctx.variable(toSeq(4..15).toTensor.reshape(4,3).astype(float32))
12 |     y = ctx.variable(toSeq(5..16).toTensor.reshape(4,3).astype(float32))
13 | 
14 | 
15 | # for t in [a,b,c,x,y]:
16 | #   echo t.value
17 | 
18 | 
19 | proc forwardNeuron[T](a,b,c,x,y: T): T =
20 |   let
21 |       ax = a * x
22 |       by = b * y
23 |       axpby = ax + by
24 |       axpbypc = axpby + c
25 |       # s = axpbypc.sigmoid()
26 |   return axpbypc
27 | 
28 | 
29 | var s = mean forwardNeuron(a,b,c,x,y)
30 | 
31 | 
32 | echo s.value
33 | 
34 | s.backprop
35 | 
36 | echo a.grad
37 | 
38 | echo b.grad
39 | 
40 | echo c.grad
41 | 
42 | echo x.grad
43 | 
44 | echo y.grad


--------------------------------------------------------------------------------
/docs/Linear algebra notation comparison.md:
--------------------------------------------------------------------------------
 1 | | Language/lib      | Normal matmul | element-wise  matmul (Hadamard) | vec-vec dot product | mat-vec multiplication|
 2 | | ------------- | ---------------------------- | --- | --- | --- |
 3 | | Arraymancer  | A * B | .* | dot(A, B) | A * B |
 4 | | neo/linalg  | A * B | \|*\| | A * B | A * B |
 5 | | Julia & Matlab | A * B | .* | dot(A, B) | A * B |
 6 | | Numpy ndarray| np.dot(A, B) or np.matmul(A, B) or A @ B| np.multiply(A, B) or A * B | np.dot(A, B) or np.inner(A, B) | np.dot(A, B) |
 7 | | R | A %*% B | A * B | A %*% B or dot(A, B)| A %*% B |
 8 | | Tensorflow | tf.matmul(A, B) or A @ B | tf.multiply(A, B) | tf.matmul(a, b, transpose_a=False, transpose_b=True) or tf.tensordot(a, b, 1) or tf.einsum('i,i->', x, y) | same reshape/transpose/einsum shenanigans as vec-vec|
 9 | | Torch/PyTorch | torch.mm(A,B) or torch.matmul(A,B) | torch.cmul(A, B) | torch.dot(A, B) or torch.matmul(A, B) | torch.mv(A, B) or torch.dot(A, B)
10 | | Theano | theano.tensor.dot(A, B) | A * B | dot(A, B) or vdot(A, B) ?| dot(A, B) or tensordot(A,B) ? |
11 | | Common math |


--------------------------------------------------------------------------------
/tests/all_tests_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../src/arraymancer,
16 |         ./tensors/test_init_deprecated,
17 |         ./tensors/test_comparison_deprecated,
18 |         ./tensors/test_accessors_deprecated,
19 |         ./tensors/test_accessors_slicer_deprecated,
20 |         ./tensors/test_display_deprecated,
21 |         ./tensors/test_operators_blas_deprecated,
22 |         ./tensors/test_aggregate_deprecated,
23 |         ./tensors/test_shapeshifting_deprecated,
24 |         ./tensors/test_ufunc_deprecated
25 | 


--------------------------------------------------------------------------------
/src/arraymancer/display_cuda.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc `$`*[T](t: CudaTensor[T]): string =
16 |   ## Pretty-print a CudaTensor (when using ``echo`` for example)
17 |   let desc = "Tensor of shape " & t.shape.join("x") & " of type \"" & T.name & "\" on backend \"" & "Cuda" & "\""
18 |   
19 |   let cpu_t = t.cpu()
20 |   
21 |   if t.rank <= 2:
22 |     return desc & "\n" & cpu_t.disp2d
23 |   elif t.rank == 3:
24 |     return desc & "\n" & cpu_t.disp3d
25 |   elif t.rank == 4:
26 |     return desc & "\n" & cpu_t.disp4d
27 |   else:
28 |     return desc & "\n" & " -- NotImplemented: Display not implemented for tensors of rank > 4"


--------------------------------------------------------------------------------
/benchmarks/implementation/proc_method_closure_bench.nim:
--------------------------------------------------------------------------------
 1 | import times
 2 | 
 3 | type FooBase = ref object {.inheritable.}
 4 |   dummy: int
 5 | 
 6 | type Foo{.final.} = ref object of FooBase
 7 |   value : float32
 8 | 
 9 | 
10 | proc inplace_add_proc(x: var Foo, a: float32) =
11 |   x.value += a
12 | 
13 | proc inplace_add_closure(x: var float32, a: float32) =
14 |   proc add_closure(v: var float32) = v += a
15 |   add_closure(x)
16 | 
17 | method inplace_add_method(x: FooBase, a: float32) {.base.} =
18 |   discard
19 | 
20 | method inplace_add_method(x: Foo, a: float32) =
21 |   x.value += a
22 | 
23 | var bar : Foo
24 | new bar
25 | var start = cpuTime()
26 | for i in 0..<100000000:
27 |   inplace_add_proc(bar, 1.0f)
28 | echo " Proc with ref object ", cpuTime() - start
29 | 
30 | var x : float32
31 | start = cpuTime()
32 | for i in 0..<100000000:
33 |   inplace_add_closure(x, 1.0f)
34 | echo " Closures ", cpuTime() - start
35 | 
36 | var baz : Foo
37 | new baz
38 | start = cpuTime()
39 | for i in 0..<100000000:
40 |   inplace_add_method(baz, 1.0f)
41 | echo " Methods ", cpuTime() - start
42 | 
43 | # Results with -d:release on i5-5257U (dual-core mobile 2.7GHz, turbo 3.1)
44 | # Proc with ref object 0.099993
45 | # Closures 2.708598
46 | # Methods 0.3122219999999998


--------------------------------------------------------------------------------
/tests/all_tests.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../src/arraymancer,
16 |         ./tensors/test_init,
17 |         ./tensors/test_comparison,
18 |         ./tensors/test_accessors,
19 |         ./tensors/test_accessors_slicer,
20 |         ./tensors/test_display,
21 |         ./tensors/test_operators_blas,
22 |         ./tensors/test_math_functions,
23 |         ./tensors/test_higherorder,
24 |         ./tensors/test_aggregate,
25 |         ./tensors/test_shapeshifting,
26 |         ./tensors/test_broadcasting,
27 |         ./tensors/test_ufunc,
28 |         ./tensors/test_filling_data,
29 |         ./tensors/test_optimization,
30 |         ./tensors/test_bugtracker,
31 |         ./autograd/test_gate_blas
32 | 


--------------------------------------------------------------------------------
/src/arraymancer/backend/blis.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | when defined(blis):
16 |   static: echo "--USING BLIS--"
17 |   include ./blis_api
18 |   let blis_status = bli_init()
19 |   echo "Blis initiatialization status: " & $blis_status
20 | 
21 |   proc quit_blis() {.noconv.}=
22 |     when defined(debug):
23 |       echo "Blis quit status: " & $bli_finalize()
24 |     else:
25 |       discard bli_finalize()
26 |   addQuitProc(quit_blis)
27 | 
28 | # else:
29 | #   static: echo "Consider adding BLIS from \"https://github.com/flame/blis\" " &
30 | #           "and compile Arraymancer with \"-d:blis\" " &
31 | #           "for operations on array slices without copy. " &
32 | #           "OSX users can install it through Homebrew."
33 | 
34 | 


--------------------------------------------------------------------------------
/tests/tensors/test_shapeshifting_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, future, sequtils
17 | 
18 | suite "Shapeshifting":
19 |   test "Reshape":
20 |     let a = toSeq(1..4).toTensor(Cpu).reshape(2,2)
21 |     check: a == [[1,2],
22 |                  [3,4]].toTensor(Cpu)
23 | 
24 |   test "Concatenation":
25 |     let a = toSeq(1..4).toTensor(Cpu).reshape(2,2)
26 | 
27 |     let b = toSeq(5..8).toTensor(Cpu).reshape(2,2)
28 | 
29 |     check: concat(a,b, axis = 0) == [[1,2],
30 |                                      [3,4],
31 |                                      [5,6],
32 |                                      [7,8]].toTensor(Cpu)
33 |     check: concat(a,b, axis = 1) == [[1,2,5,6],
34 |                                      [3,4,7,8]].toTensor(Cpu)


--------------------------------------------------------------------------------
/src/arraymancer/filling_data.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc check_size(a, b:AnyTensor)  {.noSideEffect.}=
16 |   ## Check if the total number of elements match
17 |   if a.size != b.size:
18 |     raise newException(ValueError, "Both Tensors should have the same total number of elements")
19 | 
20 | proc copy_from*[T](dst: var Tensor[T], src: Tensor[T]) =
21 |   ## Copy the data from a source Tensor. Both tensors must have the same number of elements
22 |   ## but do not need to have the same shape.
23 |   ## Data is copied without re-allocation.
24 |   ## Warning ⚠
25 |   ##   The destination tensor data will be overwritten. It however conserves its shape and strides.
26 | 
27 |   when compileOption("boundChecks"):
28 |     check_size(dst, src)
29 | 
30 |   for x, val in mzip(dst, src):
31 |     x = val


--------------------------------------------------------------------------------
/src/arraymancer/math_functions.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # Non-operator math functions
17 | 
18 | proc reciprocal*[T: SomeReal](t: Tensor[T]): Tensor[T] =
19 |   # Return a tensor with the reciprocal 1/x of all elements
20 |   t.mapT(1.T/x)
21 | 
22 | proc mreciprocal*[T: SomeReal](t: var Tensor[T]) =
23 |   # Apply the reciprocal 1/x in-place to all elements of the Tensor
24 |   t.applyT(1.T/x)
25 | 
26 | proc negate*[T: SomeSignedInt|SomeReal](t: Tensor[T]): Tensor[T] =
27 |   # Return a tensor with all elements negated (10 -> -10)
28 |   t.mapT(-x)
29 | 
30 | proc mnegate*[T: SomeSignedInt|SomeReal](t: var Tensor[T]) =
31 |   # Negate in-place all elements of the tensor (10 -> -10)
32 |   t.applyT(-x)
33 | 
34 | proc `-`*[T: SomeNumber](t: Tensor[T]): Tensor[T] =
35 |   ## Negate all values of a Tensor
36 |   t.mapT(-x)


--------------------------------------------------------------------------------
/src/arraymancer/aggregate.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # ### Standard aggregate functions
16 | # TODO consider using stats from Nim standard lib: https://nim-lang.org/docs/stats.html#standardDeviation,RunningStat
17 | 
18 | proc sum*[T: SomeNumber](t: Tensor[T]): T {.noSideEffect.}=
19 |   ## Compute the sum of all elements of T
20 | 
21 |   result = 0.T
22 |   for val in t:
23 |     result += val
24 | 
25 | proc sum*[T: SomeNumber](t: Tensor[T], axis: int): Tensor[T] {.inline.}=
26 |   ## Compute the sum of all elements of T along an axis
27 |   t.reduce(`+`, axis = axis)
28 | 
29 | proc mean*[T: SomeReal](t: Tensor[T]): T {.inline.}=
30 |   ## Compute the mean of all elements of T
31 |   t.sum / t.size.T
32 | 
33 | proc mean*[T: SomeReal](t: Tensor[T], axis: int): Tensor[T] {.inline.}=
34 |   ## Compute the mean of T along an axis
35 |   t.sum(axis) / t.shape[axis].T


--------------------------------------------------------------------------------
/.appveyor.yml:
--------------------------------------------------------------------------------
 1 | version: '{build}'
 2 | 
 3 | cache:
 4 | - nim-0.17.2_x64.zip
 5 | - x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z
 6 | - packages -> **\packages.config
 7 | - '%LocalAppData%\NuGet\Cache -> **\packages.config'
 8 | 
 9 | matrix:
10 |   fast_finish: true
11 | 
12 | environment:
13 |   matrix:
14 |     - MINGW_ARCHIVE: x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z
15 |       MINGW_DIR: mingw64
16 |       MINGW_URL: https://ayera.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z
17 |       NIM_ARCHIVE: nim-0.17.2_x64.zip
18 |       NIM_DIR: nim-0.17.2
19 |       NIM_URL: https://nim-lang.org/download/nim-0.17.2_x64.zip
20 |       platform: x64
21 | 
22 | install:
23 |   - MKDIR %CD%\tools_tmp
24 |   - IF not exist "%MINGW_ARCHIVE%" appveyor DownloadFile "%MINGW_URL%" -FileName "%MINGW_ARCHIVE%"
25 |   - 7z x -y "%MINGW_ARCHIVE%" -o"%CD%\tools_tmp"> nul
26 |   - IF not exist "%NIM_ARCHIVE%" appveyor DownloadFile "%NIM_URL%" -FileName "%NIM_ARCHIVE%"
27 |   - 7z x -y "%NIM_ARCHIVE%" -o"%CD%\tools_tmp"> nul
28 |   - SET PATH=%CD%\tools_tmp\%NIM_DIR%\bin;%CD%\tools_tmp\%MINGW_DIR%\bin;%PATH%
29 |   - ps: nuget install OpenBLAS -o "${env:APPVEYOR_BUILD_FOLDER}"
30 |   - ps: cp OpenBLAS.0.2.14.1/lib/native/bin/x64/libopenblas.dll blas.dll
31 |   - SET PATH=%PATH%;%CD%
32 | 
33 | build_script:
34 |   - nimble.exe refresh
35 | 
36 | test_script:
37 |   - nimble.exe test
38 | 
39 | deploy: off
40 | 


--------------------------------------------------------------------------------
/src/arraymancer/backend/cuda_global_state.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # ###################################################
17 | # Global Cuda and CuBLAS state
18 | 
19 | # CuBLAS stream for parallel async processing on GPU
20 | # Computations/Memcpy on different streams are done in simultaneously
21 | # Streams are also necessary for async Cuda procs like cudaMemcpyAsync
22 | var defaultStream: cublas_api.cudaStream_t
23 | check cudaStreamCreate(addr defaultStream)
24 | 
25 | # CuBLAS handle
26 | # Note: it prevents {.noSideEffect.} in all CuBLAS proc :/
27 | var defaultHandle: cublasHandle_t
28 | check cublasCreate(addr defaultHandle)
29 | 
30 | proc cudaRelease() {.noconv.}=
31 |   # Release all cuda resources
32 |   check cublasDestroy(defaultHandle)
33 |   check cudaStreamDestroy(defaultStream)
34 | 
35 |   when defined(debug):
36 |     echo "CUDA and CuBLAS resources successfully released"
37 | 
38 | addQuitProc(cudaRelease)
39 | 


--------------------------------------------------------------------------------
/src/arraymancer/exporting.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc toRawSeq*[T](t:Tensor[T]): seq[T] {.noSideEffect.} =
16 |   ## Convert a tensor to the raw sequence of data.
17 | 
18 |   # Due to forward declaration this proc must be declared
19 |   # after "cpu" proc are declared in init_cuda
20 |   when t is Tensor:
21 |     return t.data
22 |   elif t is CudaTensor:
23 |     return t.cpu.data
24 | 
25 | proc export_tensor*[T](t: Tensor[T]):
26 |   tuple[shape: seq[int], strides: seq[int], data: seq[T]] {.noSideEffect.}=
27 |   ## Export the tensor as a tuple containing
28 |   ## - shape
29 |   ## - strides
30 |   ## - data
31 |   ## If the tensor was not contiguous (a slice for example), it is reshaped.
32 |   ## Data is exported in C order (last index changes the fastest, column in 2D case)
33 | 
34 |   let contig_t = t.unsafeContiguous
35 | 
36 |   result.shape = contig_t.shape
37 |   result.strides = contig_t.strides
38 |   result.data = contig_t.data


--------------------------------------------------------------------------------
/src/arraymancer/utils/ast_utils.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Tools to manipulate Nim Abstract Syntax Tree
16 | 
17 | proc hasType(x: NimNode, t: static[string]): bool {. compileTime .} =
18 |   ## Compile-time type checking
19 |   sameType(x, bindSym(t))
20 | 
21 | proc isInt(x: NimNode): bool {. compileTime .} =
22 |   ## Compile-time type checking
23 |   hasType(x, "int")
24 | 
25 | proc isAllInt(slice_args: NimNode): bool {. compileTime .} =
26 |   ## Compile-time type checking
27 |   result = true
28 |   for child in slice_args:
29 |     # We don't use early return here as everything is evaluated at compile-time,
30 |     # has no run-time impact and there are very few slice_args
31 |     result = result and isInt(child)
32 | 
33 | proc pop(tree: var NimNode): NimNode {. compileTime .}=
34 |   ## varargs[untyped] consumes all arguments so the actual value should be popped
35 |   ## https://github.com/nim-lang/Nim/issues/5855
36 |   result = tree[tree.len-1]
37 |   tree.del(tree.len-1)


--------------------------------------------------------------------------------
/src/arraymancer/data_structure_helpers.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | proc isNaiveIterable(t: AnyTensor): bool {.inline.}=
17 |   ## If t is not a slice we can iterate with a naive for loop
18 |   return t.data.len == t.size
19 | 
20 | proc isNaiveIterableWith(t1: AnyTensor, t2: AnyTensor): bool {.inline.}=
21 |   ## If shape and strides are the same, we can iterate on both tensors at the same time
22 |   ## modulo their offsets
23 |   ## We don't need those to have data.len == size
24 |   return (t1.strides == t2.strides) and (t1.shape == t2.shape)
25 | 
26 | proc getTransposeTarget(t: AnyTensor): TransposeType {.noSideEffect.}=
27 |   ## TransposeType is introduced by ``nimblas``
28 |   ## Default layout is Row major.
29 |   ## Everytime it is worth it or fused with a BLAS operation we change the strides to Row-Major
30 |   if is_C_contiguous(t): return TransposeType.noTranspose
31 |   elif is_F_contiguous(t): return TransposeType.transpose
32 |   else: raise newException(ValueError,"Operation not supported for this matrix. It has a non-contiguous layout")


--------------------------------------------------------------------------------
/tests/tensors/test_bugtracker.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest
17 | 
18 | 
19 | suite "Testing specific issues from bug tracker":
20 |   test "Span slicing inside dynamic type procs fails to compile":
21 |     # https://github.com/mratsim/Arraymancer/issues/43
22 |     proc boo[T](): T {.used.}=
23 |       var a = zeros[int]([2,2])
24 |       echo a[1,_] #<-- Bug was undeclared identifier '_',
25 |                   # unfortunately there is no way to gracefully check this
26 |                   # with when not compiles for example
27 | 
28 |     # Check that our solution, export '_' doesn't create compatibility issue
29 | 
30 |     # tuple destructuring
31 |     {.push hints: off.}  ## TODO replaced by XDeclaredButNotUsed when https://github.com/nim-lang/Nim/issues/4044
32 |     let (a, _, c) = (1, @[2,3],"hello")
33 |     {.pop.}
34 | 
35 |     # https://github.com/mratsim/Arraymancer/issues/61
36 |     proc foo[T](t: Tensor[T], x: int): Tensor[T] =
37 |       t.unsafeSlice(x, _, _).unsafeReshape([t.shape[1], t.shape[2]])
38 | 
39 |     discard zeros[int]([2,2,2]).foo(1)


--------------------------------------------------------------------------------
/tests/manual_checks/autograd_mean_pytorch.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # Reference code
 3 | 
 4 | import torch
 5 | from torch.autograd import Variable
 6 | 
 7 | a = Variable(torch.arange(1,13).view(3,4), requires_grad=True)
 8 | b = Variable(torch.arange(2,14).view(3,4), requires_grad=True)
 9 | c = Variable(torch.arange(3,12).view(3,3), requires_grad=True)
10 | x = Variable(torch.arange(4,16).view(4,3), requires_grad=True)
11 | y = Variable(torch.arange(5,17).view(4,3), requires_grad=True)
12 | 
13 | 
14 | def forwardNeuron(a,b,c,x,y):
15 |   ax = a @ x
16 |   by = b @ y
17 |   axpby = ax + by
18 |   axpbypc = axpby + c
19 | 
20 |   return axpbypc
21 | 
22 | s = forwardNeuron(a,b,c,x,y).mean()
23 | 
24 | print(s)
25 | Variable containing:
26 |  599
27 | [torch.FloatTensor of size 1]
28 | 
29 | s.backward()
30 | 
31 | print(a.grad)
32 | # Variable containing:
33 | #  1.6667  2.6667  3.6667  4.6667
34 | #  1.6667  2.6667  3.6667  4.6667
35 | #  1.6667  2.6667  3.6667  4.6667
36 | # [torch.FloatTensor of size 3x4]
37 | 
38 | print(b.grad)
39 | # Variable containing:
40 | #  2  3  4  5
41 | #  2  3  4  5
42 | #  2  3  4  5
43 | # [torch.FloatTensor of size 3x4]
44 | 
45 | print(c.grad)
46 | # Variable containing:
47 | #  0.1111  0.1111  0.1111
48 | #  0.1111  0.1111  0.1111
49 | #  0.1111  0.1111  0.1111
50 | # [torch.FloatTensor of size 3x3]
51 | 
52 | print(x.grad)
53 | # Variable containing:
54 | #  1.6667  1.6667  1.6667
55 | #  2.0000  2.0000  2.0000
56 | #  2.3333  2.3333  2.3333
57 | #  2.6667  2.6667  2.6667
58 | # [torch.FloatTensor of size 4x3]
59 | 
60 | print(y.grad)
61 | # Variable containing:
62 | #  2.0000  2.0000  2.0000
63 | #  2.3333  2.3333  2.3333
64 | #  2.6667  2.6667  2.6667
65 | #  3.0000  3.0000  3.0000
66 | # [torch.FloatTensor of size 4x3]


--------------------------------------------------------------------------------
/tests/tensors/test_aggregate.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math
17 | 
18 | suite "Testing aggregation functions":
19 |   let t = [[0, 1, 2],
20 |           [3, 4, 5],
21 |           [6, 7, 8],
22 |           [9, 10, 11]].toTensor()
23 | 
24 |   test "Sum all elements":
25 |     check: t.sum == 66
26 | 
27 |   test "Sum over axis":
28 |     let row_sum = [[18, 22, 26]].toTensor()
29 |     let col_sum = [[3],
30 |                    [12],
31 |                    [21],
32 |                    [30]].toTensor()
33 |     check: t.sum(axis=0) == row_sum
34 |     check: t.sum(axis=1) == col_sum
35 | 
36 |     ## TODO: 3D axis sum
37 |   test "Mean of all elements":
38 |     check: t.astype(float).mean == 5.5 # Note: may fail due to float rounding
39 | 
40 |   test "Mean over axis":
41 |     let row_mean = [[4.5, 5.5, 6.5]].toTensor()
42 |     let col_mean = [[1.0],
43 |                     [4.0],
44 |                     [7.0],
45 |                     [10.0]].toTensor()
46 |     check: t.astype(float).mean(axis=0) == row_mean
47 |     check: t.astype(float).mean(axis=1) == col_mean


--------------------------------------------------------------------------------
/tests/tensors/test_display.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import math, unittest
17 | 
18 | 
19 | suite "Displaying tensors":
20 |   test "Display compiles":
21 |     const
22 |       a = @[1, 2, 3, 4, 5]
23 |       b = @[1, 2, 3, 4, 5]
24 | 
25 |     var
26 |       vandermonde: seq[seq[int]]
27 |       row: seq[int]
28 | 
29 |     vandermonde = newSeq[seq[int]]()
30 | 
31 |     for i, aa in a:
32 |       row = newSeq[int]()
33 |       vandermonde.add(row)
34 |       for j, bb in b:
35 |         vandermonde[i].add(aa^bb)
36 | 
37 |     # @[@[1, 1, 1, 1, 1], @[2, 4, 8, 16, 32], @[3, 9, 27, 81, 243], @[4, 16, 64, 256, 1024], @[5, 25, 125, 625, 3125]]
38 | 
39 |     let t_van = vandermonde.toTensor()
40 |     when not compiles(echo t_van): check: false
41 | 
42 |     # Tensor of shape 5x5 of type "int" on backend "Cpu"
43 |     # |1      1       1       1       1|
44 |     # |2      4       8       16      32|
45 |     # |3      9       27      81      243|
46 |     # |4      16      64      256     1024|
47 |     # |5      25      125     625     3125|
48 | 
49 |     # TODO: Better display tests


--------------------------------------------------------------------------------
/tests/tensors/test_comparison.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math
17 | 
18 | 
19 | suite "Testing tensor comparison":
20 |   test "Testing for [1..^2, 1..3] slicing":
21 |     const
22 |       a = @[1, 2, 3, 4, 5]
23 |       b = @[1, 2, 3, 4, 5]
24 | 
25 |     var
26 |       vandermonde: seq[seq[int]]
27 |       row: seq[int]
28 | 
29 |     vandermonde = newSeq[seq[int]]()
30 | 
31 |     for i, aa in a:
32 |       row = newSeq[int]()
33 |       vandermonde.add(row)
34 |       for j, bb in b:
35 |         vandermonde[i].add(aa^bb)
36 | 
37 |     let t_van = vandermonde.toTensor()
38 | 
39 |     # Tensor of shape 5x5 of type "int" on backend "Cpu"
40 |     # |1      1       1       1       1|
41 |     # |2      4       8       16      32|
42 |     # |3      9       27      81      243|
43 |     # |4      16      64      256     1024|
44 |     # |5      25      125     625     3125|
45 | 
46 |     let test = @[@[4, 8, 16], @[9, 27, 81], @[16, 64, 256]]
47 |     let t_test = test.toTensor()
48 |     
49 |     check: t_van[1..^2,1..3] == t_test
50 |     check: t_van[1..3,1..3] == t_test


--------------------------------------------------------------------------------
/tests/tensors/test_display_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import math, unittest
17 | 
18 | 
19 | suite "Displaying tensors":
20 |   test "Display compiles":
21 |     const
22 |       a = @[1, 2, 3, 4, 5]
23 |       b = @[1, 2, 3, 4, 5]
24 | 
25 |     var
26 |       vandermonde: seq[seq[int]]
27 |       row: seq[int]
28 | 
29 |     vandermonde = newSeq[seq[int]]()
30 | 
31 |     for i, aa in a:
32 |       row = newSeq[int]()
33 |       vandermonde.add(row)
34 |       for j, bb in b:
35 |         vandermonde[i].add(aa^bb)
36 | 
37 |     # @[@[1, 1, 1, 1, 1], @[2, 4, 8, 16, 32], @[3, 9, 27, 81, 243], @[4, 16, 64, 256, 1024], @[5, 25, 125, 625, 3125]]
38 | 
39 |     let t_van = vandermonde.toTensor(Cpu)
40 |     when compiles(echo t_van): check: true
41 | 
42 |     # Tensor of shape 5x5 of type "int" on backend "Cpu"
43 |     # |1      1       1       1       1|
44 |     # |2      4       8       16      32|
45 |     # |3      9       27      81      243|
46 |     # |4      16      64      256     1024|
47 |     # |5      25      125     625     3125|
48 |     
49 |     # TODO: Better display tests


--------------------------------------------------------------------------------
/tests/tensors/test_comparison_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math
17 | 
18 | 
19 | suite "Testing tensor comparison":
20 |   test "Testing for [1..^2, 1..3] slicing":
21 |     const
22 |       a = @[1, 2, 3, 4, 5]
23 |       b = @[1, 2, 3, 4, 5]
24 | 
25 |     var
26 |       vandermonde: seq[seq[int]]
27 |       row: seq[int]
28 | 
29 |     vandermonde = newSeq[seq[int]]()
30 | 
31 |     for i, aa in a:
32 |       row = newSeq[int]()
33 |       vandermonde.add(row)
34 |       for j, bb in b:
35 |         vandermonde[i].add(aa^bb)
36 | 
37 |     let t_van = vandermonde.toTensor(Cpu)
38 | 
39 |     # Tensor of shape 5x5 of type "int" on backend "Cpu"
40 |     # |1      1       1       1       1|
41 |     # |2      4       8       16      32|
42 |     # |3      9       27      81      243|
43 |     # |4      16      64      256     1024|
44 |     # |5      25      125     625     3125|
45 | 
46 |     let test = @[@[4, 8, 16], @[9, 27, 81], @[16, 64, 256]]
47 |     let t_test = test.toTensor(Cpu)
48 |     
49 |     check: t_van[1..^2,1..3] == t_test
50 |     check: t_van[1..3,1..3] == t_test


--------------------------------------------------------------------------------
/src/arraymancer/fallback/blas_l3_gemm_aux.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Compute Y += alpha * X
16 | proc geaxpy[T]( m, n: int,
17 |                 alpha: T,
18 |                 X: ref array[MRNR, T],
19 |                 incRowX, incColX: int,
20 |                 Y: var seq[T], offY: int,
21 |                 incRowY, incColY: int)
22 |                 {.noSideEffect.}=
23 | 
24 |   if alpha != 1.T:
25 |     for j in 0 ..< n:
26 |       for i in 0 ..< m:
27 |         Y[i*incRowY + j*incColY + offY] += alpha * X[i*incRowX + j*incColX]
28 |   else:
29 |     for j in 0 ..< n:
30 |       for i in 0 ..< m:
31 |         Y[i*incRowY + j*incColY + offY] += X[i*incRowX + j*incColX]
32 | 
33 | # Compute X *= alpha
34 | proc gescal[T]( m, n: int,
35 |                 alpha: T,
36 |                 X: var seq[T], offX: int,
37 |                 incRowX, incColX: int)
38 |                 {.noSideEffect.} =
39 | 
40 |   if alpha != 0.T:
41 |     for j in 0 ..< n:
42 |       for i in 0 ..< m:
43 |         X[i*incRowX + j*incColX + offX] *= alpha
44 |   else:
45 |     for j in 0 ..< n:
46 |       for i in 0 ..< m:
47 |         X[i*incRowX + j*incColX + offX] = 0


--------------------------------------------------------------------------------
/tests/tensors/test_math_functions.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, future, math
17 | 
18 | suite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
19 |   test "Reciprocal (element-wise 1/x)":
20 |     var a = [1.0, 10, 20, 30].toTensor.reshape(4,1)
21 | 
22 | 
23 |     check: a.reciprocal  == [[1.0],
24 |                             [1.0/10.0],
25 |                             [1.0/20.0],
26 |                             [1.0/30.0]].toTensor
27 | 
28 |     a.mreciprocal
29 | 
30 |     check: a == [[1.0],
31 |                 [1.0/10.0],
32 |                 [1.0/20.0],
33 |                 [1.0/30.0]].toTensor
34 | 
35 |   test "Negate elements (element-wise -x)":
36 |     block: # Out of place
37 |       var a = [1.0, 10, 20, 30].toTensor.reshape(4,1)
38 | 
39 | 
40 |       check: a.negate  == [[-1.0],
41 |                           [-10.0],
42 |                           [-20.0],
43 |                           [-30.0]].toTensor
44 | 
45 |       a.mnegate
46 | 
47 |       check: a == [[-1.0],
48 |                   [-10.0],
49 |                   [-20.0],
50 |                   [-30.0]].toTensor


--------------------------------------------------------------------------------
/src/nn/activation/relu.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils
16 | import ../../arraymancer_nn_primitives
17 | import math
18 | 
19 | type ReluActivation* {.final.} [TT] = ref object of Gate[TT]
20 |   cache: TT
21 | 
22 | method forward*[TT](self: ReluActivation[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
23 |   new result
24 | 
25 |   result.tape = a.tape
26 |   result.value = relu a.value
27 |   result.grad = zeros[getSubType(TT)](result.value.shape)
28 | 
29 | method backward*[TT](self: ReluActivation[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
30 |   result[0] = gradient.relu_backward(self.cache)
31 | 
32 | proc relu*[TT](a: Variable[TT]): Variable[TT] =
33 |   ## Input:
34 |   ##   - A variable
35 | 
36 |   # Gate
37 |   var gate: ReluActivation[TT]
38 |   new gate
39 |   gate.arity = 1
40 | 
41 |   # Node
42 |   var node: Node[TT]
43 |   new node
44 | 
45 |   node.gate = gate
46 |   node.parents[0] = a
47 | 
48 |   a.tape.push(node)
49 | 
50 |   # Resulting var
51 |   result = gate.forward(a)
52 |   result.ancestor = node
53 |   node.child = result
54 | 
55 |   # Caching for backprop
56 |   gate.cache = result.value


--------------------------------------------------------------------------------
/src/nn/activation/sigmoid.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils
16 | import ../../arraymancer_nn_primitives
17 | 
18 | type SigmoidActivation* {.final.} [TT] = ref object of Gate[TT]
19 |   cache: TT
20 | 
21 | method forward*[TT](self: SigmoidActivation[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
22 |   new result
23 | 
24 |   result.tape = a.tape
25 |   result.value = sigmoid a.value
26 |   result.grad = zeros[getSubType(TT)](result.value.shape)
27 | 
28 | method backward*[TT](self: SigmoidActivation[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
29 |   result[0] = gradient.sigmoid_backward(self.cache)
30 | 
31 | proc sigmoid*[TT](a: Variable[TT]): Variable[TT] =
32 |   ## Input:
33 |   ##   - A variable
34 | 
35 |   # Gate
36 |   var gate: SigmoidActivation[TT]
37 |   new gate
38 |   gate.arity = 1
39 | 
40 |   # Node
41 |   var node: Node[TT]
42 |   new node
43 | 
44 |   node.gate = gate
45 |   node.parents[0] = a
46 | 
47 |   a.tape.push(node)
48 | 
49 |   # Resulting var
50 |   result = gate.forward(a)
51 |   result.ancestor = node
52 |   node.child = result
53 | 
54 |   # Caching for backprop
55 |   gate.cache = result.value


--------------------------------------------------------------------------------
/src/arraymancer/fallback/naive_l2_gemv.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # Notes on optimizing performance:
17 | # Google: https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt
18 | # UlmBLAS: https://github.com/michael-lehn/ulmBLAS/blob/master/ulmblas/level2/gemv.tcc
19 | 
20 | 
21 | proc naive_gemv_fallback[T: SomeInteger](
22 |           alpha: T,
23 |           A: Tensor[T],
24 |           x: Tensor[T],
25 |           beta: T,
26 |           y: var Tensor[T]) =
27 |   ## y <- alpha * A * x + beta * y
28 | 
29 | 
30 |   if alpha == 0.T and beta == 1.T: return
31 | 
32 |   # BLAS: scal (multiplication by a scalar)
33 |   # WARNING: This will multiply all values, regardless of stepping.
34 |   for val in y.mitems:
35 |     val *= beta
36 | 
37 |   if alpha == 0.T: return
38 | 
39 |   # TODO: instead of a naive implementation use BLIS/ulmBLAS implementation with
40 |   # - if A is colMajor, use fused axpy BLAS op
41 |   # - if A is rowMajor, use fused dotu BLAS op
42 |   # - packing
43 | 
44 |   # Naive implementation: split the matrices along vertical axis
45 |   var i: int = 0
46 |   let colA = A.shape[1]
47 | 
48 |   for ai in A.axis(0):
49 |     y[i] = y[i] + alpha * dot(ai.reshape(colA),x)
50 |     i += 1
51 | 


--------------------------------------------------------------------------------
/src/nn_primitives/linear_primitives.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../arraymancer
16 | import math
17 | 
18 | # Sigmoid cross-entropy function that works directly on Tensors
19 | # and provide control without autograd
20 | 
21 | # Linear forward and backward
22 | proc linear*[T](x: var Tensor[T], weight: Tensor[T], bias: Tensor[T]) {.inline.} =
23 |   x = weight * x
24 |   x .+= bias
25 | 
26 | proc linear*[T](x: var Tensor[T], weight: Tensor[T]) {.inline.} =
27 |   x = weight * x
28 | 
29 | proc linear_backward*[T](
30 |         gradient: Tensor[T],
31 |         cached_tensor,
32 |         weight, bias: Tensor[T],
33 |         dW, db: var Tensor[T]): Tensor[T] {.inline.} =
34 |   result = weight.unsafeTranspose * gradient
35 |   dW += gradient * cached_tensor.unsafeTranspose
36 | 
37 |   db = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
38 | 
39 | proc linear_backward*[T](
40 |         gradient: Tensor[T],
41 |         cached_tensor,
42 |         weight: Tensor[T],
43 |         dW: var Tensor[T]): Tensor[T] {.inline.} =
44 |   result = weight.unsafeTranspose * gradient
45 |   dW += gradient * cached_tensor.unsafeTranspose
46 | 
47 | 


--------------------------------------------------------------------------------
/src/nn/optimizers/optimizers.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer, typetraits
16 | 
17 | type
18 |   Optimizer*[T] = ref object {.inheritable.}
19 |     # Base class for optimizer
20 |     params*: seq[Variable[Tensor[T]]] # Todo: we can't specify a collection of generic types like AnyTensor currently
21 |     lr*: T # Learning rate. Gradient update are scaled by the learning rate
22 | 
23 | method update*(self: Optimizer) {.base.} =
24 |   # Forward for loss layers
25 |   raise newException(ValueError, "update method is not implemented for " & $self.type.name)
26 | 
27 | proc zeroGrads*(o: Optimizer) =
28 |   # Reset the gradients of the optimized params
29 |   for v in o.params:
30 |     v.grad = v.value.zeros_like
31 | 
32 | type SGD*{.final.}[T] = ref object of Optimizer[T]
33 | 
34 | proc newSGD*[T](params: varargs[Variable[Tensor[T]]], learning_rate: T): SGD[T] =
35 |   SGD[T](params: @params, lr: learning_rate)
36 | 
37 | method update*(self: SGD) =
38 |   # Update the params with formula Value -= lr * gradient
39 |   # Note: SGD expects gradient to be scaled by batchsize (done by default in Arraymancer)
40 |   for v in self.params:
41 |     v.value -= self.lr * v.grad
42 |     v.grad = v.value.zeros_like
43 | 


--------------------------------------------------------------------------------
/src/arraymancer/shortcuts.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | template at*[T](t: Tensor[T], args: varargs[untyped]): untyped =
16 |   ## Slice a Tensor and collapse singleton dimension.
17 |   ##
18 |   ## Input:
19 |   ##   - a Tensor
20 |   ##   - and:
21 |   ##     - specific coordinates (``varargs[int]``)
22 |   ##     - or a slice (cf. tutorial)
23 |   ## Returns:
24 |   ##   - a value or a view of the Tensor corresponding to the slice
25 |   ##     Singleton dimension are collapsed
26 |   ## Usage:
27 |   ##   See the ``[]`` macro
28 |   t[args].unsafeSqueeze
29 | 
30 | template unsafeAt*[T](t: Tensor[T], args: varargs[untyped]): untyped =
31 |   ## Slice a Tensor and collapse singleton dimension.
32 |   ##
33 |   ## Data is shared between input and output.
34 |   ## Input:
35 |   ##   - a Tensor
36 |   ##   - and:
37 |   ##     - specific coordinates (``varargs[int]``)
38 |   ##     - or a slice (cf. tutorial)
39 |   ## Returns:
40 |   ##   - a value or a view of the Tensor corresponding to the slice
41 |   ##     Singleton dimension are collapsed
42 |   ## Warning ⚠:
43 |   ##   This is a no-copy operation, data is shared with the input.
44 |   ##   This proc does not guarantee that a ``let`` value is immutable.
45 |   ## Usage:
46 |   ##   See the ``[]`` macro
47 |   t.unsafeSlice(args).unsafeSqueeze
48 | 


--------------------------------------------------------------------------------
/tests/autograd/test_gate_blas.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/[arraymancer, arraymancer_ag]
16 | import unittest, sequtils
17 | 
18 | # # Differentiating through matmul:
19 | # # See http://cs231n.stanford.edu/vecDerivs.pdf
20 | # # And: https://danieltakeshi.github.io/2017/01/21/understanding-higher-order-local-gradient-computation-for-backpropagation-in-deep-neural-networks/
21 | # # And: http://cs231n.stanford.edu/handouts/linear-backprop.pdf
22 | 
23 | # # If base op is C = X * W
24 | # ∂C/∂X = previous_gradient * W.transpose
25 | # ∂C/∂W = X.transpose * previous_gradient
26 | 
27 | # # If base op is C = W * X (our case)
28 | # ∂C/∂X = W.transpose * previous_gradient
29 | # ∂C/∂W = previous_gradient * X.transpose
30 | 
31 | suite "Autograd of basic operations":
32 |   test "Gradient of matrix multiplication":
33 | 
34 |     let W = toSeq(1..8).toTensor.reshape(2,4).astype(float32)
35 |     let X = toSeq(11..22).toTensor.reshape(4,3).astype(float32)
36 | 
37 |     let ctx = newContext Tensor[float32]
38 | 
39 |     let w_ag = ctx.variable(W)
40 |     let x_ag = ctx.variable(X)
41 | 
42 |     let C = w_ag * x_ag
43 | 
44 |     C.backprop
45 | 
46 |     let grad_C = ones[float32](2,3)
47 |     check: w_ag.grad == grad_C * X.transpose
48 |     check: x_ag.grad == W.transpose * grad_C
49 | 


--------------------------------------------------------------------------------
/src/arraymancer/init_cpu_deprecated_0_2_0.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | proc newTensor*(shape: openarray[int], T: typedesc): Tensor[T] {.noSideEffect, inline, deprecated.} =
17 |   ## Creates a new Tensor on Cpu backend
18 |   ## Input:
19 |   ##      - Shape of the Tensor
20 |   ##      - Type of its elements
21 |   ## Result:
22 |   ##      - A Tensor of the proper shape initialized with
23 |   ##        the default type value (0 for numeric types) on Cpu backend
24 |   tensorCpu(shape, result)
25 |   result.data = newSeq[T](result.size)
26 | 
27 | proc zeros*[T: SomeNumber](shape: openarray[int], typ: typedesc[T]): Tensor[T] {.noSideEffect, inline, deprecated.} =
28 |   ## Creates a new Tensor filled with 0
29 |   ##
30 |   ## Input:
31 |   ##      - Shape of the Tensor
32 |   ##      - Type of its elements
33 |   ## Result:
34 |   ##      - A zero-ed Tensor of the input shape on backend Cpu
35 |   tensorCpu(shape, result)
36 |   result.data = newSeq[T](result.size)
37 | 
38 | 
39 | proc ones*[T: SomeNumber](shape: openarray[int], typ: typedesc[T]): Tensor[T] {.noSideEffect,inline, deprecated.} =
40 |   ## Creates a new Tensor filled with 1
41 |   ## Input:
42 |   ##      - Shape of the Tensor
43 |   ##      - Type of its elements
44 |   ## Result:
45 |   ##      - A one-ed Tensor of the same shape
46 |   tensorCpu(shape, result)
47 |   result.data = newSeqWith(result.size, 1.T)


--------------------------------------------------------------------------------
/src/arraymancer/utils/nested_containers.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # Tools to manipulate deep nested containers
17 | 
18 | proc shape[T: not char](s: openarray[T], parent_shape: seq[int] = @[]): seq[int] {.noSideEffect.}=
19 |   ## Helper function to get the shape of nested arrays/sequences
20 |   ## C convention. Last index is the fastest changing (columns in 2D, depth in 3D) - Rows (slowest), Columns, Depth (fastest)
21 |   ## The second argument "shape" is used for recursive call on nested arrays/sequences
22 |   # Dimension check is using only the first nested element so further checking
23 |   # must be one to confirm that the total number of elements match the shape.
24 |   result = parent_shape & s.len
25 |   when (T is seq|array):
26 |     result = shape(s[0], result)
27 | 
28 | iterator flatIter(s: string): string {.noSideEffect.} =
29 |   yield s
30 | 
31 | iterator flatIter[T: not char](s: openarray[T]): auto {.noSideEffect.}=
32 |   ## Inline iterator on any-depth seq or array
33 |   ## Returns values in order
34 |   for item in s:
35 |     when item is array|seq:
36 |       for subitem in flatIter(item):
37 |         yield subitem
38 |     else:
39 |       yield item
40 | 
41 | 
42 | proc shape(s: string|seq[char], parent_shape: seq[int] = @[]): seq[int] {.noSideEffect.}=
43 |   ## Handle char / string
44 |   if parent_shape == @[]:
45 |     return @[1]
46 |   else: return parent_shape
47 | 
48 | 


--------------------------------------------------------------------------------
/src/nn/loss/sigmoid_cross_entropy.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils
16 | import ../../arraymancer_nn_primitives
17 | 
18 | import ./loss
19 | 
20 | type SigmoidCrossEntropyLoss* {.final.} [TT] = ref object of Loss[TT]
21 |   cache: Variable[TT]
22 |   # arity, from Gate
23 |   # target, from Loss
24 | 
25 | method forward*[TT](self: SigmoidCrossEntropyLoss[TT], a: Variable[TT], target: TT): Variable[TT] {.inline, locks:0.}=
26 |   new result
27 | 
28 |   result.tape = a.tape
29 |   # We expect a in shape @[features, batch_size]
30 |   result.value = [sigmoid_cross_entropy(a.value, target)].toTensor
31 | 
32 |   result.grad = zeros[getSubType(TT)](1)
33 | 
34 | 
35 | method backward*[TT](self: SigmoidCrossEntropyLoss[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
36 |   result[0] = sigmoid_cross_entropy_backward(gradient, self.cache.value, self.target)
37 | 
38 | proc sigmoid_cross_entropy*[TT](a: Variable[TT], target: TT): Variable[TT] =
39 |   # Gate
40 |   var gate: SigmoidCrossEntropyLoss[TT]
41 |   new gate
42 |   gate.arity = 1
43 |   gate.cache = a
44 |   gate.target = target
45 | 
46 |   # Node
47 |   var node: Node[TT]
48 |   new node
49 | 
50 |   node.gate = gate
51 |   node.parents[0] = a
52 | 
53 |   a.tape.push(node)
54 | 
55 |   # Resulting var
56 |   result = gate.forward(a, target)
57 |   result.ancestor = node
58 |   node.child = result


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Copied from https://github.com/nim-lang/Nim/wiki/TravisCI
 2 | language: c
 3 | 
 4 | cache: ccache
 5 | 
 6 | matrix:
 7 |   include:
 8 |     # Build and test against the master (stable) and devel branches of Nim
 9 |     # Build and test using both gcc and clang
10 |     - os: linux
11 |       env: CHANNEL=stable
12 |       compiler: gcc
13 | 
14 |     - os: linux
15 |       env: CHANNEL=devel
16 |       compiler: gcc
17 | 
18 |     # For faster testing we don't test clang on linux, only on macOS
19 |     # - os: linux
20 |     #   env: CHANNEL=stable
21 |     #   compiler: clang
22 |     #
23 |     # - os: linux
24 |     #   env: CHANNEL=devel
25 |     #   compiler: clang
26 | 
27 |     # On OSX we only test against clang (gcc is mapped to clang by default)
28 |     # Note: for OpenMP, Homebrew will build flame/blis with GCC-5
29 |     - os: osx
30 |       env: CHANNEL=stable BLIS=true
31 |       compiler: clang
32 | 
33 |     # For faster testing, we only test BLIS = true
34 |     # - os: osx
35 |     #   env: CHANNEL=stable BLIS=false
36 |     #   compiler: clang
37 | 
38 |   allow_failures:
39 |     # Ignore failures when building against the devel Nim branch
40 |     - env: CHANNEL=devel
41 |   fast_finish: true
42 | 
43 | addons:
44 |   apt:
45 |     packages:
46 |       # On Linux we need OpenBLAS, on OSX Apple Accelerate is present by default
47 |       - libopenblas-dev
48 | 
49 | before_install:
50 |   # On MacOS flame/blis can be tested as it is an homebrew package
51 |   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update          ; fi
52 |   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew install homebrew/science/blis; fi
53 | 
54 | install:
55 |   - export CHOOSENIM_NO_ANALYTICS=1
56 |   - curl https://nim-lang.org/choosenim/init.sh -sSf > init.sh
57 |   - sh init.sh -y
58 |   - export PATH=~/.nimble/bin:$PATH
59 |   - echo "export PATH=~/.nimble/bin:$PATH" >> ~/.profile
60 |   - choosenim $CHANNEL
61 | 
62 | script:
63 |     - nimble refresh
64 |     - nimble test
65 | 
66 | branches:
67 |   except:
68 |     - gh-pages
69 | 


--------------------------------------------------------------------------------
/src/autograd/gates_reduce.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./autograd, ../arraymancer, ./utils, sequtils
16 | 
17 | type MeanGate* {.final.} [TT] = ref object of Gate[TT]
18 |   ## TODO: generalize to C <- alpha AB + C
19 |   a_shape: seq[int]
20 | 
21 | method forward*[TT](self: MeanGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
22 |   new result
23 | 
24 |   result.tape = a.tape
25 |   result.value = [a.value.mean].toTensor
26 | 
27 |   result.grad = zeros[getSubType(TT)](1)
28 | 
29 | 
30 | method backward*[TT](self: MeanGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
31 |   result[0] = gradient / getSubType(TT)(self.a_shape.product) # Conversion to subtype T, oh Higher kinded-types ...
32 | 
33 |   let z_shape = newSeqWith(self.a_shape.len, 1) # We create a shape of 1 dimension that we will expand with broadcast
34 |   result[0] = result[0].unsafeReshape(z_shape).unsafeBroadcast(self.a_shape)
35 | 
36 | proc mean*[TT](a: Variable[TT]): Variable[TT] =
37 |   when compileOption("boundChecks"):
38 |     check_ctx(a, b)
39 | 
40 |   # Gate
41 |   var gate: MeanGate[TT]
42 |   new gate
43 |   gate.arity = 1
44 |   gate.a_shape = a.value.shape # TODO use ref to avoid copy
45 | 
46 |   # Node
47 |   var node: Node[TT]
48 |   new node
49 | 
50 |   node.gate = gate
51 |   node.parents[0] = a
52 | 
53 |   a.tape.push(node)
54 | 
55 |   # Resulting var
56 |   result = gate.forward(a)
57 |   result.ancestor = node
58 |   node.child = result


--------------------------------------------------------------------------------
/src/arraymancer/global_config.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | 
17 | 
18 | # This configures the maximum number of dimensions supported by Arraymancer
19 | # It should improve performance on Cuda and for iterator by storing temporary shape/strides
20 | # that will be used extensively in the loop on the stack.
21 | # For now this is only partly implemented and only on Cuda temporary shape/strides arrays.
22 | const MAXRANK = 8 # 8 because it's a nice number, more is possible upon request.
23 | 
24 | 
25 | const CUDA_HOF_TPB {.used.}: cint = 32 * 32 # TODO, benchmark and move that to cuda global config
26 |                                    # Pascal GTX 1070+ have 1024 threads max
27 | const CUDA_HOF_BPG {.used.}: cint = 256     # should be (grid-stride+threadsPerBlock-1) div threadsPerBlock ?
28 |                                    # From https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
29 |                                    # Lower allows threads re-use and limit overhead of thread creation/destruction
30 | 
31 | 
32 | const OMP_FOR_THRESHOLD = 1000    # Tensor number of elements threshold before using OpenMP multithreading
33 | 
34 | # Full procesor optimization (AVX, AVX2, ARM neon, ... if applicable)
35 | when defined(native):
36 |   {.passC: "-march=native".}
37 | 
38 | # Note: Following https://github.com/mratsim/Arraymancer/issues/61 and
39 | # https://github.com/mratsim/Arraymancer/issues/43
40 | # Arraymancer export '_' for slicing (type is SteppedSlice)
41 | # '_' is configured in accessors_slicer


--------------------------------------------------------------------------------
/benchmarks/ex01_xor.nim:
--------------------------------------------------------------------------------
 1 | import ../src/arraymancer_nn, ../src/arraymancer_ag, ../src/arraymancer
 2 | 
 3 | let ctx = newContext Tensor[float32]
 4 | 
 5 | let bsz = 32 #batch size
 6 | 
 7 | # We will create a tensor of size 3200 --> 100 batch sizes of 32
 8 | # We create it as int between [0, 2[ (2 excluded) and convert to bool
 9 | let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool)
10 | 
11 | # Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
12 | proc xor_alt[T](x,y: T): T =
13 |   ## xor is builtin and cannot be passed to map as is
14 |   x xor y
15 | 
16 | let y_bool = map2(x_train_bool[_,0], xor_alt, x_train_bool[_,1])
17 | 
18 | 
19 | # Convert to float and transpose so batch_size is last
20 | let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
21 | let y = y_bool.astype(float32).transpose
22 | 
23 | # First hidden layer of 3 neurons, with 2 features in
24 | # We initialize with random weights between -1 and 1
25 | let layer_3neurons = ctx.variable(
26 |                       randomTensor(3, 2, 2.0f) .- 1.0f
27 |                       )
28 | 
29 | # Classifier layer with 1 neuron per feature. (In our case only one neuron overall)
30 | # We initialize with random weights between -1 and 1
31 | let classifier_layer = ctx.variable(
32 |                   randomTensor(1, 3, 2.0f) .- 1.0f
33 |                   )
34 | 
35 | # Stochastic Gradient Descent
36 | let optim = newSGD[float32](
37 |   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
38 | )
39 | 
40 | for epoch in 0..100:
41 | 
42 |   for batch_id in 0..<100:
43 | 
44 |     # offset in the Tensor (Remember, batch size is last)
45 |     let offset = batch_id * 32
46 |     let x = x_train[_, offset ..< offset + 32]
47 |     let target = y[_, offset ..< offset + 32]
48 | 
49 |     # Building the network
50 |     let n1 = linear(x, layer_3neurons)
51 |     let n1_act = n1.relu
52 |     let n2 = linear(n1_act, classifier_layer)
53 |     let loss = sigmoid_cross_entropy(n2, target)
54 | 
55 |     # Compute the gradient (i.e. contribution of each parameter to the loss)
56 |     loss.backprop()
57 | 
58 |     # Correct the weights now that we have the gradient information
59 |     optim.update()


--------------------------------------------------------------------------------
/src/autograd/gates_basic.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # By convention a is the LHS (left-hand side)
16 | # b is the rhs (right-hand side)
17 | 
18 | import ./autograd, ../arraymancer, ./utils
19 | 
20 | type AddGate* {.final.} [TT] = ref object of Gate[TT]
21 |   ab_shape: seq[int]
22 | 
23 | method forward*[TT](self: AddGate[TT], a, b: Variable[TT]): Variable[TT] {.inline, locks:0.}=
24 |   new result
25 | 
26 |   result.tape = a.tape
27 |   result.value = a.value + b.value
28 | 
29 |   ## Unfortunately using broadcasts to save memory doesn't work
30 |   # let z_shape = newSeqWith(result.value.rank, 1) # We create a shape of 1 dimension that we will expand with broadcast
31 |   # let z = zeros[getSubType(TT)](z_shape)
32 |   # result.grad = z.unsafeBroadcast(result.value.shape) # to save memory, we allocate as low as possible
33 | 
34 |   result.grad = zeros[getSubType(TT)](result.value.shape)
35 | 
36 | method backward*[TT](self: AddGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
37 |   result[0] = gradient
38 |   result[1] = gradient
39 | 
40 | proc `+`*[TT](a, b: Variable[TT]): Variable[TT] =
41 |   when compileOption("boundChecks"):
42 |     check_ctx(a, b)
43 | 
44 |   # Gate
45 |   var gate: AddGate[TT]
46 |   new gate
47 |   gate.arity = 2
48 |   gate.ab_shape = a.value.shape # Shape equality will be checked in the forward proc
49 | 
50 |   # Node
51 |   var node: Node[TT]
52 |   new node
53 | 
54 |   node.gate = gate
55 |   node.parents[0] = a
56 |   node.parents[1] = b
57 | 
58 |   a.tape.push(node)
59 | 
60 |   # Resulting var
61 |   result = gate.forward(a, b)
62 |   result.ancestor = node
63 |   node.child = result
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/src/autograd/gates_blas.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ./autograd, ../arraymancer, ./utils
16 | 
17 | type MatMulGate* {.final.} [TT] = ref object of Gate[TT]
18 |   ## TODO: generalize to C <- alpha AB + C
19 |   a: Variable[TT]
20 |   b: Variable[TT]
21 | 
22 | method forward*[TT](self: MatMulGate[TT], a, b: Variable[TT]): Variable[TT] {.inline, locks:0.}=
23 |   new result
24 | 
25 |   result.tape = a.tape
26 |   result.value = a.value * b.value
27 | 
28 |   ## Unfortunately using broadcasts to save memory doesn't work
29 |   # let z_shape = newSeqWith(result.value.rank, 1) # We create a shape of 1 dimension that we will expand with broadcast
30 |   # let z = zeros[getSubType(TT)](z_shape)
31 |   # result.grad = z.unsafeBroadcast(result.value.shape) # to save memory, we allocate as low as possible
32 | 
33 |   result.grad = zeros[getSubType(TT)](result.value.shape)
34 | 
35 | method backward*[TT](self: MatMulGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
36 |   result[0] = gradient * self.b.value.unsafeTranspose
37 |   result[1] = self.a.value.unsafeTranspose * gradient
38 | 
39 | proc `*`*[TT](a, b: Variable[TT]): Variable[TT] =
40 |   when compileOption("boundChecks"):
41 |     check_ctx(a, b)
42 | 
43 |   # Gate
44 |   var gate: MatMulGate[TT]
45 |   new gate
46 |   gate.arity = 2
47 |   gate.a = a # TODO use ref to avoid copy
48 |   gate.b = b
49 | 
50 |   # Node
51 |   var node: Node[TT]
52 |   new node
53 | 
54 |   node.gate = gate
55 |   node.parents[0] = a
56 |   node.parents[1] = b
57 | 
58 |   a.tape.push(node)
59 | 
60 |   # Resulting var
61 |   result = gate.forward(a, b)
62 |   result.ancestor = node
63 |   node.child = result


--------------------------------------------------------------------------------
/src/arraymancer/backend/openmp.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | when defined(openmp):
16 |   {.passC: "-fopenmp".}
17 |   {.passL: "-fopenmp".}
18 | 
19 |   {. pragma: omp, header:"omp.h" .}
20 | 
21 |   proc omp_set_num_threads*(x: cint) {.omp.}
22 |   proc omp_get_num_threads*(): cint {.omp.}
23 |   proc omp_get_max_threads*(): cint {.omp.}
24 |   proc omp_get_thread_num*(): cint {.omp.}
25 | 
26 | else:
27 |   template omp_set_num_threads*(x: cint) = discard
28 |   template omp_get_num_threads*(): cint = 1
29 |   template omp_get_max_threads*(): cint = 1
30 |   template omp_get_thread_num*(): cint = 0
31 | 
32 | const OMP_FOR_ANNOTATION = "if(ompsize > " & $OMP_FOR_THRESHOLD & ")"
33 | 
34 | template omp_parallel_countup*(i: untyped, size: Natural, body: untyped): untyped =
35 |   let ompsize = size
36 |   for i in `||`(0, ompsize, OMP_FOR_ANNOTATION):
37 |     body
38 | 
39 | template omp_parallel_forup*(i: untyped, start, size: Natural, body: untyped): untyped =
40 |   let ompsize = size
41 |   for i in `||`(start, ompsize, OMP_FOR_ANNOTATION):
42 |     body
43 | 
44 | template omp_parallel_blocks*(block_offset, block_size: untyped, size: Natural, body: untyped): untyped =
45 |   block ompblocks:
46 |     when defined(openmp):
47 |       if size >= OMP_FOR_THRESHOLD:
48 |         let omp_num_threads = omp_get_max_threads()
49 |         if size >= omp_num_threads:
50 |           let bsize = size div omp_num_threads
51 |           for j in 0||(omp_num_threads-1):
52 |             let block_offset = bsize*j
53 |             let block_size = if j < omp_num_threads-1: bsize else: size - block_offset
54 |             block:
55 |               body
56 |           break ompblocks
57 |     let block_offset = 0
58 |     let block_size = size
59 |     block:
60 |       body
61 | 


--------------------------------------------------------------------------------
/tests/tensors/test_ufunc_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import math, unittest
17 | 
18 | suite "Universal functions":
19 |   test "Common math functions are exported":
20 |     let a = @[@[1.0,2,3],@[4.0,5,6]]
21 |     let b = @[@[7.0, 8],@[9.0, 10],@[11.0, 12]]
22 | 
23 |     let ta = a.toTensor(Cpu)
24 |     let tb = b.toTensor(Cpu)
25 | 
26 |     let expected_a = @[@[cos(1'f64),cos(2'f64),cos(3'f64)],@[cos(4'f64),cos(5'f64),cos(6'f64)]]
27 |     let expected_b = @[@[ln(7'f64), ln(8'f64)],@[ln(9'f64), ln(10'f64)],@[ln(11'f64), ln(12'f64)]]
28 | 
29 |     check: cos(ta) == expected_a.toTensor(Cpu)
30 |     check: ln(tb) == expected_b.toTensor(Cpu)
31 | 
32 |   test "Creating custom universal functions is supported":
33 |     proc square_plus_one(x: int): int = x ^ 2 + 1
34 |     makeUniversalLocal(square_plus_one)
35 | 
36 |     let c = @[@[2,4,8],@[3,9,27]]
37 |     let tc = c.toTensor(Cpu)
38 | 
39 |     let expected_c = @[@[5, 17, 65],@[10, 82, 730]]
40 | 
41 |     check: square_plus_one(tc) == expected_c.toTensor(Cpu)
42 | 
43 |   ## MakeUniversal cannot change Tensor[B,T] to Tensor[B,U] for now
44 |   ## fmap must be used instead
45 |   test "Universal functions that change types are supported":
46 |     let d = @[@[2,4,8],@[3,9,27]]
47 |     let e = @[@["2","4","8"],@["3","9","27"]]
48 | 
49 |     proc stringify(n: int): string = $n
50 |     # makeUniversalLocal(stringify)
51 | 
52 |     let td = d.toTensor(Cpu)
53 |     let te = e.toTensor(Cpu)
54 | 
55 |     when compiles (td == te): check: false
56 | 
57 |     check: td.fmap(stringify) == te
58 |     check: td.fmap(stringify)[0,1] == "4"
59 | 
60 |     when compileOption("boundChecks"):
61 |       expect(IndexError):
62 |         discard td.fmap(stringify)[1,3]
63 |     else:
64 |       echo "Bound-checking is disabled. The incorrect seq shape test has been skipped."
65 | 


--------------------------------------------------------------------------------
/src/arraymancer/fallback/blas_l3_gemm_macro_kernel.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc gemm_macro_kernel[T](mc, nc, kc: int,
16 |                           alpha: T,
17 |                           beta: T,
18 |                           C: var seq[T], offC: int,
19 |                           incRowC, incColC: int,
20 |                           buffer_A: var ref array[MCKC, T],
21 |                           buffer_B: var ref array[KCNC, T],
22 |                           buffer_C: var ref array[MRNR, T])
23 |                           {.noSideEffect.} =
24 |   let mp = (mc+MR-1) div MR
25 |   let np = (nc+NR-1) div NR
26 | 
27 |   let mod_mr = mc mod MR
28 |   let mod_nr = nc mod NR
29 | 
30 |   var mr: int
31 |   var nr: int
32 | 
33 |   for j in 0 ..< np:
34 |     nr = if (j != np-1 or mod_nr == 0): NR
35 |          else: mod_nr
36 |     for i in 0 ..< mp:
37 |       mr = if (i != mp-1 or mod_mr == 0): MR
38 |            else: mod_mr
39 | 
40 |       if (mr==MR and nr==NR):
41 |         gemm_micro_kernel(kc, alpha,
42 |                           buffer_A, i*kc*MR,
43 |                           buffer_B, j*kc*NR,
44 |                           beta,
45 |                           C, i*MR*incRowC+j*NR*incColC + offC,
46 |                           incRowC, incColC)
47 |       else:
48 |         gemm_micro_kernel(kc, alpha,
49 |                           buffer_A, i*kc*MR,
50 |                           buffer_B, j*kc*NR,
51 |                           0.T,
52 |                           buffer_C, 0,
53 |                           1, MR)
54 |         gescal( mr, nr, beta,
55 |                 C, i*MR*incRowC+j*NR*incColC + offC,
56 |                 incRowC, incColC)
57 |         geaxpy( mr, nr,
58 |                 1.T,
59 |                 buffer_C,
60 |                 1, MR,
61 |                 C, i*MR*incRowC+j*NR*incColC + offC,
62 |                 incRowC, incColC)
63 | 


--------------------------------------------------------------------------------
/src/arraymancer/fallback/blas_l3_gemm_packing.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc pack_panel[T, N](k: int,
16 |                       M: seq[T], offset: int, # Tensor data + offset
17 |                       lsm, ssm: int, # Leading and secondary (dimension) stride of M, Leading: incColA/incRowB.
18 |                       LR: static[int], # Leading block dimension, MR for A (MxK), NR for B (KxN)
19 |                       buffer: var ref array[N, T], # N = MCKC for A, KCNC for B
20 |                       offBuf: var int) {.noSideEffect.} =
21 |   ## Pack blocks of size LR of the matrices in the corresponding buffer
22 |   var offM = offset
23 |   for s in 0 ..< k: # Loop along the leaing dimension
24 |     for lead in 0 ..< LR:
25 |       buffer[lead + offBuf] = M[lead*lsm + offM]
26 |     offBuf += LR
27 |     offM += ssm
28 | 
29 | proc pack_dim[T, N](lc, kc: int, # lc = mc for A (MxK matrix) and lc = nc for B (KxN matrix)
30 |                     M: seq[T], offset: int, # Tensor data + offset
31 |                     lsm, ssm: int, # Leading and secondary (dimension) stride of M, Leading: incColA/incRowB.
32 |                     LR: static[int], # Leading block dimension, MR for A (MxK), NR for B (KxN)
33 |                     buffer: var ref array[N, T]) # N = MCKC for A, KCNC for B
34 |                     {.noSideEffect.} =
35 | 
36 |   let lp = lc div LR # Number of whole blocks along leading dim
37 |   let lr = lc mod LR # Reminder of leading dim
38 | 
39 |   var offBuf = 0
40 |   var offM = offset
41 | 
42 |   for lead in 0..<lp:
43 |     pack_panel(kc, M, offM, lsm, ssm, LR, buffer, offBuf)
44 |     offM  += LR*lsm
45 | 
46 |   if lr > 0:
47 |     for s in 0 ..< kc:
48 |       for lead in 0 ..< lr:
49 |         buffer[lead + offBuf] = M[lead * lsm + offM]
50 |       for lead in lr ..< LR:
51 |         buffer[lead + offBuf] = 0.T
52 |       offBuf += LR
53 |       offM   += ssm


--------------------------------------------------------------------------------
/tests/tensors/test_init_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math, sequtils
17 | 
18 | suite "Creating a new Tensor":
19 |   test "Creating from sequence":
20 |     let t1 = @[1,2,3].toTensor(Cpu)
21 |     check: t1.shape == @[3]
22 |     check: t1.rank == 1
23 | 
24 |     const
25 |       a = @[1, 2, 3, 4, 5]
26 |       b = @[1, 2, 3, 4, 5]
27 | 
28 |     var
29 |       vandermonde: seq[seq[int]]
30 |       row: seq[int]
31 | 
32 |     vandermonde = newSeq[seq[int]]()
33 | 
34 |     for i, aa in a:
35 |       row = newSeq[int]()
36 |       vandermonde.add(row)
37 |       for j, bb in b:
38 |         vandermonde[i].add(aa^bb)
39 |     
40 |     let t2 = vandermonde.toTensor(Cpu)
41 |     check: t2.rank == 2
42 |     check: t2.shape == @[5, 5]
43 | 
44 |     let nest3 = @[
45 |             @[
46 |               @[1,2,3],
47 |               @[1,2,3]
48 |             ],
49 |             @[
50 |               @[3,2,1],
51 |               @[3,2,1]
52 |             ],
53 |             @[
54 |               @[4,4,5],
55 |               @[4,4,4]
56 |             ],
57 |             @[
58 |               @[6,6,6],
59 |               @[6,6,6]
60 |             ]
61 |           ]
62 |     
63 |     let t3 = nest3.toTensor(Cpu)
64 |     check: t3.rank == 3
65 |     check: t3.shape == @[4, 2, 3]  # 4 rows, 2 cols, 3 depth. depth indices moves the fastest. Same scheme as Numpy.
66 | 
67 |     let u = @[@[1.0, -1, 2],@[0.0, -1]]
68 |     expect(IndexError):
69 |       discard u.toTensor(Cpu)
70 | 
71 |   test "Check that Tensor shape is in row-by-column order":
72 |     let s = @[@[1,2,3],@[3,2,1]]
73 |     let t = s.toTensor(Cpu)
74 |     
75 |     check: t.shape == @[2,3]
76 | 
77 |     let u = newTensor(@[2,3], int, Cpu)
78 |     check: u.shape == @[2,3]
79 | 
80 |     check: u.shape == t.shape
81 | 
82 |   # TODO add tests for zeros, ones and randomTensor


--------------------------------------------------------------------------------
/src/arraymancer/utils/functional.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Functional programming and iterator tooling
16 | 
17 | template scanr[T](s: seq[T], operation: untyped): untyped =
18 |   ## Template to scan a sequence from right to left, returning the accumulation and intermediate values.
19 |   ## This is a foldr with intermediate steps returned
20 | 
21 |   ## @[2, 2, 3, 4].scanr(a * b) = @[48, 24, 12, 4]
22 |   let len = s.len
23 | 
24 |   assert len > 0, "Can't scan empty sequences"
25 |   var result = newSeq[T](len)
26 | 
27 |   result[result.high] = s[s.high]
28 |   for i in countdown(len - 1, 1):
29 |     let
30 |       a {.inject.} = s[i-1]
31 |       b {.inject.} = result[i]
32 |     result[i-1] = operation
33 |   result
34 | 
35 | template scanl[T](s: seq[T], operation: untyped): untyped =
36 |   ## Template to scan a sequence from left to right, returning the accumulation and intermediate values.
37 |   ## This is a foldl with intermediate steps returned
38 | 
39 |   ## @[2, 2, 3, 4].scanl(a * b) = @[2, 4, 12, 48]
40 |   let len = s.len
41 | 
42 |   assert len > 0, "Can't scan empty sequences"
43 |   var result = newSeq[T](len)
44 | 
45 |   result[0] = s[0]
46 |   for i in 1..s.high:
47 |     let
48 |       a {.inject.} = s[i]
49 |       b {.inject.} = result[i-1]
50 |     result[i] = operation
51 |   result
52 | 
53 | iterator zip[T1, T2](a: openarray[T1], b: openarray[T2]): (T1,T2) {.noSideEffect.} =
54 |   ## Transform two lists in a list of tuples.
55 |   ## Length of result will be the length of the smallest list, items from the longest will be discarded.
56 |   let len = min(a.len, b.len)
57 | 
58 |   for i in 0..<len:
59 |     yield (a[i], b[i])
60 | 
61 | template product[T: SomeNumber](s: seq[T]): T =
62 |   ## Get the product of all numbers in a sequence or array
63 |   s.foldl(a*b)
64 | 
65 | proc concatMap[T](s: seq[T], f: proc(ss: T):string): string  {.noSideEffect.}=
66 |   ## Map a function to a sequence of T and concatenate the result as string
67 |   return s.foldl(a & f(b), "")


--------------------------------------------------------------------------------
/tests/tensors/test_accessors_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math
17 | 
18 | 
19 | suite "Accessing and setting tensor values":
20 |   test "Accessing and setting a single value":
21 |     var a = zeros(@[2,3,4], int, Backend.Cpu)
22 |     a[1,2,2] = 122
23 |     check: a[1,2,2] == 122
24 | 
25 |     var b = zeros(@[3,4], int, Backend.Cpu)
26 |     b[1,2] = 12
27 |     check: b[1,2] == 12
28 |     b[0,0] = 999
29 |     check: b[0,0] == 999
30 |     b[2,3] = 111
31 |     check: b[2,3] == 111
32 | 
33 | 
34 |   test "Out of bounds checking":
35 |     var a = newTensor(@[2,3,4], int, Backend.Cpu)
36 |     expect(IndexError):
37 |       a[2,0,0] = 200
38 |     var b = newTensor(@[3,4], int, Backend.Cpu)
39 |     expect(IndexError):
40 |       b[3,4] = 999
41 |     expect(IndexError):
42 |       discard b[-1,0]
43 |     expect(IndexError):
44 |       discard b[0,-2]
45 | 
46 |   test "Iterators":
47 |     const
48 |       a = @[1, 2, 3, 4, 5]
49 |       b = @[1, 2, 3]
50 |     var
51 |       vd: seq[seq[int]]
52 |       row: seq[int]
53 |     vd = newSeq[seq[int]]()
54 |     for i, aa in a:
55 |       row = newSeq[int]()
56 |       vd.add(row)
57 |       for j, bb in b:
58 |         vd[i].add(aa^bb)
59 | 
60 |     let nda_vd = vd.toTensor(Cpu)
61 | 
62 |     let expected_seq = @[1,1,1,2,4,8,3,9,27,4,16,64,5,25,125]
63 | 
64 |     var seq_val: seq[int] = @[]
65 |     for i in nda_vd:
66 |       seq_val.add(i)
67 | 
68 |     check: seq_val == expected_seq
69 | 
70 |     var seq_validx: seq[tuple[idx: seq[int], val: int]] = @[]
71 |     for i,j in nda_vd:
72 |       seq_validx.add((i,j))
73 |     
74 |     check: seq_validx[0] == (@[0,0], 1)
75 |     check: seq_validx[10] == (@[3,1], 16)
76 | 
77 |     let t_nda = transpose(nda_vd)
78 | 
79 |     var seq_transpose: seq[tuple[idx: seq[int], val: int]] = @[]
80 |     for i,j in t_nda:
81 |       seq_transpose.add((i,j))
82 |     
83 |     check: seq_transpose[0] == (@[0,0], 1)
84 |     check: seq_transpose[8] == (@[1,3], 16)


--------------------------------------------------------------------------------
/src/arraymancer/accessors_cuda.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | 
17 | # For element-wise operations, instead of a sequential iterator like for CPU,
18 | # it will be faster to have many threads compute the index -> offset and update
19 | # the data at this offset.
20 | #
21 | # For this we need:
22 | #   - to store strides and offset on the cuda device to avoid copies
23 | #   - a way to convert element #10 of the tensor to the real offset (column major),
24 | #     the kernels won't use tensor[2,5] as an index
25 | 
26 | 
27 | proc getIndexOfElementID[T](t: Tensor[T], element_id: int): int {.noSideEffect,used.} =
28 |   ## Convert "Give me element 10" to the real index/memory offset.
29 |   ## Reference Nim CPU version
30 |   ## This is not meant to be used on serial architecture due to the division overhead.
31 |   ## On GPU however it will allow threads to address the real memory addresses independantly.
32 | 
33 |   when compileOption("boundChecks"):
34 |     assert element_id < t.size
35 | 
36 |   result = t.offset
37 |   var currentOffset = element_id
38 |   var dimIdx: int
39 | 
40 |   for k in countdown(t.rank - 1,0):
41 |     ## hopefully the compiler doesn't do division twice ...
42 |     dimIdx = currentOffset mod t.shape[k]
43 |     currentOffset = currentOffset div t.shape[k]
44 | 
45 |     # cf atIndex proc to compute real_idx
46 |     result += dimIdx * t.strides[k]
47 | 
48 | # Note we don't bound-checks the CUDA implementation
49 | {.emit:["""
50 |   static inline __device__ int cuda_getIndexOfElementID(
51 |     const int rank,
52 |     const int * __restrict__ shape,
53 |     const int * __restrict__ strides,
54 |     const int offset,
55 |     const int element_id) {
56 | 
57 |     int real_idx = offset;
58 |     int currentOffset = element_id;
59 |     int dimIdx = 0;
60 | 
61 |     for (int k = rank - 1; k >= 0; --k) {
62 |       dimIdx = currentOffset % shape[k];
63 |       currentOffset /= shape[k];
64 | 
65 |       real_idx += dimIdx * strides[k];
66 |     }
67 | 
68 |     return real_idx;
69 |   }
70 |   """].}


--------------------------------------------------------------------------------
/tests/tensors/test_higherorder.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math, future, sequtils
17 | 
18 | suite "Testing higher-order functions":
19 |   let t = [[0, 1, 2],
20 |           [3, 4, 5],
21 |           [6, 7, 8],
22 |           [9, 10, 11]].toTensor()
23 | 
24 |   proc customAdd[T: SomeNumber](x, y: T): T = x + y
25 | 
26 |   test "Map function":
27 | 
28 |     let t2 = [0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121].toTensor.reshape([4,3])
29 | 
30 |     check: t.map(x => x*x) == t2
31 | 
32 |   test "Apply functions - with in-place and out of place closure":
33 |     var t = toSeq(0..11).toTensor().reshape([4,3])
34 |     let t2 = toSeq(1..12).toTensor().reshape([4,3])
35 | 
36 |     var tmp1 = t
37 |     tmp1.apply(x => x+1) # out of place
38 |     check: tmp1 == t2
39 | 
40 |     var tmp2 = t[_,2]
41 | 
42 |     proc plus_one[T](x: var T) = x += 1
43 |     tmp2.apply(plus_one) # in-place
44 |     check: tmp2 == t2[_,2]
45 | 
46 |   test "Reduce function":
47 |     check: t.reduce(customAdd) == 66
48 | 
49 |     proc customConcat(x, y: string): string = x & y
50 | 
51 |     check: t.map(x => $x).reduce(customConcat) == "01234567891011"
52 | 
53 |   test "Reduce over an axis":
54 |     proc customMin[T: SomeNumber](x,y: Tensor[T]): Tensor[T] = x - y
55 | 
56 |     check: t.reduce(customMin, axis = 0) == [-18, -20, -22].toTensor.reshape([1,3])
57 | 
58 |   test "Fold with different in and result types":
59 |     proc isEven(n: int): bool =
60 |       return n mod 2 == 0
61 | 
62 |     # Check if all even
63 |     check: t.fold(true, proc(x: bool,y: int): bool = x and y.isEven) == false
64 | 
65 |     check: (t * 2).fold(true, proc(x: bool,y: int): bool = x and y.isEven) == true
66 | 
67 |   test "Fold over axis":
68 |     let col_sum_plus_1010 = [[4],
69 |                             [12],
70 |                             [22],
71 |                             [30]].toTensor()
72 | 
73 |     let initval = [1,0,1,0].toTensor.reshape([4,1])
74 | 
75 |     check: t.fold(initval, `+`, axis = 1) == col_sum_plus_1010
76 | 


--------------------------------------------------------------------------------
/src/arraymancer/term_rewriting.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | template toTensorReshapeT(oa: typed, shape: varargs[int]): untyped = 
16 |   let data = toSeq(flatIter(oa))
17 |   let seq_shape = @shape
18 | 
19 |   when compileOption("boundChecks"): check_nested_elements(seq_shape, data.len)
20 | 
21 |   var t: Tensor[type(data[0])]
22 |   tensorCpu(seq_shape, t)
23 |   shallowCopy(t.data, data)
24 |   return t
25 | 
26 | proc toTensorReshape(oa: string, shape: varargs[int]): auto {.noSideEffect.}=
27 |   ## Fuse toTensor and reshape in one operation.
28 |   ##
29 |   ## Deal specifically with strings/seq[char]
30 | 
31 |   toTensorReshapeT(oa, shape)
32 | 
33 | proc toTensorReshape(oa: openarray, shape: varargs[int], dummy_bugfix: static[int] = 0): auto {.noSideEffect.}=
34 |   ## Fuse toTensor and reshape in one operation
35 |   ##
36 |   # Dummy_bugfix param is necessary due to: https://github.com/nim-lang/Nim/issues/6343
37 |   # TODO: remove 'dummy_bugfix'
38 |   toTensorReshapeT(oa, shape)
39 | 
40 | template rewriteToTensorReshape*{reshape(toTensor(oa, dummy_bugfix), shape)}(
41 |   oa: openarray,
42 |   shape: varargs[int],
43 |   dummy_bugfix: static[int]): auto =
44 |   ## Fuse ``sequence.toTensor.reshape(new_shape)`` into a single operation.
45 |   ##
46 |   ## Operation fusion leverage the Nim compiler and should not be called explicitly.
47 |   toTensorReshape(oa, shape, dummy_bugfix)
48 | 
49 | proc unsafeToTensorReshape*[T](data: seq[T], shape: varargs[int]): Tensor[T] {.noSideEffect.} =
50 |   ## Fuse unsafeToTensor and unsafeReshape in one operation
51 | 
52 |   when compileOption("boundChecks"): check_nested_elements(@shape, data.len)
53 | 
54 |   tensorCpu(shape, result)
55 |   shallowCopy(result.data, data)
56 | 
57 | template rewriteUnsafeToTensorReshape*{unsafeReshape(unsafeToTensor(s), shape)}(
58 |   s: seq,
59 |   shape: varargs[int]): auto =
60 |   ## Fuse ``sequence.unsafeToTensor().unsafeReshape(new_shape)`` into a single operation.
61 |   ##
62 |   ## Operation fusion leverage the Nim compiler and should not be called explicitly.
63 |   unsafeToTensorReshape(s, shape, dummy_bugfix)
64 | 


--------------------------------------------------------------------------------
/benchmarks/implementation/stable_sigmoid_bench.nim:
--------------------------------------------------------------------------------
 1 | import times, ../../src/arraymancer, math
 2 | 
 3 | # The goal is to test the speed of various sigmoid implementation
 4 | # Some are numericall stable for positive, negative or both value
 5 | 
 6 | # We create a random tensor with randomly positive and negative value
 7 | let a = randomTensor(1000, 1000, 100.0f) .- 50.0f
 8 | 
 9 | proc sigmoid1[T: SomeReal](t: Tensor[T]): Tensor[T] =
10 |   # Instable for negative
11 |   proc sigmoid1_closure(x: T): T = 1.T / (1 + exp(-x))
12 |   return t.map(sigmoid1_closure)
13 | 
14 | proc sigmoid2[T: SomeReal](t: Tensor[T]): Tensor[T] =
15 |   # Instable for positive
16 |   proc sigmoid2_closure(x: T): T =
17 |     let z = exp(x)
18 |     return z / (1.T + z)
19 |   return t.map(sigmoid2_closure)
20 | 
21 | proc sigmoid3[T: SomeReal](t: Tensor[T]): Tensor[T] =
22 |   # Stable but branching in a loop
23 |   proc sigmoid3_closure(x: T): T =
24 |     if x >= 0:
25 |       return 1.T / (1 + exp(-x))
26 |     let z = exp(x)
27 |     return z / (1 + z)
28 |   return t.map(sigmoid3_closure)
29 | 
30 | proc sigmoid4*[T: SomeReal](t: Tensor[T]): Tensor[T] =
31 |   # Stable but expensive tanh
32 |   proc sigmoid4_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T)
33 |   return t.map(sigmoid4_closure)
34 | 
35 | proc sigmoid5*[T: SomeReal](t: Tensor[T]): Tensor[T] =
36 |   # Stable and probably fastest
37 |   proc sigmoid5_closure(x: T): T =
38 |     let clip_x = max(-500, -x)
39 |     return 1.T / (1 + exp(clip_x))
40 |   return t.map(sigmoid5_closure)
41 | 
42 | ## Warmup for ondemand CPU
43 | for i in 0..<1000:
44 |   discard a.sigmoid1
45 | 
46 | var start = cpuTime()
47 | for i in 0..<1000:
48 |   discard a.sigmoid1
49 | echo " Sigmoid1: 1 / (1 + exp(-x)) ", cpuTime() - start
50 | 
51 | 
52 | start = cpuTime()
53 | for i in 0..<1000:
54 |   discard a.sigmoid2
55 | echo " Sigmoid2: exp(x) / (1 + exp(x)) ", cpuTime() - start
56 | 
57 | start = cpuTime()
58 | for i in 0..<1000:
59 |   discard a.sigmoid3
60 | echo " Sigmoid3: branching ", cpuTime() - start
61 | 
62 | start = cpuTime()
63 | for i in 0..<1000:
64 |   discard a.sigmoid4
65 | echo " Sigmoid4: 0.5 * (tanh(0.5 * x) + 1) ", cpuTime() - start
66 | 
67 | start = cpuTime()
68 | for i in 0..<1000:
69 |   discard a.sigmoid5
70 | echo " Sigmoid5: 1 / (1 + exp(max(-500,-x)) ", cpuTime() - start
71 | 
72 | 
73 | # Results with -d:release on i5-5257U (dual-core mobile 2.7GHz, turbo 3.1)
74 | # Note: results vary strongly depending on your number of cores due to cpuTime methodology
75 | # Sigmoid1: 1 / (1 + exp(-x)) 8.265147999999998
76 | # Sigmoid2: exp(x) / (1 + exp(x)) 7.757116
77 | # Sigmoid3: branching 12.477108
78 | # Sigmoid4: 0.5 * (tanh(0.5 * x) + 1) 11.162277
79 | # Sigmoid5: 1 / (1 + exp(max(-500,-x)) 10.050294


--------------------------------------------------------------------------------
/tests/tensors/test_aggregate_deprecated.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, math
17 | 
18 | suite "Testing aggregation functions":
19 |   let t = [[0, 1, 2],
20 |           [3, 4, 5],
21 |           [6, 7, 8],
22 |           [9, 10, 11]].toTensor(Cpu)
23 | 
24 |   test "Sum all elements":
25 |     check: t.sum == 66
26 | 
27 |   test "Sum over axis":
28 |     let row_sum = [[18, 22, 26]].toTensor(Cpu)
29 |     let col_sum = [[3],
30 |                    [12],
31 |                    [21],
32 |                    [30]].toTensor(Cpu)
33 |     check: t.sum(axis=0) == row_sum
34 |     check: t.sum(axis=1) == col_sum
35 | 
36 |     ## TODO: 3D axis sum
37 |   test "Mean of all elements":
38 |     check: t.astype(float).mean == 5.5 # Note: may fail due to float rounding
39 | 
40 |   test "Mean over axis":
41 |     let row_mean = [[4.5, 5.5, 6.5]].toTensor(Cpu)
42 |     let col_mean = [[1.0],
43 |                    [4.0],
44 |                    [7.0],
45 |                    [10.0]].toTensor(Cpu)
46 |     check: t.astype(float).mean(axis=0) == row_mean
47 |     check: t.astype(float).mean(axis=1) == col_mean
48 | 
49 |   test "Generic aggregate functions":
50 |     # We can't pass built-ins to procvar
51 |     proc addition[T](a, b: T): T=
52 |       return a+b
53 |     proc addition_inplace[T](a: var T, b: T)=
54 |       a+=b
55 | 
56 |     check: t.agg(addition, start_val=0) == 66
57 | 
58 |     var z = 0
59 |     z.agg_inplace(addition_inplace, t)
60 |     check: z == 66
61 | 
62 |     #### Axis - `+`, `+=` for tensors are not "built-ins"
63 |     let row_sum = [[18, 22, 26]].toTensor(Cpu)
64 |     let col_sum = [[3],
65 |                    [12],
66 |                    [21],
67 |                    [30]].toTensor(Cpu)
68 | 
69 |     var z1 = zeros([1,3], int, Cpu)
70 |     var z2 = zeros([4,1], int, Cpu)
71 | 
72 |     # Start with non-inplace proc
73 |     check: t.agg(`+`, axis=0, start_val = z1 ) == row_sum
74 |     check: t.agg(`+`, axis=1, start_val = z2 ) == col_sum
75 | 
76 |     # Inplace proc
77 |     # z1.agg_inplace(`+=`, t, axis=0)
78 |     # z2.agg_inplace(`+=`, t, axis=1)
79 | 
80 |     # check: z1 == row_sum
81 |     # check: z2 == col_sum


--------------------------------------------------------------------------------
/tests/tensors/test_ufunc.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import math, unittest
17 | 
18 | suite "Universal functions":
19 |   test "As type with slicing":
20 |     let a = [1, 2, 3, 4].toTensor()
21 |     let b = a[1..2].astype(float)
22 |     check b == [2.0'f64,3.0'f64].toTensor()
23 | 
24 |   test "Common math functions are exported":
25 |     let a = @[@[1.0,2,3],@[4.0,5,6]]
26 |     let b = @[@[7.0, 8],@[9.0, 10],@[11.0, 12]]
27 | 
28 |     let ta = a.toTensor()
29 |     let tb = b.toTensor()
30 | 
31 |     let expected_a = @[@[cos(1'f64),cos(2'f64),cos(3'f64)],@[cos(4'f64),cos(5'f64),cos(6'f64)]]
32 |     let expected_b = @[@[ln(7'f64), ln(8'f64)],@[ln(9'f64), ln(10'f64)],@[ln(11'f64), ln(12'f64)]]
33 | 
34 |     check: cos(ta) == expected_a.toTensor()
35 |     check: ln(tb) == expected_b.toTensor()
36 | 
37 |   test "Creating custom universal functions is supported":
38 |     proc square_plus_one(x: int): int = x ^ 2 + 1
39 |     makeUniversalLocal(square_plus_one)
40 | 
41 |     let c = @[@[2,4,8],@[3,9,27]]
42 |     let tc = c.toTensor()
43 | 
44 |     let expected_c = @[@[5, 17, 65],@[10, 82, 730]]
45 | 
46 |     check: square_plus_one(tc) == expected_c.toTensor()
47 | 
48 |   ## MakeUniversal cannot change Tensor[B,T] to Tensor[B,U] for now
49 |   ## map must be used instead
50 |   test "Universal functions that change types are supported":
51 |     let d = @[@[2,4,8],@[3,9,27]]
52 |     let e = @[@["2","4","8"],@["3","9","27"]]
53 | 
54 |     proc stringify(n: int): string = $n
55 |     # makeUniversalLocal(stringify)
56 | 
57 |     let td = d.toTensor()
58 |     let te = e.toTensor()
59 | 
60 |     when compiles(td == te): check: false
61 | 
62 |     check: td.map(stringify) == te
63 |     check: td.map(stringify)[0,1] == "4"
64 | 
65 |     when compileOption("boundChecks"):
66 |       expect(IndexError):
67 |         discard td.map(stringify)[1,3]
68 |     else:
69 |       echo "Bound-checking is disabled. The incorrect seq shape test has been skipped."
70 | 
71 | 
72 |   test "Abs":
73 |     let a = [-2,-1,0,1,2].toTensor()
74 |     check abs(a) == [2,1,0,1,2].toTensor()
75 |     let b = [-2.0,-1,0,1,2].toTensor()
76 |     check abs(b) == [2.0,1,0,1,2].toTensor()


--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | Arraymancer v0.2.0 Sept. 24, 2017 "The Color of Magic"
 4 | ===========================================
 5 | 
 6 | I am very excited to announce the second release of Arraymancer which includes numerous improvements `blablabla` ...
 7 | 
 8 | Without further ado:
 9 | - Communauty
10 |    - There is a Gitter room!
11 | - Breaking
12 |    - `shallowCopy` is now `unsafeView` and accepts `let` arguments
13 |    - Element-wise multiplication is now `.*` instead of `|*|`
14 |    - vector dot product is now `dot` instead of `.*`
15 | - Deprecated
16 |    - All tensor initialization proc have their `Backend` parameter deprecated.
17 |    - `fmap` is now `map`
18 |    - `agg` and `agg_in_place` are now `fold` and nothing (too bad!)
19 | 
20 | - Initial support for Cuda !!!
21 |    - All linear algebra operations are supported
22 |    - Slicing (read-only) is supported
23 |    - Transforming a slice to a new contiguous Tensor is supported
24 | - Tensors
25 |    - Introduction of `unsafe` operations that works without copy: `unsafeTranspose`, `unsafeReshape`, `unsafebroadcast`, `unsafeBroadcast2`, `unsafeContiguous`, 
26 |    - Implicit broadcasting via `.+, .*, ./, .-` and their in-place equivalent `.+=, .-=, .*=, ./=`
27 |    - Several shapeshifting operations: `squeeze`, `at` and their `unsafe` version.
28 |    - New property: `size`
29 |    - Exporting: `export_tensor` and `toRawSeq`
30 |    - Reduce and reduce on axis
31 | - Ecosystem:
32 |    - I express my deep thanks to @edubart for testing Arraymancer, contributing new functions, and improving its overall performance. He built [arraymancer-demos](https://github.com/edubart/arraymancer-demos) and [arraymancer-vision](https://github.com/edubart/arraymancer-vision),check those out you can load images in Tensor and do logistic regression on those!
33 | 
34 | Also thanks to the Nim communauty on IRC/Gitter, they are a tremendous help (yes Varriount, Yardanico, Zachary, Krux).
35 | I probably would have struggled a lot more without the guidance of Andrea's code for Cuda in his [neo](https://github.com/unicredit/neo) and [nimcuda](https://github.com/unicredit/nimcuda) library. And obviously Araq and Dom for Nim which is an amazing language for performance, productivity, safety and metaprogramming.
36 | 
37 | 
38 | Minor revisions v0.1.1 to v0.1.3
39 | ================================
40 | 
41 | Arraymancer v0.1.0. July 12, 2017 "Magician Apprentice"
42 | ===========================================
43 | 
44 | First public release.
45 | 
46 | Include:
47 | 
48 | - converting from deep nested proc or array
49 | - Slicing, and slice mutation
50 | - basic linear algebra operations,
51 | - reshaping, broadcasting, concatenating,
52 | - universal functions
53 | - iterators (in-place, axis, inline and closure versions)
54 | - BLAS and BLIS support for fast linear algebra
55 | 


--------------------------------------------------------------------------------
/src/arraymancer/fallback/blas_l3_gemm_micro_kernel.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | template gemm_micro_kernelT[T](
16 |             kc: int,
17 |             alpha: T,
18 |             A: typed, offA: int,
19 |             B: typed, offB: int,
20 |             beta: T,
21 |             C: typed,
22 |             offC: int,
23 |             incRowC, incColC: int): untyped =
24 | 
25 |   {.pragma: align16, codegenDecl: "$# $# __attribute__((aligned(16)))".}
26 |   var AB{.align16.}: array[MR*NR, T]
27 |   var voffA = offA
28 |   var voffB = offB
29 | 
30 |   ## Compute A*B
31 |   for _ in 0 ..< kc:
32 |     for j in 0 ..< NR:
33 |       for i in 0 .. < MR:
34 |         AB[i + j*MR] += A[i + voffA] * B[j + voffB]
35 |     voffA += MR
36 |     voffB += NR
37 | 
38 |   ## C <- beta * C
39 |   if beta == 0.T:
40 |     for j in 0 ..< NR:
41 |       for i in 0 ..< MR:
42 |         C[i*incRowC + j*incColC + offC] = 0.T
43 |   elif beta != 1.T:
44 |     for j in 0 ..< NR:
45 |       for i in 0 ..< MR:
46 |         C[i*incRowC + j*incColC + offC] *= beta
47 | 
48 |   ## C <- C + alpha*AB, alpha !=0
49 |   if alpha == 1.T:
50 |     for j in 0 ..< NR:
51 |       for i in 0 ..< MR:
52 |         C[i*incRowC + j*incColC + offC] += AB[i + j*MR]
53 |   else:
54 |     for j in 0 ..< NR:
55 |       for i in 0 ..< MR:
56 |         C[i*incRowC + j*incColC + offC] += alpha*AB[i + j*MR]
57 | 
58 | proc gemm_micro_kernel[T](kc: int,
59 |                           alpha: T,
60 |                           A: ref array[MCKC, T], offA: int,
61 |                           B: ref array[KCNC, T], offB: int,
62 |                           beta: T,
63 |                           C: var ref array[MRNR, T],
64 |                           offC: int,
65 |                           incRowC, incColC: int)
66 |                           {.noSideEffect.} =
67 |   gemm_micro_kernelT(kc, alpha, A, offA, B, offB, beta, C, offC, incRowC, incColc)
68 | 
69 | proc gemm_micro_kernel[T](kc: int,
70 |                           alpha: T,
71 |                           A: ref array[MCKC, T], offA: int,
72 |                           B: ref array[KCNC, T], offB: int,
73 |                           beta: T,
74 |                           C: var seq[T],
75 |                           offC: int,
76 |                           incRowC, incColC: int)
77 |                           {.noSideEffect.} =
78 |   gemm_micro_kernelT(kc, alpha, A, offA, B, offB, beta, C, offC, incRowC, incColc)


--------------------------------------------------------------------------------
/tests/tensors/test_shapeshifting_cuda.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../src/arraymancer
16 | import unittest, future, sequtils
17 | 
18 | suite "CUDA: Shapeshifting":
19 |   ## Note: by default (momentarily), CudaTensors are column-major
20 |   test "Contiguous conversion":
21 |     let a = [7, 4, 3, 1, 8, 6,
22 |              8, 1, 6, 2, 6, 6,
23 |              2, 0, 4, 3, 2, 0].toTensor.reshape([3,6]).astype(float).cuda
24 | 
25 |     # Tensor of shape 3x6 of type "int" on backend "Cpu"
26 |     # |7      4       3       1       8       6|
27 |     # |8      1       6       2       6       6|
28 |     # |2      0       4       3       2       0|
29 | 
30 |     let b = a.unsafeContiguous()
31 |     check: b.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0]
32 | 
33 |     # a is already contiguous, even if wrong layout.
34 |     # Nothing should be done
35 |     let c = a.unsafeContiguous(colMajor)
36 |     check: c.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0]
37 | 
38 |     # force parameter has been used.
39 |     # Layout will change even if a was contiguous
40 |     let d = a.unsafeContiguous(colMajor, force = true)
41 |     check: d.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0]
42 | 
43 | 
44 |     # Now test with a non-contiguous tensor
45 |     let u = a[_,0..1]
46 |     check: u.cpu.toRawSeq == @[7.0, 8, 2, 4, 1, 0, 3, 6, 4, 1, 2, 3, 8, 6, 2, 6, 6, 0]
47 |     check: u.cpu == [7.0,4,8,1,2,0].toTensor.reshape([3,2])
48 | 
49 |     check: u.unsafeContiguous(rowMajor, force=true).cpu.toRawSeq == @[7.0,4,8,1,2,0]
50 | 
51 |   test "Unsafe reshape":
52 |     block:
53 |       let a = toSeq(1..4).toTensor().astype(float).cuda
54 |       var a_view = a.unsafeReshape(2,2)
55 |       check: a_view.cpu == [[1.0,2],[3.0,4]].toTensor()
56 | 
57 |       # TODO
58 |       # a_view[_, _] = 0.0
59 |       # check: a.cpu == [0.0,0,0,0].toTensor()
60 | 
61 |     # on slices
62 |     block:
63 |       # not that 'a' here a let variable, however
64 |       # unsafeView and unsafeReshape allow us to
65 |       # modify its elements value
66 |       let a = toSeq(1..4).toTensor().astype(float).cuda
67 |       var a_view = a[1..2].unsafeReshape(1,2) # a[1..2] == a.unsafeSlice(1..2) for CudaTensors
68 |       check: a_view.cpu == [[2.0,3]].toTensor()
69 | 
70 |       # TODO: pending slice assignation
71 |       # a_view[_, _].cpu = 0
72 |       # check: a.cpu == [1.0,0,0,4].toTensor()


--------------------------------------------------------------------------------
/src/nn/layers/linear.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../../arraymancer_ag, ../../arraymancer, ../../autograd/utils
16 | import ./layer
17 | 
18 | type LinearGate* {.final.} [TT] = ref object of Gate[TT]
19 |   ## TODO: use fused AddMatMul gate: C <- alpha AB + beta C
20 |   x, W, b: Variable[TT]
21 | 
22 | method forward*[TT](self: LinearGate[TT], a: Variable[TT]): Variable[TT] {.inline, locks:0.}=
23 |   new result
24 | 
25 |   result.tape = a.tape
26 |   result.value = self.W.value * a.value
27 |   if not self.b.isNil:
28 |     result.value .+= self.b.value # Bias is broadcasted other the whole batch size
29 |   result.grad = zeros[getSubType(TT)](result.value.shape)
30 | 
31 | method backward*[TT](self: LinearGate[TT], gradient: TT): SmallDiffs[TT] {.inline, locks:0.}=
32 |   result[0] = self.W.value.unsafeTranspose * gradient
33 |   result[1] = gradient * self.x.value.unsafeTranspose
34 | 
35 |   if not self.b.isNil:
36 |     result[2] = sum(gradient, axis=0) # https://mlxai.github.io/2017/01/10/a-modular-approach-to-implementing-fully-connected-neural-networks.html
37 | 
38 | proc linear*[TT](x, weight: Variable[TT], bias: Variable[TT] = nil): Variable[TT] =
39 |   ## Input:
40 |   ##   - A x Variable of shape @[in_features, batch_size]
41 |   ##   - A weight Variable of shape @[out_features, in_features]
42 |   ##   - Optionally a bias Variable of shape @[out_features, 1]
43 |   ##
44 |   ## Return: Weight * x + bias
45 | 
46 |   when compileOption("boundChecks"):
47 |     if x.value.rank > 2:
48 |       raise newException(ValueError, "Tensor must be flattened for a linear layer (features, batch_size)")
49 | 
50 |     check_ctx(x, weight)
51 |     if not bias.isNil:
52 |       check_ctx(x, bias)
53 | 
54 |     # weight has shape: Out_features * In_features
55 |     # bias must have shape: Out_features * 1
56 |     if not bias.isNil and not (bias.value.shape == @[weight.value.shape[0], 1]):
57 |       raise newException(ValueError, "Incompatible shape: bias must be a vector of shape @[out_features, 1]")
58 | 
59 |   # Gate
60 |   var gate: LinearGate[TT]
61 |   new gate
62 |   gate.arity = if bias.isNil: 2 else: 3
63 |   gate.x = x
64 |   gate.W = weight
65 |   gate.b = bias
66 | 
67 |   # Node
68 |   var node: Node[TT]
69 |   new node
70 | 
71 |   node.gate = gate
72 |   node.parents[0] = x
73 |   node.parents[1] = weight
74 |   if not bias.isNil:
75 |     node.parents[2] = bias
76 | 
77 |   x.tape.push(node)
78 | 
79 |   # Resulting var
80 |   result = gate.forward(x)
81 |   result.ancestor = node
82 |   node.child = result


--------------------------------------------------------------------------------
/src/arraymancer/ufunc.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc astype*[T, U](t: Tensor[T], typ: typedesc[U]): Tensor[U] =
16 |   ## Apply type conversion on the whole tensor
17 |   result = t.map(x => x.U)
18 | 
19 | 
20 | # Built-in nim functions that doesn't work with makeUniversal
21 | proc abs*[T](t: Tensor[T]): Tensor[T] =
22 |   t.mapT(abs(x))
23 | 
24 | 
25 | # #############################################################
26 | # Autogen universal functions
27 | 
28 | # Note, the makeUniversal/Local documentation gets duplicated in docs at each template call
29 | # And shouldn't use ##
30 | template makeUniversal*(func_name: untyped) =
31 |   # Lift an unary function into an exported universal function.
32 |   #
33 |   # Universal functions apply element-wise.
34 |   #
35 |   # ``makeUniversal`` does not work when the internal type of the Tensor changes,
36 |   # for example, a function "isEven: int -> bool".
37 |   # Use ``map`` in this case instead instead
38 |   proc func_name*(t: Tensor): Tensor =
39 |     ## Auto-generated universal version of the function.
40 |     ##
41 |     ## The function can be used directly on tensors and will work element-wise.
42 |     t.mapT(func_name(x))
43 |   export func_name
44 | 
45 | template makeUniversalLocal*(func_name: untyped) =
46 |   # Lift an unary function into a non-exported universal function.
47 |   #
48 |   # Universal functions apply element-wise.
49 |   #
50 |   # ``makeUniversalLocal`` does not work when the internal type of the Tensor changes,
51 |   # for example, a function "isEven: int -> bool".
52 |   # Use ``map`` in this case instead instead
53 |   proc func_name(t: Tensor): Tensor =
54 |     t.mapT(func_name(x))
55 | 
56 | # Unary functions from Nim math library
57 | 
58 | makeUniversal(fac)
59 | #makeUniversal(classify)
60 | #makeUniversal(isPowerOfTwo)
61 | #makeUniversal(nextPowerOfTwo)
62 | #makeUniversal(countBits32)
63 | #makeUniversal(sum)
64 | makeUniversal(sqrt)
65 | makeUniversal(cbrt)
66 | makeUniversal(ln)
67 | makeUniversal(log10)
68 | makeUniversal(log2)
69 | makeUniversal(exp)
70 | makeUniversal(arccos)
71 | makeUniversal(arcsin)
72 | makeUniversal(arctan)
73 | makeUniversal(cos)
74 | makeUniversal(cosh)
75 | makeUniversal(sinh)
76 | makeUniversal(sin)
77 | makeUniversal(tan)
78 | makeUniversal(tanh)
79 | makeUniversal(erf)
80 | makeUniversal(erfc)
81 | makeUniversal(lgamma)
82 | makeUniversal(tgamma)
83 | makeUniversal(floor)
84 | makeUniversal(ceil)
85 | makeUniversal(trunc)
86 | makeUniversal(round)
87 | #makeUniversal(splitDecimal)
88 | makeUniversal(degToRad)
89 | makeUniversal(radToDeg)
90 | 


--------------------------------------------------------------------------------
/src/nn_primitives/activation_primitives.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import ../arraymancer, math
16 | 
17 | # Neural net activation functions that works directly on Tensors
18 | 
19 | 
20 | # Note:
21 | # 1. Canonical sigmoid "f(x) = 1 / (1 + exp(-x))" is unstable
22 | # for negative values < 709 (for float64)
23 | #
24 | # 2. Alternative expression stable for negative but unstable for positive is
25 | # "f(x) = exp(x) / (1 + exp(x))"
26 | #
27 | # 3. Introducing branching would be very costly.
28 | #
29 | # 4. Using tanh as 0.5 * (tanh(0.5 * x) + 1) is better than branching
30 | # but slow as well
31 | #
32 | # 5. Another alternative would be to clip x to max (-500, x) to avoid this instability
33 | #
34 | # Benchmarks available in the benchmark folder
35 | #
36 | 
37 | proc sigmoid*[T: SomeReal](t: Tensor[T]): Tensor[T] {.inline.}=
38 |   ## Logistic sigmoid activation function, :math:`f(x) = 1 / (1 + \exp(-x))`
39 |   ## Note: Canonical sigmoid is not stable for large negative value
40 | 
41 |   proc sigmoid_closure(x: T): T = 1.T / (1.T + exp(-x))
42 | 
43 |   # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T)
44 | 
45 |   return t.map(sigmoid_closure)
46 | 
47 | proc msigmoid*[T: SomeReal](t: var Tensor[T]): Tensor[T] {.inline.}=
48 |   ## Logistic sigmoid activation function, :math:`f(x) = 1 / (1 + \exp(-x))`
49 |   ## Note: Canonical sigmoid is not stable for large negative value
50 | 
51 |   proc sigmoid_closure(x: T): T = 1.T / (1.T + exp(-x))
52 | 
53 |   # stable: proc sigmoid_closure(x: T): T = 0.5.T * (tanh(0.5.T * x) + 1.T)
54 | 
55 |   return t.map(sigmoid_closure)
56 | 
57 | proc relu*[T](t: Tensor[T]): Tensor[T] {.inline.}=
58 |   proc relu_closure(x: T): T =
59 |     max(0.T, x)
60 |   t.map(relu_closure)
61 | 
62 | proc mrelu*[T](t: var Tensor[T]): Tensor[T] {.inline.}=
63 |   proc relu_closure(x: T): T =
64 |     max(0.T, x)
65 |   t.apply(relu_closure)
66 | 
67 | 
68 | proc relu_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.inline.}=
69 |   proc relu_backward_closure[T](x: T): T =
70 |     if x <= 0.T:
71 |       return 0.T
72 |     return 1.T
73 | 
74 |   result = cached_tensor.map(relu_backward_closure)
75 |   result .*= gradient
76 | 
77 | proc sigmoid_backward*[T](gradient: Tensor[T], cached_tensor: Tensor[T]): Tensor[T]{.inline.}=
78 |   proc sigmoid_backward_closure[T](x: T): T =
79 |     ## We suppose the input was already passed through the logistic sigmoid.
80 |     ## Derivative is f' = f * (1 - f)
81 |     x * (1 - x)
82 |   
83 |   result = cached_tensor.map(sigmoid_backward_closure)
84 |   result .*= gradient


--------------------------------------------------------------------------------
/src/arraymancer.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 the Arraymancer contributors
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | when defined(doc):
16 |   include ../docs/autogen_nim_API
17 | 
18 | import sequtils, strutils, future, algorithm, nimblas, math, typetraits, macros, random
19 | 
20 | # Export OrderType (rowMajor, colMajor) from nimblas
21 | export OrderType
22 | 
23 | # include ../docs/autogen_nim_API
24 | include arraymancer/utils/functional,
25 |         arraymancer/utils/nested_containers,
26 |         arraymancer/utils/ast_utils,
27 |         arraymancer/global_config,
28 |         arraymancer/backend/blis,
29 |         arraymancer/backend/openmp,
30 |         arraymancer/data_structure,
31 |         arraymancer/data_structure_helpers,
32 |         arraymancer/init_cpu,
33 |         arraymancer/init_deprecated_0_1_0,
34 |         arraymancer/init_cpu_deprecated_0_2_0, # source of deprecation spam https://github.com/nim-lang/Nim/issues/6436
35 |         arraymancer/accessors,
36 |         arraymancer/accessors_macros_syntax,
37 |         arraymancer/accessors_macros_desugar,
38 |         arraymancer/accessors_macros_read,
39 |         arraymancer/accessors_macros_write,
40 |         arraymancer/comparison,
41 |         arraymancer/higher_order,
42 |         arraymancer/higher_order_deprecated,
43 |         arraymancer/shapeshifting,
44 |         arraymancer/display,
45 |         arraymancer/ufunc,
46 |         arraymancer/operators_blas_l1,
47 |         arraymancer/fallback/blas_l3_gemm,
48 |         arraymancer/fallback/naive_l2_gemv,
49 |         arraymancer/operators_blas_l2l3,
50 |         arraymancer/operators_broadcasted,
51 |         arraymancer/math_functions,
52 |         arraymancer/filling_data,
53 |         arraymancer/aggregate,
54 |         arraymancer/term_rewriting,
55 |         arraymancer/shortcuts,
56 |         arraymancer/exporting
57 | 
58 | 
59 | when defined(cuda):
60 |   # Nimcuda poses issues with Nim docgen
61 |   import nimcuda/[cuda_runtime_api, driver_types, cublas_api, cublas_v2, nimcuda]
62 | 
63 | when defined(cuda) or defined(doc):
64 |   include ./arraymancer/backend/cuda_global_state,
65 |           ./arraymancer/backend/cuda,
66 |           ./arraymancer/backend/cublas,
67 |           # ./arraymancer/backend/cublas_helper_proc,
68 |           ./arraymancer/init_cuda,
69 |           ./arraymancer/accessors_cuda,
70 |           ./arraymancer/display_cuda,
71 |           ./arraymancer/elementwise_cuda.nim,
72 |           ./arraymancer/elementwise_glue_cuda.nim,
73 |           ./arraymancer/higher_order_cuda,
74 |           ./arraymancer/operators_blas_l1_cuda,
75 |           ./arraymancer/operators_blas_l2l3_cuda,
76 |           ./arraymancer/shapeshifting_cuda


--------------------------------------------------------------------------------
/tests/tensors/test_accessors.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import ../../src/arraymancer
 16 | import unittest, math
 17 | 
 18 | 
 19 | suite "Accessing and setting tensor values":
 20 |   test "Accessing and setting a single value":
 21 |     var a = zeros[int](@[2,3,4])
 22 |     a[1,2,2] = 122
 23 |     check: a[1,2,2] == 122
 24 | 
 25 |     var b = zeros[int](@[3,4])
 26 |     b[1,2] = 12
 27 |     check: b[1,2] == 12
 28 |     b[0,0] = 999
 29 |     check: b[0,0] == 999
 30 |     b[2,3] = 111
 31 |     check: b[2,3] == 111
 32 | 
 33 | 
 34 |   when compileOption("boundChecks"):
 35 |     test "Out of bounds checking":
 36 |       var a = newTensor(@[2,3,4], int, Backend.Cpu)
 37 |       expect(IndexError):
 38 |         a[2,0,0] = 200
 39 |       var b = newTensor(@[3,4], int, Backend.Cpu)
 40 |       expect(IndexError):
 41 |         b[3,4] = 999
 42 |       expect(IndexError):
 43 |         discard b[-1,0]
 44 |       expect(IndexError):
 45 |         discard b[0,-2]
 46 |   else:
 47 |     echo "Bound-checking is disabled. The out-of-bounds checking test has been skipped."
 48 | 
 49 |   test "Iterators":
 50 |     const
 51 |       a = @[1, 2, 3, 4, 5]
 52 |       b = @[1, 2, 3]
 53 |     var
 54 |       vd: seq[seq[int]]
 55 |       row: seq[int]
 56 |     vd = newSeq[seq[int]]()
 57 |     for i, aa in a:
 58 |       row = newSeq[int]()
 59 |       vd.add(row)
 60 |       for j, bb in b:
 61 |         vd[i].add(aa^bb)
 62 | 
 63 |     let nda_vd = vd.toTensor()
 64 | 
 65 |     let expected_seq = @[1,1,1,2,4,8,3,9,27,4,16,64,5,25,125]
 66 | 
 67 |     var seq_val: seq[int] = @[]
 68 |     for i in nda_vd:
 69 |       seq_val.add(i)
 70 | 
 71 |     check: seq_val == expected_seq
 72 | 
 73 |     var seq_validx: seq[tuple[idx: seq[int], val: int]] = @[]
 74 |     for i,j in nda_vd:
 75 |       seq_validx.add((i,j))
 76 | 
 77 |     check: seq_validx[0] == (@[0,0], 1)
 78 |     check: seq_validx[10] == (@[3,1], 16)
 79 | 
 80 |     let t_nda = transpose(nda_vd)
 81 | 
 82 |     var seq_transpose: seq[tuple[idx: seq[int], val: int]] = @[]
 83 |     for i,j in t_nda:
 84 |       seq_transpose.add((i,j))
 85 | 
 86 |     check: seq_transpose[0] == (@[0,0], 1)
 87 |     check: seq_transpose[8] == (@[1,3], 16)
 88 | 
 89 |   test "indexing + in-place operator":
 90 |     var a = newTensor[int]([3,3])
 91 | 
 92 |     a[1,1] += 10
 93 | 
 94 |     a[1,1] *= 20
 95 | 
 96 |     check: a == [[0,0,0],[0,200,0],[0,0,0]].toTensor
 97 | 
 98 |   test "Zipping two tensors":
 99 |     let a = [[1,2],[3,4]].toTensor()
100 |     let b = [[5,6],[7,8]].toTensor()
101 | 
102 |     var res = 0
103 |     for ai, bi in zip(a, b):
104 |       res += ai + bi
105 |     check: res == 36


--------------------------------------------------------------------------------
/src/arraymancer/operators_blas_l2l3_cuda.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | proc cudaMV_y_eq_aAx_p_by[T: SomeReal](
16 |   alpha: T, a, x: CudaTensor[T],
17 |   beta: T, y: var CudaTensor[T]) =
18 |   # Matrix-Vector: y = alpha A matvecmul x + beta y
19 | 
20 |   # TODO: remove this contiguous layout constraint
21 |   if not a.isContiguous:
22 |     raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
23 | 
24 |   let
25 |     a_is_colMajor = a.is_F_contiguous
26 | 
27 |     transpose_A = if a_is_colMajor: CUBLAS_OP_N
28 |                   else: CUBLAS_OP_T
29 |     ld_A = if a_is_colMajor: a.strides[1]
30 |            else: a.strides[0]
31 | 
32 |   cublas_gemv(
33 |       transpose_A, a.shape[0], a.shape[1],
34 |       alpha, a.get_data_ptr, ld_A,
35 |       x.get_data_ptr, x.strides[0],
36 |       beta, y.get_data_ptr, y.strides[0])
37 | 
38 | proc cudaMM_C_eq_aAB_p_bC[T: SomeReal](
39 |   alpha: T, a, b: CudaTensor[T],
40 |   beta: T, c: var CudaTensor[T]) =
41 |   # Matrix: C = alpha A matmul B + beta C
42 | 
43 |   # TODO: remove this contiguous layout constraint
44 |   if not (a.isContiguous and b.isContiguous):
45 |     raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
46 | 
47 |   let
48 |     a_is_colMajor = a.is_F_contiguous
49 |     b_is_colMajor = b.is_F_contiguous
50 | 
51 |     transpose_A = if a_is_colMajor: CUBLAS_OP_N
52 |                   else: CUBLAS_OP_T
53 |     ld_A = if a_is_colMajor: a.strides[1]
54 |            else: a.strides[0]
55 | 
56 |     transpose_B = if b_is_colMajor: CUBLAS_OP_N
57 |                   else: CUBLAS_OP_T
58 |     ld_B = if b_is_colMajor: b.strides[1]
59 |            else: b.strides[0]
60 | 
61 |     ld_C = c.strides[1] # C is always F contiguous (TODO test)
62 | 
63 |   cublas_gemm(transpose_A, transpose_B,
64 |               a.shape[0], b.shape[1], a.shape[1],
65 |               alpha, a.get_data_ptr, ld_A,
66 |               b.get_data_ptr, ld_B,
67 |               beta, c.get_data_ptr, ld_C)
68 | 
69 | proc `*`*[T: SomeReal](a, b: CudaTensor[T]): CudaTensor[T] =
70 |   ## Matrix multiplication (Matrix-Matrix and Matrix-Vector) on CUDA
71 | 
72 |   if a.rank == 2 and b.rank == 2:
73 |     when compileOption("boundChecks"): check_matmat(a,b)
74 |     result = newCudaTensor[T]([a.shape[0], b.shape[1]])
75 |     cudaMM_C_eq_aAB_p_bC(1.T, a, b, 0.T, result)
76 |   elif a.rank == 2 and b.rank == 1:
77 |     when compileOption("boundChecks"): check_matvec(a,b)
78 |     result = newCudaTensor[T]([a.shape[0]])
79 |     cudaMV_y_eq_aAx_p_by(1.T,a, b, 0.T, result)
80 |   else: raise newException(ValueError, "Matrix-Matrix or Matrix-Vector multiplication valid only if first Tensor is a Matrix and second is a Matrix or Vector")


--------------------------------------------------------------------------------
/tests/tensors/test_init.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import ../../src/arraymancer
 16 | import unittest, math, sequtils
 17 | 
 18 | suite "Creating a new Tensor":
 19 |   test "Creating from sequence":
 20 |     let t1 = @[1,2,3].toTensor()
 21 |     check: t1.shape == @[3]
 22 |     check: t1.rank == 1
 23 | 
 24 |     const
 25 |       a = @[1, 2, 3, 4, 5]
 26 |       b = @[1, 2, 3, 4, 5]
 27 | 
 28 |     var
 29 |       vandermonde: seq[seq[int]]
 30 |       row: seq[int]
 31 | 
 32 |     vandermonde = newSeq[seq[int]]()
 33 | 
 34 |     for i, aa in a:
 35 |       row = newSeq[int]()
 36 |       vandermonde.add(row)
 37 |       for j, bb in b:
 38 |         vandermonde[i].add(aa^bb)
 39 | 
 40 |     let t2 = vandermonde.toTensor()
 41 |     check: t2.rank == 2
 42 |     check: t2.shape == @[5, 5]
 43 | 
 44 |     let nest3 = @[
 45 |             @[
 46 |               @[1,2,3],
 47 |               @[1,2,3]
 48 |             ],
 49 |             @[
 50 |               @[3,2,1],
 51 |               @[3,2,1]
 52 |             ],
 53 |             @[
 54 |               @[4,4,5],
 55 |               @[4,4,4]
 56 |             ],
 57 |             @[
 58 |               @[6,6,6],
 59 |               @[6,6,6]
 60 |             ]
 61 |           ]
 62 | 
 63 |     let t3 = nest3.toTensor()
 64 |     check: t3.rank == 3
 65 |     check: t3.shape == @[4, 2, 3]  # 4 rows, 2 cols, 3 depth. depth indices moves the fastest. Same scheme as Numpy.
 66 | 
 67 |     let u = @[@[1.0, -1, 2],@[0.0, -1]]
 68 | 
 69 |     when compileOption("boundChecks"):
 70 |       expect(IndexError):
 71 |         discard u.toTensor()
 72 |     else:
 73 |       echo "Bound-checking is disabled. The incorrect seq shape test has been skipped."
 74 | 
 75 |   test "Check that Tensor shape is in row-by-column order":
 76 |     let s = @[@[1,2,3],@[3,2,1]]
 77 |     let t = s.toTensor()
 78 | 
 79 |     check: t.shape == @[2,3]
 80 | 
 81 |     let u = newTensor[int](@[2,3])
 82 |     check: u.shape == @[2,3]
 83 | 
 84 |     check: u.shape == t.shape
 85 | 
 86 |   test "Zeros":
 87 |     block:
 88 |       let t = zeros[float]([4,4,4])
 89 |       for v in t.items:
 90 |         check v == 0.0f
 91 |     block:
 92 |       let t = zeros[int]([4,4,4])
 93 |       for v in t.items:
 94 |         check v == 0
 95 | 
 96 |   test "Ones":
 97 |     block:
 98 |       let t = ones[float]([4,4,4])
 99 |       for v in t.items:
100 |         check v == 1.0f
101 |     block:
102 |       let t = ones[int]([4,4,4])
103 |       for v in t.items:
104 |         check v == 1
105 | 
106 |   test "Filled new tensor":
107 |     block:
108 |       let t = newTensorWith([4,4,4], 2.0f)
109 |       for v in t.items:
110 |         check v == 2.0f
111 |     block:
112 |       let t = newTensorWith([4,4,4], 2)
113 |       for v in t.items:
114 |         check v == 2
115 | 
116 |   test "Random tensor":
117 |     block:
118 |       # Check that randomTensor doesn't silently convert float32 to float64
119 |       let a = randomTensor([3, 4], 100'f32)
120 | 
121 |       check: a[0,0] is float32
122 |   # TODO add tests for randomTensor
123 | 
124 | 


--------------------------------------------------------------------------------
/src/arraymancer/operators_blas_l1.nim:
--------------------------------------------------------------------------------
 1 | # Copyright 2017 Mamy André-Ratsimbazafy
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # Bounds checking functions
17 | proc check_dot_prod(a, b:AnyTensor)  {.noSideEffect.}=
18 |   if a.rank != 1 or b.rank != 1: raise newException(ValueError, "Dot product is only supported for vectors (tensors of rank 1)")
19 |   if a.shape != b.shape: raise newException(ValueError, "Vector should be the same length")
20 | 
21 | # ####################################################################
22 | # BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix)
23 | 
24 |   # FIXME: Can't use built-in proc `+` in map: https://github.com/nim-lang/Nim/issues/5702
25 |   # map2(a, `+`, b)
26 | 
27 | proc dot*[T: SomeReal](a, b: Tensor[T]): T {.noSideEffect.} =
28 |   ## Vector to Vector dot (scalar) product
29 |   when compileOption("boundChecks"): check_dot_prod(a,b)
30 |   return dot(a.shape[0], a.get_data_ptr, a.strides[0], b.get_data_ptr, b.strides[0])
31 | 
32 | proc dot*[T: SomeInteger](a, b: Tensor[T]): T {.noSideEffect.} =
33 |   ## Vector to Vector dot (scalar) product
34 |   # Fallback for non-floats
35 |   when compileOption("boundChecks"): check_dot_prod(a,b)
36 |   for ai, bi in zip(a, b):
37 |     result += ai * bi
38 | 
39 | # #########################################################
40 | # # Tensor-Tensor linear algebra
41 | # # shape checks are done in map2 proc
42 | 
43 | proc `+`*[T: SomeNumber](a, b: Tensor[T]): Tensor[T] =
44 |   ## Tensor addition
45 |   map2T(a, b, x + y)
46 | 
47 | proc `-`*[T: SomeNumber](a, b: Tensor[T]): Tensor[T] =
48 |   ## Tensor substraction
49 |   map2T(a, b, x - y)
50 | 
51 | # #########################################################
52 | # # Tensor-Tensor in-place linear algebra
53 | 
54 | proc `+=`*[T: SomeNumber](a: var Tensor[T], b: Tensor[T]) =
55 |   ## Tensor in-place addition
56 |   a.apply2T(b, x + y)
57 | 
58 | proc `-=`*[T: SomeNumber](a: var Tensor[T], b: Tensor[T]) =
59 |   ## Tensor in-place substraction
60 |   a.apply2T(b, x - y)
61 | 
62 | # #########################################################
63 | # # Tensor-scalar linear algebra
64 | 
65 | proc `*`*[T: SomeNumber](a: T, t: Tensor[T]): Tensor[T] =
66 |   ## Element-wise multiplication by a scalar
67 |   t.mapT(x * a)
68 | 
69 | proc `*`*[T: SomeNumber](t: Tensor[T], a: T): Tensor[T] =
70 |   ## Element-wise multiplication by a scalar
71 |   a * t
72 | 
73 | proc `/`*[T: SomeReal](t: Tensor[T], a: T): Tensor[T] =
74 |   ## Element-wise division by a float scalar
75 |   t.mapT(x / a)
76 | 
77 | proc `div`*[T: SomeInteger](t: Tensor[T], a: T): Tensor[T] =
78 |   ## Element-wise division by an integer
79 |   t.mapT(x div a)
80 | 
81 | # #########################################################
82 | # # Tensor-scalar in-place linear algebra
83 | 
84 | proc `*=`*[T: SomeNumber](t: var Tensor[T], a: T) =
85 |   ## Element-wise multiplication by a scalar (in-place)
86 |   t.applyT(x * a)
87 | 
88 | proc `/=`*[T: SomeReal](t: var Tensor[T], a: T) =
89 |   ## Element-wise division by a scalar (in-place)
90 |   t.applyT(x / a)
91 | 
92 | proc `/=`*[T: SomeInteger](t: var Tensor[T], a: T) =
93 |   ## Element-wise division by a scalar (in-place)
94 |   t.applyT(x div a)
95 | 


--------------------------------------------------------------------------------
/src/arraymancer/backend/cublas_helper_proc.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # Cublas helper procs for L1 BLAS
 17 | # With custom kernels they shouldn't be needed anymore
 18 | # They however have a nice interface to call for fused aX + Y or aA + Bb
 19 | 
 20 | #################################################
 21 | ## In-place
 22 | 
 23 | proc cudaVV_A_eq_A_p_bB[T: SomeReal](
 24 |   a: var CudaTensor[T], beta: T, b: CudaTensor[T]) {.inline, deprecated.}=
 25 |   # Vector: A = A + beta B
 26 | 
 27 |   cublas_axpy(a.shape[0],
 28 |               beta,
 29 |               b.get_data_ptr, b.strides[0],
 30 |               a.get_data_ptr, a.strides[0])
 31 | 
 32 | proc cudaMM_A_eq_aA_p_bB[T: SomeReal](
 33 |   alpha: T, a: var CudaTensor[T],
 34 |   beta: T, b: CudaTensor[T]) {.deprecated.}=
 35 |   # Matrix: A = alpha A + beta B
 36 | 
 37 |   # TODO: remove this contiguous layout constraint (via conversion or custom kernel)
 38 |   if not (isContiguous(a) and isContiguous(b)):
 39 |     raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
 40 | 
 41 |   if not is_F_contiguous(a):
 42 |     raise newException(ValueError, "NotImplemented: the modified tensor must have a column-major layout")
 43 | 
 44 |   let
 45 |     b_is_colMajor = b.is_F_contiguous
 46 | 
 47 |     transpose_B = if b_is_colMajor: CUBLAS_OP_N
 48 |                   else: CUBLAS_OP_T
 49 | 
 50 |     ld_B =  if b_is_colMajor: b.strides[1]
 51 |             else: b.strides[0]
 52 | 
 53 |   cublas_geam(CUBLAS_OP_N, transpose_B,
 54 |               a.shape[0], a.shape[1],
 55 |               alpha,
 56 |               a.get_data_ptr, a.strides[1],
 57 |               beta,
 58 |               b.get_data_ptr, ld_B,
 59 |               a.get_data_ptr, a.strides[1])
 60 |   # In column-majour layout a.shape[0] == a.strides[1]
 61 | 
 62 | #############################################################
 63 | ## Out-of-place
 64 | 
 65 | proc cudaVV_C_eq_A_p_bB[T: SomeReal]( a: CudaTensor[T],
 66 |                                       beta: T, b: CudaTensor[T],
 67 |                                       result: var CudaTensor[T]) {.inline, deprecated.}=
 68 |   # Vector: C = A + beta B
 69 |   result = newCudaTensor[T](a.shape)
 70 | 
 71 |   cublas_copy(a.len, a.get_data_ptr, a.strides[0],
 72 |               result.get_data_ptr, result.strides[0])
 73 | 
 74 |   cudaVV_A_eq_A_p_bB(result, beta, b)
 75 | 
 76 | proc cudaMM_C_eq_aA_p_aB[T: SomeReal](alpha: T, a: CudaTensor[T],
 77 |                                           beta: T, b: CudaTensor[T],
 78 |                                           result: var CudaTensor[T]) {.deprecated.}=
 79 |   # TODO: remove this contiguous layout constraint (via conversion or custom kernel)
 80 |   if not (isContiguous(a) and isContiguous(b)):
 81 |     raise newException(ValueError, "NotImplemented: for now both tensors should be contiguous")
 82 | 
 83 |   result = newCudaTensor[T](a.shape) # result is colMajor
 84 | 
 85 |   let
 86 |     a_is_colMajor = a.is_F_contiguous
 87 |     b_is_colMajor = b.is_F_contiguous
 88 | 
 89 |     transpose_A = if a_is_colMajor: CUBLAS_OP_N
 90 |                   else: CUBLAS_OP_T
 91 |     ld_A = if a_is_colMajor: a.strides[1]
 92 |            else: a.strides[0]
 93 | 
 94 |     transpose_B = if b_is_colMajor: CUBLAS_OP_N
 95 |                   else: CUBLAS_OP_T
 96 |     ld_B = if b_is_colMajor: b.strides[1]
 97 |            else: b.strides[0]
 98 | 
 99 |   cublas_geam(transpose_A, transpose_B,
100 |               a.shape[0], a.shape[1],
101 |               alpha,
102 |               a.get_data_ptr, ld_A,
103 |               beta,
104 |               b.get_data_ptr, ld_B,
105 |               result.get_data_ptr, result.strides[1])


--------------------------------------------------------------------------------
/src/arraymancer/operators_blas_l1_cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # ####################################################################
 16 | # BLAS Level 1 (Vector dot product, Addition, Scalar to Vector/Matrix)
 17 | 
 18 | proc dot*[T: SomeReal](a, b: CudaTensor[T]): T {.inline.}=
 19 |   ## Vector to Vector dot (scalar) product
 20 |   when compileOption("boundChecks"): check_dot_prod(a,b)
 21 |   cublas_dot( a.shape[0],
 22 |               a.get_data_ptr, a.strides[0],
 23 |               b.get_data_ptr, b.strides[0],
 24 |               addr result)
 25 | 
 26 | proc cuda_inPlaceAdd = discard # This is a hack so that the symbol is open
 27 | cuda_assign_glue(cuda_inPlaceAdd, "InPlaceAddOp")
 28 | 
 29 | proc `+=`*[T: SomeReal](a: var CudaTensor[T], b: CudaTensor[T]) =
 30 |   ## CudaTensor in-place addition
 31 | 
 32 |   when compileOption("boundChecks"):
 33 |     check_elementwise(a,b)
 34 | 
 35 |   cuda_assign_call(cuda_inPlaceAdd, a, b)
 36 | 
 37 |   # TODO: if a and b share the same location, TEST
 38 | 
 39 | proc cuda_Add = discard # This is a hack so that the symbol is open
 40 | cuda_binary_glue(cuda_Add, "AddOp")
 41 | 
 42 | proc `+`*[T: SomeReal](a,b: CudaTensor[T]): CudaTensor[T] =
 43 |   ## CudaTensor addition
 44 | 
 45 |   when compileOption("boundChecks"):
 46 |     check_elementwise(a,b)
 47 | 
 48 |   result = newCudaTensor[T](a.shape)
 49 |   cuda_binary_call(cuda_Add, result, a, b)
 50 | 
 51 | proc cuda_inPlaceSub = discard # This is a hack so that the symbol is open
 52 | cuda_assign_glue(cuda_inPlaceSub, "InPlaceSubOp")
 53 | 
 54 | proc `-=`*[T: SomeReal](a: var CudaTensor[T], b: CudaTensor[T]) =
 55 |   ## CudaTensor in-place substraction
 56 | 
 57 |   when compileOption("boundChecks"): check_elementwise(a,b)
 58 | 
 59 |   cuda_assign_call(cuda_inPlaceSub, a, b)
 60 | 
 61 |   # TODO: if a and b share the same location, TEST
 62 | 
 63 | 
 64 | proc cuda_Sub = discard # This is a hack so that the symbol is open
 65 | cuda_binary_glue(cuda_Sub, "SubOp")
 66 | 
 67 | proc `-`*[T: SomeReal](a,b: CudaTensor[T]): CudaTensor[T] =
 68 |   ## CudaTensor substraction
 69 | 
 70 |   when compileOption("boundChecks"): check_elementwise(a,b)
 71 | 
 72 |   result = newCudaTensor[T](a.shape)
 73 |   cuda_binary_call(cuda_Sub, result, a, b)
 74 | 
 75 | proc `*=`*[T:SomeReal](t: var CudaTensor[T]; a: T) {.inline.}=
 76 |   ## CudaTensor inplace multiplication by a scalar
 77 | 
 78 |   # We multiply all elements of the CudaTensor regardless of shape/strides
 79 |   # So this operation can be applied to tensors of all ranks.
 80 |   # Hence we use the whole allocated length and a stride of 1
 81 |   cublas_scal(t.data.len, a, t.get_data_ptr, 1)
 82 | 
 83 | proc `*`*[T:SomeReal](a: T, t: CudaTensor[T]): CudaTensor[T] {.inline.}=
 84 |   ## CudaTensor multiplication by a scalar
 85 | 
 86 |   # TODO replace by a custom kernel
 87 |   # Instead of a full clone we keep only the useful which is advantageous if t was a slice
 88 |   # It also makes it contiguous
 89 |   result = t.clone()
 90 |   result *= a
 91 | 
 92 | proc `*`*[T:SomeReal](t: CudaTensor[T], a: T): CudaTensor[T] {.inline.}=
 93 |   ## CudaTensor multiplication by a scalar
 94 |   a * t
 95 | 
 96 | proc `/=`*[T:SomeReal](t: var CudaTensor[T]; a: T) {.inline.}=
 97 |   ## CudaTensor in-place division by a scalar
 98 |   t *= (1/a)
 99 | 
100 | proc `/`*[T:SomeReal](t: CudaTensor[T], a: T): CudaTensor[T] {.inline.}=
101 |   ## CudaTensor division by a scalar
102 | 
103 |   # TODO replace by a custom kernel
104 |   # Instead of a full clone we keep only the useful which is advantageous if t was a slice
105 |   # It also makes it contiguous
106 |   # Furthermore doing t[i]/a instead of 1/a * t[i] will be much better for speed and numerical stability
107 |   (1/a) * t
108 | 
109 | proc `/`*[T:SomeReal](a: T, t: CudaTensor[T]): CudaTensor[T] {.inline.}=
110 |   ## CudaTensor division by a scalar
111 |   (1/a) * t


--------------------------------------------------------------------------------
/src/arraymancer/higher_order_deprecated.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | proc fmap*[T, U](t: Tensor[T], f: T -> U): Tensor[U]
 17 |   {.deprecated, inline.}=
 18 |   ## DEPRECATED
 19 |   ##
 20 |   ## Replace by map2
 21 |   t.map(f)
 22 | 
 23 | proc fmap2*[T, U, V](t1: Tensor[T], t2: Tensor[U], f: (T,U) -> V): Tensor[V]
 24 |   {.deprecated, inline.}=
 25 |   ## DEPRECATED
 26 |   ##
 27 |   ## Replaced by map2
 28 |   ##
 29 |   ## Note the new argument order of map2 to accomodate for
 30 |   ## t1.map2(`op`, t2) where op is an infix operator.
 31 |   t1.map2(f, t2)
 32 | 
 33 | 
 34 | # # Compute aggregate/reduction/folds over tensors
 35 | 
 36 | # ### Elementwise generic aggregate functions
 37 | # Note: You can't pass builtins like `+` or `+=` due to Nim limitations
 38 | # https://github.com/nim-lang/Nim/issues/2172
 39 | 
 40 | proc agg*[T: SomeNumber](t: Tensor[T],
 41 |                             f:(T, T)-> T,
 42 |                             start_val: T
 43 |                             ): T
 44 |   {.noSideEffect, inline, deprecated.}=
 45 |   ## DEPRECATED, use fold instead.
 46 |   ##
 47 |   ## Note: order between function f and start_val has changed
 48 |   ##
 49 |   ## Compute the aggregate
 50 |   ## Input:
 51 |   ##     - A tensor to aggregate on
 52 |   ##     - The aggregation function. It is applied this way: new_aggregate = f(old_aggregate, current_value)
 53 |   ##     - The starting value
 54 |   ##     - The axis
 55 |   t.fold(start_val, f)
 56 | 
 57 | proc agg_inplace*[T: SomeNumber](
 58 |                             accum_val: var T,
 59 |                             f: proc(x:var T, y:T), # We can't use the nice future syntax here for unknown reason
 60 |                             t: Tensor[T],
 61 |                             )
 62 |   {.noSideEffect, inline, deprecated.}=
 63 |   ## DEPRECATED, use fold instead.
 64 |   ##
 65 |   ## You will have to switch to a non-inplace function.
 66 |   ##
 67 |   ## Compute the aggregate
 68 |   ## Input:
 69 |   ##     - The accumulating value which will be modified in-place
 70 |   ##     - The aggregation in-place function. It is applied this way: f(var old_aggregate, current_value)
 71 |   ##     - A tensor to aggregate from
 72 |   ##     - The axis
 73 |   for val in t:
 74 |     f(accum_val, val)
 75 | 
 76 | 
 77 | # ### Axis generic functions
 78 | # `+`, `+=` for tensors are not "built-ins"
 79 | 
 80 | proc agg*[T: SomeNumber](t: Tensor[T],
 81 |                             f:(Tensor[T], Tensor[T])-> Tensor[T],
 82 |                             start_val: Tensor[T],
 83 |                             axis: int
 84 |                             ): Tensor[T]
 85 |   {.noSideEffect, inline, deprecated.}=
 86 |   ## DEPRECATED, use fold instead.
 87 |   ##
 88 |   ## Note: order between function f and start_val has changed
 89 |   ##
 90 |   ## Input:
 91 |   ##     - A tensor to aggregate on
 92 |   ##     - The aggregation function. It is applied this way: new_aggregate = f(old_aggregate, current_value)
 93 |   ##     - The starting value
 94 |   ##     - The axis
 95 | 
 96 |   t.fold(start_val, f, axis)
 97 | 
 98 | proc agg_inplace*[T: SomeNumber](
 99 |                             accum_val: var Tensor[T],
100 |                             f: proc(x:var Tensor[T], y:Tensor[T]), # We can't use the nice future syntax here for unknown reason
101 |                             t: Tensor[T],
102 |                             axis: int
103 |                             )
104 |   {.noSideEffect, inline, deprecated.}=
105 |   ## DEPRECATED, use fold instead.
106 |   ##
107 |   ## You will have to switch to a non-inplace function.
108 |   ##
109 |   ## Input:
110 |   ##     - The accumulating value which will be modified in-place
111 |   ##     - A tensor to aggregate from
112 |   ##     - The aggregation in-place function. It is applied this way: f(var old_aggregate, current_value)
113 |   ##     - The axis
114 | 
115 |   for val in t.axis(axis):
116 |     f(accum_val, val)
117 | 


--------------------------------------------------------------------------------
/src/arraymancer/elementwise_cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # Collection of cuda basic element-wise operations
 17 | # to be use by higher-order functions.
 18 | # The end-goal is to have a macro/template that can auto-generate these from:
 19 | #
 20 | # elementwise:
 21 | #   C = (A + B*sin(D))/exp(-X)
 22 | #
 23 | # __ldg is a cuda intrinsics to load read-only data
 24 | # from a special cache
 25 | 
 26 | # Assignment op
 27 | # Does element-wise A[i] `op=` B[i]
 28 | template cuda_assign_op(op_name, op_symbol: string)=
 29 |   {.emit: ["""
 30 |   template<typename T>
 31 |   struct """,op_name,"""{
 32 |   __device__ __forceinline__ void operator()(
 33 |       T *  __restrict__ dst,
 34 |       const T *  __restrict__ src){
 35 |       *dst """,op_symbol,""" __ldg(src);
 36 |       }
 37 |   };
 38 |   """].}
 39 | 
 40 | # Assignment with scalars
 41 | template cuda_assignscal_op(op_name, op_symbol: string)=
 42 |   {.emit: ["""
 43 |   template<typename T>
 44 |   struct """,op_name,"""{
 45 |   __device__ __forceinline__ void operator()(
 46 |       T *  __restrict__ dst,
 47 |       const T *  __restrict__ scal){
 48 |       *dst """,op_symbol,""" scal;
 49 |       }
 50 |   };
 51 |   """].}
 52 | 
 53 | # Binary op
 54 | # Does C[i] = A[i] `op` B[i]
 55 | template cuda_binary_op(op_name, op_symbol: string)=
 56 |   {.emit:["""
 57 |   template<typename T>
 58 |   struct """,op_name,"""{
 59 |   __device__ __forceinline__ void operator()(
 60 |       T *  __restrict__ dst,
 61 |       const T *  __restrict__ A,
 62 |       const T *  __restrict__ B){
 63 |       *dst = __ldg(A)""", op_symbol, """ __ldg(B);
 64 |       }
 65 |   };
 66 |   """].}
 67 | 
 68 | # Binary op with scalar on the left
 69 | # Does C[i] = a `op` B[i]
 70 | template cuda_lscal_op(op_name, op_symbol: string)=
 71 |   {.emit:["""
 72 |   template<typename T>
 73 |   struct """,op_name,"""{
 74 |   __device__ __forceinline__ void operator()(
 75 |       T *  __restrict__ dst,
 76 |       const T alpha,
 77 |       const T *  __restrict__ B){
 78 |       *dst = alpha""", op_symbol, """ __ldg(B);
 79 |       }
 80 |   };
 81 |   """].}
 82 | 
 83 | # Binary op with scalar on the right
 84 | # Does C[i] = A[i] `op` beta
 85 | template cuda_rscal_op(op_name, op_symbol: string)=
 86 |   {.emit:["""
 87 |   template<typename T>
 88 |   struct """,op_name,"""{
 89 |   __device__ __forceinline__ void operator()(
 90 |       T *  __restrict__ dst,
 91 |       const T *  __restrict__ A,
 92 |       const T beta){
 93 |       *dst = __ldg(A)""", op_symbol, """ beta;
 94 |       }
 95 |   };
 96 |   """].}
 97 | 
 98 | # Unary op
 99 | # Does C[i] = op(A[i])
100 | template cuda_unary_op(op_name, op_symbol: string)=
101 |   {.emit:["""
102 |   template<typename T>
103 |   struct """,op_name,"""{
104 |   __device__ __forceinline__ void operator()(
105 |       T *  __restrict__ dst,
106 |       const T *  __restrict__ src){
107 |       *dst = """, op_symbol, """(__ldg(src));
108 |       }
109 |   };
110 |   """].}
111 | 
112 | cuda_assign_op("CopyOp", "=")
113 | cuda_assign_op("InPlaceAddOp", "+=")
114 | cuda_assign_op("InPlaceSubOp", "-=")
115 | cuda_assign_op("InPlaceMulOp", "*=")
116 | cuda_assign_op("InPlaceDivOp", "/=")
117 | 
118 | cuda_assignscal_op("CopyScalOp", "=")
119 | cuda_assignscal_op("InPscalAddOp", "+=")
120 | cuda_assignscal_op("InPscalSubOp", "-=")
121 | cuda_assignscal_op("InPscalMulOp", "*=")
122 | cuda_assignscal_op("InPscalDivOp", "/=")
123 | 
124 | cuda_binary_op("AddOp", "+")
125 | cuda_binary_op("SubOp", "-")
126 | cuda_binary_op("MulOp", "*")
127 | cuda_binary_op("DivOp", "/")
128 | 
129 | cuda_lscal_op("LscalMul","*")
130 | cuda_lscal_op("LscalDiv","/")
131 | cuda_lscal_op("LscalSub","-")
132 | 
133 | cuda_rscal_op("RscalDiv","/")
134 | cuda_rscal_op("RscalSub","-")
135 | cuda_rscal_op("RscalAdd","+")
136 | 
137 | cuda_unary_op("NegOp","-")
138 | cuda_unary_op("ExpOp","exp")
139 | cuda_unary_op("SinOp","sin")
140 | cuda_unary_op("CosOp","cos")
141 | cuda_unary_op("TanOp","tan")
142 | cuda_unary_op("TanhOp","tanh")
143 | 


--------------------------------------------------------------------------------
/src/arraymancer/shapeshifting_cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | proc unsafeTranspose*(t: CudaTensor): CudaTensor {.noSideEffect.}=
 16 |   ## Transpose a Tensor.
 17 |   ##
 18 |   ## For N-d Tensor with shape (0, 1, 2 ... n-1) the resulting tensor will have shape (n-1, ... 2, 1, 0)
 19 |   ##
 20 |   ## Warning ⚠ CudaTensor temporary default:
 21 |   ##   This is a no-copy operation, data is shared with the input.
 22 |   ##   This proc does not guarantee that a ``let`` value is immutable.
 23 | 
 24 |   result.shape = t.shape.reversed
 25 |   result.strides = t.strides.reversed
 26 |   result.offset = t.offset
 27 |   result.data = t.data
 28 | 
 29 | proc cuda_unsafeContiguous = discard # This is a hack so that the symbol is open
 30 | cuda_assign_glue(cuda_unsafeContiguous, "CopyOp")
 31 | 
 32 | proc unsafeContiguous*[T: SomeReal](t: CudaTensor[T], layout: OrderType = colMajor, force: bool = false):
 33 |   CudaTensor[T] {.noSideEffect.}=
 34 |   ## Transform a tensor with general striding to a Tensor with contiguous layout.
 35 |   ##
 36 |   ## By default CudaTensor will be colMajor (contrary to a cpu tensor).
 37 |   ##
 38 |   ## By default nothing is done if the tensor is already contiguous (C Major or F major)
 39 |   ## The "force" parameter can force re-ordering to a specific layout
 40 |   ##
 41 |   ## Warning ⚠ CudaTensor temporary default:
 42 |   ##   If the CudaTensor is contiguous, this is a no-copy operation, data is shared with the input.
 43 |   ##   This proc does not guarantee that a ``let`` value is immutable.
 44 | 
 45 |   if t.isContiguous and not force:
 46 |     return t
 47 |   elif t.is_F_contiguous and layout == colMajor:
 48 |     return t
 49 |   elif t.is_C_contiguous and layout == rowMajor:
 50 |     return t
 51 | 
 52 |   result = newCudaTensor[T](t.shape, layout)
 53 | 
 54 |   cuda_assign_call(cuda_unsafeContiguous, result, t)
 55 | 
 56 | 
 57 | proc unsafeReshape*(t: CudaTensor, new_shape: varargs[int]): CudaTensor =
 58 |   ## Reshape a CudaTensor without copy.
 59 |   ##
 60 |   ## ⚠ Reshaping without copy is only possible on contiguous Tensors
 61 |   ##
 62 |   ## Warning ⚠:
 63 |   ##   This is a no-copy operation, data is shared with the input.
 64 |   ##   This proc does not guarantee that a ``let`` value is immutable.
 65 | 
 66 |   t.reshape_no_copy(new_shape)
 67 |   result.data = t.data
 68 | 
 69 | proc unsafeBroadcast*(t: CudaTensor, shape: varargs[int]): CudaTensor {.noSideEffect.}=
 70 |   ## Explicitly broadcast a CudaTensor to the specified shape.
 71 |   ## The returned broadcasted CudaTensor share the underlying data with the input.
 72 |   ##
 73 |   ## Dimension(s) of size 1 can be expanded to arbitrary size by replicating
 74 |   ## values along that dimension.
 75 |   ##
 76 |   ## Warning ⚠:
 77 |   ##   This is a no-copy operation, data is shared with the input.
 78 |   ##   This proc does not guarantee that a ``let`` value is immutable.
 79 |   ##   A broadcasted tensor should not be modified and only used for computation.
 80 |   result = t
 81 |   result.broadcastT(shape)
 82 | 
 83 | proc unsafeSqueeze*(t: CudaTensor, axis: int): CudaTensor {.noSideEffect.}=
 84 |   ## Collapse the given axis, if the dimension is not 1; it does nothing
 85 |   ## Input:
 86 |   ##   - a CudaTensor
 87 |   ##   - an axis (dimension)
 88 |   ## Returns:
 89 |   ##   - a CudaTensor with singleton dimensions collapsed
 90 |   ## Warning ⚠:
 91 |   ##   This is a no-copy operation, data is shared with the input.
 92 |   ##   This proc does not guarantee that a ``let`` value is immutable.
 93 |   result = t
 94 |   result.squeezeT(axis)
 95 | 
 96 | proc unsafeUnsqueeze*(t: CudaTensor, axis: int): CudaTensor {.noSideEffect.}=
 97 |   ## Insert a new axis just before the given axis, increasing the CudaTensor
 98 |   ## dimension (rank) by 1
 99 |   ##   - a tensor with that new axis
100 |   ## Warning ⚠:
101 |   ##   This is a no-copy operation, data is shared with the input.
102 |   ##   This proc does not guarantee that a ``let`` value is immutable.
103 |   result = t
104 |   result.unsqueezeT(axis)


--------------------------------------------------------------------------------
/src/arraymancer/backend/cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # Data structures to ease interfacing with Cuda and kernels
 16 | 
 17 | proc cudaMalloc[T](size: int): ptr T {.noSideEffect, inline.}=
 18 |   ## Internal proc.
 19 |   ## Wrap CudaMAlloc(var pointer, size) -> Error_code
 20 |   let s = size * sizeof(T)
 21 |   check cudaMalloc(cast[ptr pointer](addr result), s)
 22 | 
 23 | proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.}=
 24 |   if not p[].isNil:
 25 |     check cudaFree(p[])
 26 | 
 27 | 
 28 | # ##############################################################
 29 | # # Base CudaSeq type
 30 | # # End goal is for it to have value semantics like Nim seq
 31 | 
 32 | proc newCudaSeq[T: SomeReal](length: int): CudaSeq[T] {.noSideEffect.}=
 33 |   result.len = length
 34 |   new(result.data, deallocCuda)
 35 |   result.data[] = cast[ptr UncheckedArray[T]](cudaMalloc[T](result.len))
 36 | 
 37 | # #########################################################
 38 | # # Sending tensor layout to Cuda Kernel
 39 | 
 40 | # # So that layout->strides can be used in Cuda kernel, it's easier if everything is declared from cpp
 41 | # # pending https://github.com/nim-lang/Nim/issues/6415
 42 | #
 43 | # template create_CudaTensorLayout(N: static[int]) =
 44 | #   ## This Layout in C++ will be overriden by a CudaMemCpy from the Nim data structure
 45 | #   {. emit:[ """
 46 | #
 47 | #     template <typename T>
 48 | #     struct CudaTensorLayout {
 49 | #       int rank;
 50 | #       int shape[""", N,"""];
 51 | #       int strides[""", N,"""];
 52 | #       int offset;
 53 | #       T * __restrict__ data;
 54 | #       };
 55 | #
 56 | #
 57 | #   """].}
 58 | #
 59 | # create_CudaTensorLayout(MAXRANK)
 60 | 
 61 | type
 62 |   # CudaLayoutArray = array[MAXRANK, cint]
 63 |   # This will replace the current ref[ptr T] for shape and strides in the future
 64 |   ## Using arrays instead of seq avoids having to indicate __restrict__ everywhere to indicate no-aliasing
 65 |   ## We also prefer stack allocated array sice the data will be used at every single loop iteration to compute elements position.
 66 |   ## Ultimately it avoids worrying about deallocation too
 67 |   CudaLayoutArray = ref[ptr cint]
 68 | 
 69 | 
 70 |   CudaTensorLayout [T: SomeReal] = object
 71 |     ## Mimicks CudaTensor
 72 |     ## This will be stored on GPU in the end
 73 |     ## Goal is to avoids clumbering proc with cudaMemcpyshape, strides, offset, data, rank, len
 74 |     ##
 75 |     ## Check https://github.com/mratsim/Arraymancer/issues/26 (Optimizing Host <-> Cuda transfer)
 76 |     ## on why I don't (yet?) use Unified Memory and choose to manage it manually.
 77 | 
 78 |     rank: cint               # Number of dimension of the tensor
 79 |     shape: CudaLayoutArray
 80 |     strides: CudaLayoutArray
 81 |     offset: cint
 82 |     data: ptr T              # Data on Cuda device
 83 |     len: cint                # Number of elements allocated in memory
 84 | 
 85 | proc layoutOnDevice*[T:SomeReal](t: CudaTensor[T]): CudaTensorLayout[T] {.noSideEffect.}=
 86 |   ## Store a CudaTensor shape, strides, etc information on the GPU
 87 |   #
 88 |   # TODO: instead of storing pointers to shape/stride/etc that are passed to each kernel
 89 |   # pass the layout object directly and call it with layout->shape, layout->rank
 90 | 
 91 |   result.rank = t.rank.cint
 92 | 
 93 |   result.offset = t.offset.cint
 94 |   result.data = t.get_data_ptr
 95 |   result.len = t.size.cint
 96 | 
 97 |   new result.shape, deallocCuda
 98 |   new result.strides, deallocCuda
 99 | 
100 |   result.shape[] = cudaMalloc[cint](MAXRANK)
101 |   result.strides[] = cudaMalloc[cint](MAXRANK)
102 | 
103 |   var
104 |     tmp_shape: array[MAXRANK, cint] # CudaLayoutArray
105 |     tmp_strides: array[MAXRANK, cint] # CudaLayoutArray
106 | 
107 |   for i in 0..<result.rank:
108 |     tmp_shape[i] = t.shape[i].cint
109 |     tmp_strides[i] = t.strides[i].cint
110 | 
111 | 
112 |   # TODO: use streams and async
113 |   let size = t.rank * sizeof(cint)
114 |   check cudaMemCpy(result.shape[], addr tmp_shape[0], size, cudaMemcpyHostToDevice)
115 |   check cudaMemCpy(result.strides[], addr tmp_strides[0], size, cudaMemcpyHostToDevice)


--------------------------------------------------------------------------------
/examples/ex01_xor_perceptron_from_scratch.nim:
--------------------------------------------------------------------------------
  1 | import ../src/arraymancer_nn, ../src/arraymancer_ag, ../src/arraymancer, ../src/arraymancer_nn_primitives
  2 | import future
  3 | 
  4 | # Example multilayer perceptron in Arraymancer.
  5 | 
  6 | # We will use as examples the OR function similar to this article:
  7 | # https://blog.dbrgn.ch/2013/3/26/perceptrons-in-python/
  8 | 
  9 | 
 10 | # Okay let's start
 11 | # With x and y being one sample, the perceptron equation is
 12 | #
 13 | # Layer 1
 14 | # n1 = relu(a1 * x + b1 * y + c1) # First neuron + relu activation
 15 | # n2 = relu(a2 * x + b2 * y + c2) # 2nd neuron + relu activation
 16 | # n3 = relu(a3 * x + b3 * y + c3) # 3nd neuron + relu activation
 17 | #
 18 | # Layer 2
 19 | # classifier =  a4 * n1 + b4 * n2 + c4 * n3
 20 | #
 21 | # Loss
 22 | # loss = cross_entropy(sigmoid(classifier))
 23 | 
 24 | 
 25 | 
 26 | # In terms of high level layers this becomes:
 27 | # Input --> Linear(out_features = 3) --> relu --> Linear(out_features = 1) --> Sigmoid --> Cross-Entropy Loss
 28 | 
 29 | # Let's go
 30 | 
 31 | # First create a context that will store backpropagation information
 32 | let ctx = newContext Tensor[float32]
 33 | 
 34 | # We will pass batches of 32 samples
 35 | let bsz = 32 #batch size
 36 | 
 37 | # We will create a tensor of size 3200 --> 100 batch sizes of 32
 38 | # We create it as int between [0, 2[ (2 excluded) and convert to bool
 39 | let x_train_bool = randomTensor([bsz * 100, 2], 2).astype(bool) # generate batch_size examples of (0,1) combination
 40 | 
 41 | # Let's check the first 32
 42 | echo x_train_bool[0..<32, _]
 43 | # Tensor of shape 32x2 of type "bool" on backend "Cpu"
 44 | # |true   false|
 45 | # |true   true|
 46 | # |false  false|
 47 | # |false  true|
 48 | # |false  false|
 49 | # |false  false|
 50 | # |false  false|
 51 | # ...
 52 | 
 53 | # Let's build or truth labels. We need to apply xor between the 2 columns of the tensors
 54 | 
 55 | proc xor_alt[T](x,y: T): T =
 56 |   ## xor is builtin and cannot be passed to map as is
 57 |   x xor y
 58 | 
 59 | # We map or new xor function to matching elements of the subtensors
 60 | let y_bool = map2(x_train_bool[_,0], xor_alt, x_train_bool[_,1])
 61 | echo y_bool[0..<32, _]
 62 | # Tensor of shape 32x1 of type "bool" on backend "Cpu"
 63 | #         true|
 64 | #         false|
 65 | #         false|
 66 | #         true|
 67 | #         false|
 68 | #         false|
 69 | #         false|
 70 | #         true|
 71 | #         false|
 72 | #         ...
 73 | 
 74 | # Convert to float,
 75 | # Important: To improve perf, Arraymancer expects batch size to be last
 76 | # so we transpose
 77 | let x_train = ctx.variable(x_train_bool.astype(float32).transpose)
 78 | let y = y_bool.astype(float32).transpose
 79 | 
 80 | # Now we create layer of neurons W that we will train to reproduce the xor function.
 81 | # Weights are of this shape: [W: out_features, in_features]
 82 | 
 83 | # First hidden layer of 3 neurons, with 2 features in
 84 | # We initialize with random weights between -1 and 1
 85 | let layer_3neurons = ctx.variable(
 86 |                       randomTensor(3, 2, 2.0f) .- 1.0f
 87 |                       )
 88 | 
 89 | # Classifier layer with 1 neuron per feature. (In our case only one neuron overall)
 90 | # We initialize with random weights between -1 and 1
 91 | let classifier_layer = ctx.variable(
 92 |                   randomTensor(1, 3, 2.0f) .- 1.0f
 93 |                   )
 94 | # We use Stochastic Gradient Descent as optimizer
 95 | # With gradient descent the weigth are updated as follows:
 96 | # W -= learning_rate * dW
 97 | let optim = newSGD[float32](
 98 |   layer_3neurons, classifier_layer, 0.01f # 0.01 is the learning rate
 99 | )
100 | 
101 | # Now let's setup the training loops.
102 | # First loop is passing the mini-batch, bacpropagating, updating the gradients.
103 | # We do it until the whole x_train tensor has been passed through.
104 | # This is one "epoch".
105 | 
106 | # Usually after each epoch we "validate" with a test set that the network was never trained on
107 | # how the network generalized. In this example we won't go there to keep it short.
108 | 
109 | # We will do 5 epochs, passing the 32*100 minibatches
110 | for epoch in 0..5:
111 | 
112 |   for batch_id in 0..<100:
113 | 
114 |     # offset in the Tensor (Remember, batch size is last)
115 |     let offset = batch_id * 32
116 |     let x = x_train[_, offset ..< offset + 32]
117 |     let target = y[_, offset ..< offset + 32]
118 | 
119 |     # Building the network
120 |     let n1 = linear(x, layer_3neurons)
121 |     let n1_act = n1.relu
122 |     let n2 = linear(n1_act, classifier_layer)
123 |     let loss = sigmoid_cross_entropy(n2, target)
124 | 
125 |     echo "Epoch is:" & $epoch
126 |     echo "Batch id:" & $batch_id
127 | 
128 |     echo "Loss is:" & $loss.value.data[0]
129 | 
130 |     # Compute the gradient (i.e. contribution of each parameter to the loss)
131 |     loss.backprop()
132 | 
133 |     # Correct the weights now that we have the gradient information
134 |     optim.update()


--------------------------------------------------------------------------------
/arraymancer.nimble:
--------------------------------------------------------------------------------
  1 | ### Package
  2 | version       = "0.2.0"
  3 | author        = "Mamy André-Ratsimbazafy"
  4 | description   = "A n-dimensional tensor (ndarray) library"
  5 | license       = "Apache License 2.0"
  6 | 
  7 | ### Dependencies
  8 | requires "nim >= 0.17.2", "nimblas >= 0.1.3", "nimcuda >= 0.1.4"
  9 | 
 10 | ## Install files
 11 | srcDir = "src"
 12 | 
 13 | ########################################################
 14 | # External libs configuration
 15 | 
 16 | ### BLAS support
 17 | ## OSX
 18 | # switch("define","openblas")
 19 | # switch("clibdir", "/usr/local/opt/openblas/lib")
 20 | # switch("cincludes", "/usr/local/opt/openblas/include")
 21 | 
 22 | ### BLIS support
 23 | # switch("define","blis")
 24 | 
 25 | ### MKL support
 26 | # Check the mkl switches in the test file for single-threaded and openp version
 27 | 
 28 | ### Cuda configuration
 29 | ## Pass -d:cuda to build arraymancer with cuda support
 30 | ## Use the cuda switches below
 31 | ## Replace /opt/cuda by your own path
 32 | ## TODO: auto detection or at least check in common directories
 33 | ## Note: It is import to gate compiler flags like -march=native  behind Xcompiler "-Xcompiler -march=native"
 34 | 
 35 | template cudaSwitches() =
 36 |   switch("cincludes", "/opt/cuda/include")
 37 |   switch("cc", "gcc") # We trick Nim about nvcc being gcc, pending https://github.com/nim-lang/Nim/issues/6372
 38 |   switch("gcc.exe", "/opt/cuda/bin/nvcc")
 39 |   switch("gcc.linkerexe", "/opt/cuda/bin/nvcc")
 40 |   switch("gcc.cpp.exe", "/opt/cuda/bin/nvcc")
 41 |   switch("gcc.cpp.linkerexe", "/opt/cuda/bin/nvcc")
 42 |   # Due to the __ldg intrinsics in kernels
 43 |   # we only support compute capabilities 3.5+
 44 |   # See here: http://docs.nvidia.com/cuda/pascal-compatibility-guide/index.html
 45 |   # And wikipedia for GPU capabilities: https://en.wikipedia.org/wiki/CUDA
 46 |   switch("gcc.options.always", "-arch=sm_61 --x cu") # Interpret .c files as .cu
 47 |   switch("gcc.cpp.options.always", "-arch=sm_61 --x cu -Xcompiler -fpermissive") # Interpret .c files as .cu, gate fpermissive behind Xcompiler
 48 | 
 49 | when defined(cuda):
 50 |   cudaSwitches
 51 | 
 52 | ########################################################
 53 | # Optimization
 54 | 
 55 | # Multithreading
 56 | # use the -d:openmp switch
 57 | # which passC: -fopenmp to the compiler
 58 | 
 59 | # Native processor optimization
 60 | # use the -d:native
 61 | # which passC: -march=native to the compiler
 62 | 
 63 | 
 64 | ##########################################################################
 65 | ## Testing tasks
 66 | 
 67 | proc test(name: string, lang: string = "c") =
 68 |   if not dirExists "bin":
 69 |     mkDir "bin"
 70 |   if not dirExists "nimcache":
 71 |     mkDir "nimcache"
 72 |   --run
 73 |   --nimcache: "nimcache"
 74 |   switch("out", ("./bin/" & name))
 75 |   setCommand lang, "tests/" & name & ".nim"
 76 | 
 77 | task test, "Run all tests - Default BLAS":
 78 |   test "all_tests"
 79 | 
 80 | task test_cuda, "Run all tests - Cuda backend with CUBLAS":
 81 |   switch("define","cuda")
 82 |   cudaSwitches # Unfortunately the "switch" line doesn't also trigger
 83 |                # the "when defined(cuda)" part of this nimble file
 84 |                # hence the need to call cudaSwitches explicitly
 85 |   test "all_tests_cuda", "cpp"
 86 | 
 87 | task test_deprecated, "Run all tests on deprecated static[Backend] procs":
 88 |   test "all_tests_deprecated"
 89 | 
 90 | task test_openblas, "Run all tests - OpenBLAS":
 91 |   ## Should work but somehow Nim doesn't find libopenblas.dylib on MacOS
 92 |   when defined(macosx):
 93 |     switch("define","blas=openblas")
 94 |     switch("clibdir", "/usr/local/opt/openblas/lib")
 95 |     switch("cincludes", "/usr/local/opt/openblas/include")
 96 |   test "all_tests"
 97 | 
 98 | task test_blis, "Run all tests - BLIS":
 99 |   switch("define","blis")
100 |   test "all_tests"
101 | 
102 | task test_native, "Run all tests - march=native":
103 |   switch("define","native")
104 |   test "all_tests"
105 | 
106 | task test_openmp, "Run all tests - OpenMP":
107 |   switch("define","openmp")
108 |   test "all_tests"
109 | 
110 | task test_mkl, "Run all tests - Intel MKL - single threaded":
111 |   switch("define","blas=mkl_intel_lp64")
112 |   switch("clibdir", "/opt/intel/mkl/lib/intel64")
113 |   switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a")
114 |   switch("passl", "-lmkl_core")
115 |   switch("passl", "-lmkl_sequential")
116 |   switch("dynlibOverride","mkl_intel_lp64")
117 |   test "all_tests"
118 | 
119 | task test_mkl_omp, "Run all tests - Intel MKL + OpenMP":
120 |   switch("define","openmp")
121 |   switch("define","blas=mkl_intel_lp64")
122 |   switch("clibdir", "/opt/intel/mkl/lib/intel64")
123 |   switch("passl", "/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a")
124 |   switch("passl", "-lmkl_core")
125 |   switch("passl", "-lmkl_gnu_thread")
126 |   switch("passl", "-lgomp")
127 |   switch("dynlibOverride","mkl_intel_lp64")
128 |   test "all_tests"
129 | 
130 | task test_release, "Run all tests - Release mode":
131 |   switch("define","release")
132 |   test "all_tests"
133 | 
134 | task gen_doc, "Generate Arraymancer documentation":
135 |   switch("define", "doc")
136 |   exec "nim doc2 src/arraymancer"
137 | 


--------------------------------------------------------------------------------
/src/arraymancer/fallback/blas_l3_gemm.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | # The following code is heavily inspired by ulmBLAS (http://apfel.mathematik.uni-ulm.de/~lehn/ulmBLAS/)
 16 | # which is heavily inspired by BLIS (https://github.com/flame/blis)
 17 | # A big difference (for now?) is instead of passing (const) pointers I pass the (var) array and a var offset.
 18 | 
 19 | # # Reading
 20 | # C++ version: https://stackoverflow.com/questions/35620853/how-to-write-a-matrix-matrix-product-that-can-compete-with-eigen
 21 | # uBLAS C++: http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session1/page01.html
 22 | # Blaze C++: http://www.mathematik.uni-ulm.de/~lehn/test_blaze/session1/page01.html
 23 | # Rust BLIS inspired: https://github.com/bluss/matrixmultiply
 24 | 
 25 | # ### TODO:
 26 | # - OpenMP parallelization
 27 | # {.passl: "-fopenmp".} # Issue: Clang OSX does not support openmp
 28 | # {.passc: "-fopenmp".} # and the default GCC is actually a link to Clang
 29 | 
 30 | # - Loop unrolling  # Currently Nim `unroll` pragma exists but is ignored.
 31 | # - Pass `-march=native` to the compiler
 32 | # - Align memory # should be automatic
 33 | # - Is there a way to get L1/L2 cache size at compile-time
 34 | # - Is there a way to get number of registers at compile-time
 35 | 
 36 | # Best numbers depend on
 37 | # L1, L2, L3 cache and register size
 38 | 
 39 | # L1 cache: 32 KB data + 32 KB instructions since Nehalem (per proc)
 40 | # L2 cache: 256KB since Nehalem
 41 | # X86-64 Register size: 16 registers 128-bit (16 Bytes) wide (SSE2), 256-bit with AVX
 42 | # Loading int in AVX registers needs AVX2 support in CPU.
 43 | # Everything must be aligned in memory for faster loading in registers.
 44 | 
 45 | # Int/float64 takes 4B
 46 | # float32 takes 2B
 47 | # --> use "when" to parametrize size at compile-time?
 48 | 
 49 | const MC = 96
 50 | const KC = 256
 51 | const NC = 4096
 52 | 
 53 | # The following should be bigger (4x8) but somehow it hurts my performance
 54 | # It might be because the compiler is not using the large AVX registers by default.
 55 | const MR = 2
 56 | const NR = 2
 57 | 
 58 | #                    Panels of B of size KC * NR resides in L1 cache
 59 | const MCKC = MC*KC # A resides in L2 cache
 60 | const KCNC = KC*NC # B resides in L3 cache
 61 | const MRNR = MR*NR # Work area: Fit in registers
 62 | 
 63 | 
 64 | include ./blas_l3_gemm_packing
 65 | include ./blas_l3_gemm_aux
 66 | include ./blas_l3_gemm_micro_kernel
 67 | include ./blas_l3_gemm_macro_kernel
 68 | 
 69 | proc newBufferArray[T: SomeNumber](N: static[int], typ: typedesc[T]): ref array[N, T]  {.noSideEffect.} =
 70 |   new result
 71 |   for i in 0 ..< N:
 72 |     result[i] = 0.T
 73 | 
 74 | proc gemm_nn_fallback[T](m, n, k: int,
 75 |                 alpha: T,
 76 |                 A: seq[T], offA: int,
 77 |                 incRowA, incColA: int,
 78 |                 B: seq[T], offB: int,
 79 |                 incRowB, incColB: int,
 80 |                 beta: T,
 81 |                 C: var seq[T], offC: int,
 82 |                 incRowC, incColc: int)  {.noSideEffect.} =
 83 | 
 84 |   let
 85 |     mb = (m + MC - 1) div MC
 86 |     nb = (n + NC - 1) div NC
 87 |     kb = (k + KC - 1) div KC
 88 | 
 89 |     mod_mc = m mod MC
 90 |     mod_nc = n mod NC
 91 |     mod_kc = k mod KC
 92 | 
 93 |   var mc, nc, kc: int
 94 |   var tmp_beta: T
 95 | 
 96 |   {.pragma: align16, codegenDecl: "$# $# __attribute__((aligned(16)))".}
 97 |   var buffer_A{.align16.} = newBufferArray(MCKC, T)
 98 |   var buffer_B{.align16.} = newBufferArray(KCNC, T)
 99 |   var buffer_C{.align16.} = newBufferArray(MRNR, T)
100 | 
101 |   if alpha == 0.T or k == 0:
102 |     gescal(m, n, beta, C, offC, incRowC, incColC)
103 |     return
104 | 
105 |   for j in 0 ..< nb:
106 |     nc =  if (j != nb-1 or mod_nc == 0): NC
107 |           else: mod_nc
108 | 
109 |     for k in 0 ..< kb:
110 |       kc       =  if (k != kb-1 or mod_kc == 0): KC
111 |                   else: mod_kc
112 |       tmp_beta =  if k == 0: beta
113 |                   else: 1.T
114 | 
115 |       pack_dim( nc, kc,
116 |                 B, k*KC*incRowB + j*NC*incColB + offB,
117 |                 incColB, incRowB, NR,
118 |                 buffer_B)
119 | 
120 |       for i in 0 ..< mb:
121 |         mc = if (i != mb-1 or mod_mc == 0): MC
122 |              else: mod_mc
123 | 
124 |         pack_dim( mc, kc,
125 |                   A, i*MC*incRowA + k*KC*incColA + offA,
126 |                   incRowA, incColA, MR,
127 |                   buffer_A)
128 | 
129 |         gemm_macro_kernel(mc, nc, kc,
130 |                           alpha, tmp_beta,
131 |                           C, i*MC*incRowC + j*NC*incColC + offC,
132 |                           incRowC, incColC, buffer_A, buffer_B, buffer_C)


--------------------------------------------------------------------------------
/src/arraymancer/higher_order_cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 the Arraymancer contributors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # Note: Maximum number of threads per block is
 17 | # 1024 on Pascal GPU, i.e. 32 warps of 32 threads
 18 | 
 19 | 
 20 | # Important CUDA optimization
 21 | # To loop over each element of an array with arbitrary length
 22 | # use grid-strides for loop: https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
 23 | #
 24 | # Avoid branching in the same warp (32 threads), otherwise it reverts to serial execution.
 25 | # "idx < length" can be converted to "idx = max( idx, 0); idx = min( idx, length);"
 26 | # for example. (Beware of aliasing)
 27 | 
 28 | # TODO, use an on-device struct to store, shape, strides, offset
 29 | # And pass arguments via a struct pointer to limite register pressure
 30 | 
 31 | {.emit:["""
 32 |   template<typename T, typename Op>
 33 |   __global__ void cuda_apply2(const int rank,
 34 |                               const int len,
 35 |                               const int *  __restrict__ dst_shape,
 36 |                               const int *  __restrict__ dst_strides,
 37 |                               const int dst_offset,
 38 |                               T * __restrict__ dst_data,
 39 |                               Op f,
 40 |                               const int *  __restrict__ src_shape,
 41 |                               const int *  __restrict__ src_strides,
 42 |                               const int src_offset,
 43 |                               const T * __restrict__ src_data){
 44 | 
 45 |     for (int elemID = blockIdx.x * blockDim.x + threadIdx.x;
 46 |          elemID < len;
 47 |          elemID += blockDim.x * gridDim.x) {
 48 | 
 49 |       // ## we can't instantiate the variable outside the loop
 50 |       // ## each threads will store its own in parallel
 51 |       const int dst_real_idx = cuda_getIndexOfElementID(
 52 |                                rank,
 53 |                                dst_shape,
 54 |                                dst_strides,
 55 |                                dst_offset,
 56 |                                elemID);
 57 | 
 58 |       const int src_real_idx = cuda_getIndexOfElementID(
 59 |                                rank,
 60 |                                src_shape,
 61 |                                src_strides,
 62 |                                src_offset,
 63 |                                elemID);
 64 | 
 65 |       f(&dst_data[dst_real_idx], &src_data[src_real_idx]);
 66 |     }
 67 |   }
 68 | """].}
 69 | 
 70 | 
 71 | {.emit:["""
 72 |   template<typename T, typename Op>
 73 |   __global__ void cuda_apply3(const int rank,
 74 |                               const int len,
 75 |                               const int *  __restrict__ dst_shape,
 76 |                               const int *  __restrict__ dst_strides,
 77 |                               const int dst_offset,
 78 |                               T * __restrict__ dst_data,
 79 |                               const int *  __restrict__ A_shape,
 80 |                               const int *  __restrict__ A_strides,
 81 |                               const int A_offset,
 82 |                               const T * __restrict__ A_data,
 83 |                               Op f,
 84 |                               const int *  __restrict__ B_shape,
 85 |                               const int *  __restrict__ B_strides,
 86 |                               const int B_offset,
 87 |                               const T * __restrict__ B_data){
 88 | 
 89 |     for (int elemID = blockIdx.x * blockDim.x + threadIdx.x;
 90 |          elemID < len;
 91 |          elemID += blockDim.x * gridDim.x) {
 92 | 
 93 |       // ## we can't instantiate the variable outside the loop
 94 |       // ## each threads will store its own in parallel
 95 |       const int dst_real_idx = cuda_getIndexOfElementID(
 96 |                                rank,
 97 |                                dst_shape,
 98 |                                dst_strides,
 99 |                                dst_offset,
100 |                                elemID);
101 | 
102 |       const int A_real_idx = cuda_getIndexOfElementID(
103 |                                rank,
104 |                                A_shape,
105 |                                A_strides,
106 |                                A_offset,
107 |                                elemID);
108 | 
109 |       const int B_real_idx = cuda_getIndexOfElementID(
110 |                                rank,
111 |                                B_shape,
112 |                                B_strides,
113 |                                B_offset,
114 |                                elemID);
115 | 
116 |       f(&dst_data[dst_real_idx], &A_data[A_real_idx], &B_data[B_real_idx]);
117 |     }
118 |   }
119 | """].}


--------------------------------------------------------------------------------
/src/nn_primitives/sigmoid_cross_entropy_primitives.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 the Arraymancer contributors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import ../arraymancer
 16 | import math
 17 | 
 18 | # Sigmoid cross-entropy function that works directly on Tensors
 19 | # and provide control without autograd
 20 | 
 21 | proc check_input_target[T](input, target: Tensor[T]) {.inline.}=
 22 |   if input.shape != target.shape:
 23 |     raise newException(ValueError, "Input shape " & $input.shape &
 24 |       " and target shape " & $target.shape & " should be the same")
 25 | 
 26 | proc sigmoid_cross_entropy*[T](input, target: Tensor[T]): T {.inline.} =
 27 |   ## Sigmoid function + Cross-Entropy loss fused in one layer.
 28 |   ## This leverage the log-sum-exp trick for improved numerical stability
 29 |   ## It is also faster than calling both separately
 30 |   ##
 31 |   ## Input:
 32 |   ##   - A Tensor
 33 |   ##   - The target values
 34 |   ## Returns:
 35 |   ##   - Apply and sigmoid activation and returns the cross-entropy loss.
 36 |   ## Shape:
 37 |   ##   - Both the cache and target shape should be @[features, batchsize] i.e. number of samples as last dimension
 38 | 
 39 | 
 40 |   # TODO: term rewriting macro for auto fusion
 41 | 
 42 |   when compileOption("boundChecks"):
 43 |     check_input_target(input, target)
 44 | 
 45 |   result = 0.T
 46 |   for xi, ti in zip(input, target):
 47 |     result += (-ti * xi +  max(xi,0) + ln(1 + exp(-abs(xi))) ) / T(input.shape[1]) #input.shape[1] is the batch size
 48 | 
 49 | 
 50 | proc sigmoid_cross_entropy_backward*[T](
 51 |         gradient: Tensor[T] or T,
 52 |         cached_tensor: Tensor[T],
 53 |         target: Tensor[T]
 54 |         ): Tensor[T] {.inline.} =
 55 |   ## Derivatives of sigmoid_cross_entropy
 56 |   ## Input:
 57 |   ##   - The input gradient as a scalar or a Tensor
 58 |   ##   - A cache tensor that contains data from before the forward pass
 59 |   ##   - The target values
 60 |   ## Shape:
 61 |   ##   - Both the cache and target shape should be @[features, batchsize] i.e. number of samples as last dimension
 62 |   let batch_size = cached_tensor.shape[^1]
 63 | 
 64 |   # Deal with scalar and tensor gradient
 65 |   when gradient is T:
 66 |     let grad = gradient
 67 |   elif gradient is Tensor:
 68 |     let grad = gradient.data[gradient.offset]
 69 | 
 70 |   proc sigmoid_cross_entropy_backward_closure[T](xi, ti: T): T =
 71 |     grad * ( 1.T / (1.T + exp(-xi)) - ti) / T(batch_size)
 72 | 
 73 |   return map2(cached_tensor, sigmoid_cross_entropy_backward_closure, target)
 74 | 
 75 | # ################################################
 76 | # Explanation of sigmoid cross-entropy algorithms:
 77 | 
 78 | # ############
 79 | # Forward pass
 80 | 
 81 | # Cross-entropy has the following form for a single sample
 82 | # CEi(yi, yi') = − ( ti ln(yi) + (1−ti) ln(1−yi) )
 83 | 
 84 | # Since we pass a minibatch of several samples we should average by minibatch size (1/batchsize)
 85 | # to keep the gradient magnitude/weight updates on the same scale as a single sample pass
 86 | # CE(y, y') = − 1/n ∑i( ti ln(yi) + (1−ti) ln(1−yi) )
 87 | 
 88 | # yi = ln(sigmoid(xi)) = ln(1/(1+e^-xi)) = ln(e^xi/( 1 + e^xi ))
 89 | # yi = x - ln(1 + e^xi)
 90 | 
 91 | # 1 - yi = ln(1 - sigmoid(xi)) = ln(1 + e^xi - e^xi) / (1 + e^xi))
 92 | # 1 - yi = - ln(1 + e^xi)
 93 | 
 94 | # Replacing Sigmoid Cross Entropy
 95 | # SCE(x, y') = − 1/n ∑i(ti * (xi - ln(1 + e^xi)) + (1−ti) * -ln(1 + e^xi) )
 96 | #            = − 1/n ∑i(ti * xi - ti * ln(1 + e^xi) -ln(1 + e^xi) + ti * ln(1 + e^xi) )
 97 | #            = − 1/n ∑i(ti * xi - ln(1 + e^xi) )
 98 | #            = − 1/n ∑i(ti * xi - ln(e^0 + e^xi) )
 99 | #
100 | # Using the logsumexp trick with factorize by a constant
101 | # c = max(xi, 0)
102 | #
103 | # SCE(x, y') = − 1/n ∑i(ti * xi - ln(e^c *( e^(0-c) + e^(xi-c))
104 | #            = − 1/n ∑i(ti * xi - ln(e^c *( e^(0-c) + e^(xi-c))
105 | #            = − 1/n ∑i(ti * xi - c - ln(e^-c + e^(xi-c))
106 | #
107 | # If c = xi (xi > 0), ln(e^-c + e^(xi-c)) becomes ln(e^-xi + 1)
108 | # else c = 0 (xi < 0 ), ln(e^-c + e^(xi-c)) becomes ln(1 + e^xi)
109 | # Both cases are covered by ln(1 + e^-|xi|)
110 | #
111 | # Finally
112 | # SCE(x, y') = − 1/n ∑i(ti * xi - max(xi,0) - ln(1 + e^-|xi|)
113 | #
114 | #
115 | #
116 | # Other idea: streaming maximum (http://www.nowozin.net/sebastian/blog/streaming-log-sum-exp-computation.html)
117 | #
118 | 
119 | # #############
120 | # Backward pass
121 | 
122 | # Derivative of Sigmoid-CE:
123 | # We start from this formula: SCE(x, y') = − 1/n ∑i(ti * xi - ln(1 + e^xi) )
124 | #                                        = 1/n ∑i(-ti * xi + ln(1 + e^xi) )
125 | #
126 | # On a single sample:
127 | # dSCE/dxi = d/dxi (-ti * xi + ln(1 + e^xi))
128 | #          = -ti + e^xi * 1/(1 + e^xi))
129 | #          = -ti * sigmoid(xi)
130 | #
131 | # For a vector of samples
132 | # dSCE/dx = 1/n ∑i( sigmoid(xi) - ti )
133 | 


--------------------------------------------------------------------------------
/src/arraymancer/init_cuda.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | proc unsafeView*[T](t: CudaTensor[T]): CudaTensor[T] {.inline,noSideEffect.}=
 16 |   ## Input:
 17 |   ##     - A CudaTensor
 18 |   ## Returns:
 19 |   ##     - A shallow copy.
 20 |   ##
 21 |   ## Warning ⚠
 22 |   ##   Both tensors shares the same memory. Data modification on one will be reflected on the other.
 23 |   ##   However modifying the shape, strides or offset will not affect the other.
 24 | 
 25 |   # shape and strides fields have value semantics by default
 26 |   # CudaSeq has ref semantics
 27 |   system.`=`(result, t)
 28 | 
 29 | proc clone*[T](t: CudaTensor[T]): CudaTensor[T] =
 30 |   ## Clone (deep copy) a CudaTensor.
 31 |   ## Copy will not share its data with the original.
 32 |   ##
 33 |   ## Tensor is copied as is. For example it will not be made contiguous.
 34 |   ## Use `unsafeContiguous` for this case
 35 | 
 36 |   # Note: due to modifying the defaultStream global var for async memcopy
 37 |   # proc cannot be tagged noSideEffect
 38 | 
 39 |   result.shape = t.shape
 40 |   result.strides = t.strides
 41 |   result.offset = t.offset
 42 |   result.data = newCudaSeq[T](t.data.len)
 43 |   let size = t.data.len * sizeof(T)
 44 | 
 45 |   check cudaMemCpyAsync(result.get_data_ptr,
 46 |                         t.get_data_ptr,
 47 |                         size,
 48 |                         cudaMemcpyDeviceToDevice,
 49 |                         defaultStream) # defaultStream is a cudaStream_t global var
 50 | 
 51 | # ###########################################################
 52 | # Implement value semantics for CudaTensor
 53 | # Pending https://github.com/nim-lang/Nim/issues/6348
 54 | # Tracked in https://github.com/mratsim/Arraymancer/issues/19
 55 | #
 56 | # proc `=`*[T](dest: var CudaTensor[T]; src: CudaTensor[T]) =
 57 | #   ## Overloading the assignment operator
 58 | #   ## It will have value semantics by default
 59 | #   dest.shape = src.shape
 60 | #   dest.strides = src.strides
 61 | #   dest.offset = src.offset
 62 | #   dest.data = newCudaSeq(src.data.len)
 63 | #
 64 | #   let size = dest.size * sizeof(T)
 65 | #
 66 | #   check cudaMemCpy(dest.get_data_ptr,
 67 | #                    src.get_data_ptr,
 68 | #                    size,
 69 | #                    cudaMemcpyDeviceToDevice)
 70 | #   echo "Value copied"
 71 | #
 72 | # proc `=`*[T](dest: var CudaTensor[T]; src: CudaTensor[T]{call}) {.inline.}=
 73 | #   ## Overloading the assignment operator
 74 | #   ## Optimized version that knows that
 75 | #   ## the source CudaTensor is unique and thus don't need to be copied
 76 | #   system.`=`(result, t)
 77 | #   echo "Value moved"
 78 | 
 79 | proc newCudaTensor[T: SomeReal](shape: varargs[int], layout: OrderType = colMajor): CudaTensor[T] {.noSideEffect.}=
 80 |   ## Internal proc
 81 |   ## Allocate a CudaTensor
 82 |   ## WARNING: The Cuda memory is not initialized to 0
 83 | 
 84 |   # TODO: default to RowMajor. Pending https://github.com/mratsim/Arraymancer/issues/22
 85 |   # As mentionned in design doc, an element-wise kernel will avoid relying on CuBLAS
 86 |   # for inplace operation that requires column major layout.
 87 | 
 88 |   result.shape = @shape
 89 |   result.strides = shape_to_strides(result.shape, layout)
 90 |   result.offset = 0
 91 |   result.data = newCudaSeq[T](result.size)
 92 | 
 93 | proc cuda*[T:SomeReal](t: Tensor[T]): CudaTensor[T] =
 94 |   ## Convert a tensor on Cpu to a tensor on a Cuda device.
 95 |   # Note: due to modifying the defaultStream global var for async copy
 96 |   # proc cannot be tagged noSideEffect
 97 | 
 98 |   result = newCudaTensor[T](t.shape)
 99 | 
100 |   # TODO: avoid reordering rowMajor tensors. This is only needed for inplace operation in CUBLAS.
101 |   let contig_t = t.unsafeContiguous(colMajor, force = true)
102 |   let size = result.size * sizeof(T)
103 | 
104 |   # For host to device we use non-blocking copy
105 |   # Host can proceed with computation.
106 |   # On CUDA device, next operations will be batch in the stream queue.
107 |   check cudaMemCpyAsync(result.get_data_ptr,
108 |                         contig_t.get_data_ptr,
109 |                         size,
110 |                         cudaMemcpyHostToDevice,
111 |                         defaultStream) # defaultStream is a cudaStream_t global var
112 | 
113 | proc cpu*[T:SomeReal](t: CudaTensor[T]): Tensor[T] {.noSideEffect.}=
114 |   ## Convert a tensor on a Cuda device to a tensor on Cpu.
115 |   # We use blocking copy in this case to make sure
116 |   # all data is available for future computation
117 | 
118 |   result.shape = t.shape
119 |   result.strides = t.strides
120 |   result.offset = t.offset
121 |   result.data = newSeqUninit[T](t.data.len) # We copy over all the memory allocated
122 | 
123 |   let size = t.data.len * sizeof(T)
124 | 
125 |   check cudaMemCpy(result.get_data_ptr,
126 |                    t.get_data_ptr,
127 |                    size,
128 |                    cudaMemcpyDeviceToHost)


--------------------------------------------------------------------------------
/src/arraymancer/display.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 Mamy André-Ratsimbazafy
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | proc bounds_display(t: Tensor,
 16 |           idx_data: tuple[val: string, idx: int]
 17 |           ): string {.noSideEffect.}=
 18 |   ## Internal routine, compare an index with the strides of a Tensor
 19 |   ## to check beginning and end of lines
 20 |   ## Add the delimiter "|" and line breaks at beginning and end of lines
 21 |   ## TODO: improve 3+D-tensors display
 22 |   let (val,idx) = idx_data
 23 |   let s = t.shape.reversed
 24 | 
 25 |   if val == "|":
 26 |     return " | "
 27 | 
 28 |   for i,j in s[0 .. ^2]: # We don't take the last element (the row in C convention)
 29 |     if idx mod j == 0:
 30 |       return "\t" & $val & "|\n"
 31 |     if idx mod j == 1:
 32 |       return "|" & $val
 33 |   return "\t" & $val
 34 | 
 35 | # TODO: Create a generic n-dimensional display function using nested tables.
 36 | # Example code in hTensor: https://github.com/albertoruiz/hTensor/blob/b36c3748b211c7f41c9af9d486c6ef320e2b7585/lib/Numeric/LinearAlgebra/Array/Display.hs#L92
 37 | 
 38 | # Last dim always in column (except vector)
 39 | # If rank is odd, first dim is along columns
 40 | # if rank is even, first dim is along row
 41 | 
 42 | # Expected for 2x3x3
 43 | #                      0                                       1
 44 | # ---------------------------------------
 45 | # 0,0 # 0,0,0    0,0,1    0,0,2    0,0,3 | 1,0 # 1,0,0    1,0,1    1,0,2    1,0,3
 46 | # 0,1 # 0,1,0    0,1,1    0,1,2    0,1,3 | 1,1 # 1,1,0    1,1,1    1,1,2    1,1,3
 47 | # 0,2 # 0,2,0    0,2,1    0,2,2    0,2,3 | 1,2 # 1,2,0    1,2,1    1,2,2    1,2,3
 48 | # ---------------------------------------
 49 | 
 50 | # Expected for 2x3x3x4
 51 | # 1   2   3   4| 13 14 15 16 | 25 26 27 28
 52 | # 5   6   7   8| 17 18 19 20 | 29 30 31 32
 53 | # 9  10  11  12| 21 22 23 24 | 33 34 35 36
 54 | # ----------------------------------------------
 55 | # 37 38  39  40| 49 59 51 52 | 61 62 63 64
 56 | # 41 42  43  44| 53 54 55 56 | 65 66 67 68
 57 | # 45 46  47  48| 57 58 59 60 | 69 70 71 72
 58 | 
 59 | # Test with
 60 | # let a = toSeq(1..24).toTensor(Cpu).reshape(2,3,4)
 61 | # echo a
 62 | # let b = toSeq(1..72).toTensor(Cpu).reshape(2,3,3,4)
 63 | # echo b
 64 | 
 65 | proc disp2d(t: Tensor): string {.noSideEffect.} =
 66 |   ## Display a 2D-tensor
 67 | 
 68 |   # Add a position index to each value in the Tensor.
 69 |   var indexed_data: seq[(string,int)] = @[]
 70 |   for i, value in t.enumerate:
 71 |     indexed_data.add(($value,i))
 72 | 
 73 |   # Create a closure to apply the boundaries transformation for the specific input
 74 |   proc curry_bounds(tup: (string,int)): string {.noSideEffect.}= t.bounds_display(tup)
 75 | 
 76 |   return indexed_data.concatMap(curry_bounds)
 77 | 
 78 | proc disp3d(t: Tensor): string =
 79 |   ## Display a 3D-tensor
 80 | 
 81 |   let sep: seq[string] = @["|"]
 82 |   let empty: seq[string] = @[]
 83 | 
 84 |   var buffer = empty.repeat(t.shape[1]).toTensor()
 85 | 
 86 |   for t0 in t.axis(0):
 87 |     buffer = buffer.concat(
 88 |               sep.repeat(t0.shape[1]).toTensor().reshape(t.shape[1],1),
 89 |               t0.map(x => $x).reshape(t.shape[1], t.shape[2]),
 90 |               axis = 1
 91 |               )
 92 | 
 93 |   return buffer.disp2d
 94 | 
 95 | proc disp4d(t: Tensor): string =
 96 |   ## Display a 4D-tensor
 97 | 
 98 |   let sep: seq[string] = @["|"]
 99 |   let sepv: seq[string] = @["-"]
100 |   let empty: seq[string] = @[]
101 | 
102 |   # First create seq of tensor to concat horizontally
103 |   var hbuffer = newSeqWith(t.shape[0], empty.repeat(t.shape[2]).toTensor())
104 | 
105 |   var i = 0
106 |   for s0 in t.axis(0):
107 |     let s0r = s0.reshape(t.shape[1],t.shape[2],t.shape[3])
108 |     for s1 in s0r.axis(0):
109 |       hbuffer[i] = hbuffer[i].concat(
110 |                 sep.repeat(t.shape[2]).toTensor().reshape(t.shape[2],1),
111 |                 s1.reshape(t.shape[2], t.shape[3]).map(x => $x),
112 |                 axis = 1
113 |                 )
114 |     inc i
115 | 
116 |   # Then concat vertically
117 |   var vbuffer = empty.repeat(hbuffer[0].shape[1]).toTensor().reshape(0, hbuffer[0].shape[1])
118 | 
119 |   for h in hbuffer:
120 |     vbuffer = vbuffer.concat(
121 |               sepv.repeat(hbuffer[0].shape[1]).toTensor().reshape(1, hbuffer[0].shape[1]),
122 |               h.map(x => $x).reshape(hbuffer[0].shape[0], hbuffer[0].shape[1]),
123 |               axis = 0
124 |               )
125 |   return vbuffer.disp2d
126 | 
127 | proc `$`*[T](t: Tensor[T]): string =
128 |   ## Pretty-print a tensor (when using ``echo`` for example)
129 |   let desc = "Tensor of shape " & t.shape.join("x") & " of type \"" & T.name & "\" on backend \"" & "Cpu" & "\""
130 |   if t.rank <= 2:
131 |     return desc & "\n" & t.disp2d
132 |   elif t.rank == 3:
133 |     return desc & "\n" & t.disp3d
134 |   elif t.rank == 4:
135 |     return desc & "\n" & t.disp4d
136 |   else:
137 |     return desc & "\n" & " -- NotImplemented: Display not implemented for tensors of rank > 4"


--------------------------------------------------------------------------------
/src/autograd/autograd.nim:
--------------------------------------------------------------------------------
  1 | # Copyright 2017 the Arraymancer contributors
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #     http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | import typetraits
 16 | 
 17 | const MAX_ARITY = 3 # Max arity/number of input of autograd operations
 18 | 
 19 | type
 20 |   Gate*[TT] = ref object {.inheritable.}
 21 |     arity*: int
 22 |     # Base operator or layer
 23 |     # Inherit from it and add a forward and backward method.
 24 |     # Each operations should set its arity (number of input)
 25 |     # Additional fields like weights, cache for bprop should be added too.
 26 | 
 27 |   Node*[TT] = ref NodeObj[TT]
 28 |   Parents*[TT] = array[MAX_ARITY, Variable[TT]]
 29 |   SmallDiffs*[TT] = array[MAX_ARITY, TT]  #TODO: how not to export that
 30 | 
 31 |   NodeObj[TT] = object
 32 |     # Store an operator/layer + its parent
 33 |     gate*: Gate[TT] #TODO: how not to export that
 34 |     parents*: Parents[TT] #TODO: how not to export that
 35 |     child*: Variable[TT] # Todo: avoid reference to child and add {.acyclic.}
 36 | 
 37 |   Context*[TT] = ref object
 38 |     ## Tape / Wengert list. Contains the list of applied operations or layers
 39 |     nodes: seq[Node[TT]]
 40 | 
 41 |   ## Considerations
 42 |   ## A variable can be used in 2 different computations, in that case both gate will point to it
 43 |   ## It can only have one ancestor
 44 | 
 45 |   Variable*[TT] = ref object
 46 |     ## Wrapper for values
 47 |     tape*: Context[TT] #TODO: how not to export that
 48 |     ancestor*: Node[TT] # Absence of ancestor will be represented by the nil value. TODO: Option type with no overhead: https://forum.nim-lang.org/t/3082
 49 |     value*: TT # TT should be a Tensor[T] or CudaTensor[T] or a scalar
 50 |     grad*: TT # gradient wrt to the last back propagation done
 51 |     # TODO make the grad initialization optional to optimize memory use
 52 | 
 53 | 
 54 | # Somehow if you declare forward before backward, you get invalid declaration order
 55 | # https://github.com/nim-lang/Nim/issues/5325
 56 | method backward*[TT](self: Gate[TT], gradient: TT): SmallDiffs[TT] {.base, inline.} =
 57 |   raise newException(ValueError, "backward method is not implemented for " & $self.type.name)
 58 | 
 59 | method forward*[TT](self: Gate[TT], a, b: Variable[TT]): Variable[TT] {.base, inline.} =
 60 |   # Binary forward
 61 |   raise newException(ValueError, "forward method is not implemented for " & $self.type.name)
 62 | 
 63 | method forward*[TT](self: Gate[TT], a: Variable[TT]): Variable[TT] {.base, inline.}=
 64 |   # Unary forward
 65 |   raise newException(ValueError, "forward method is not implemented for " & $self.type.name)
 66 | 
 67 | proc newContext*(TT: typedesc): Context[TT] {.inline, noSideEffect.} =
 68 |   ## Initialize a context (Tape / Wengert list)
 69 |   new result
 70 |   result.nodes = newSeq[Node[TT]]()
 71 | 
 72 | proc variable*[TT](ctx: Context[TT], value: TT): Variable[TT] {.inline, noSideEffect.} =
 73 |   ## Wrap a variable to the context
 74 |   ## T is a Tensour[T, CudaTensor[T] or scalar T
 75 |   # TODO make the grad initialization optional to optimize memory use
 76 |   return Variable[TT](tape: ctx, ancestor: nil, value: value, grad: value.zeros_like)
 77 | 
 78 | template len[TT](t: Context[TT]): int =
 79 |   ## Returns the number of operations applied in the context
 80 |   t.nodes.len()
 81 | 
 82 | template push*[TT](t: Context[TT], node: Node[TT]) = #TODO: how not to export that
 83 |   ## Append a new operation to the context
 84 |   t.nodes.add(node) #Appending in Nim is add not push
 85 | 
 86 | template value*[TT](v: Variable[TT]): TT  =
 87 |   ## Unwrap the value from its context
 88 |   v.value
 89 | 
 90 | proc check_ctx*(a, b: Variable) {.inline.} =
 91 |   if a.tape[].unsafeAddr != b.tape[].unsafeAddr: # compare pointer adress directly (avoid deep comparison)
 92 |     raise newException(ValueError, "You cannot combine variable from different contexts")
 93 | 
 94 | proc backprop*[TT](v: Variable[TT]) =
 95 |   ## Differentiate the chain of operations w.r.t to this variable.
 96 |   ## Context will be reset
 97 | 
 98 |   # We initialize the Variable we want to backpropagate on with a Tensor of ones.
 99 |   # TODO, restrict to scalar backprop?
100 |   v.grad = v.value.ones_like
101 | 
102 |   # We pop the context until we find the gate that produced our Variable
103 |   while v.tape.len > 0 and v.tape.nodes[^1] != v.ancestor:
104 |     discard v.tape.nodes.pop
105 | 
106 |   # Now, until the context is been all backpropagated through we update
107 |   # each intermediate variables with its accumulated gradient and then pop the node
108 |   # TODO: Count Toward Zero memory optimization:
109 |   # https://rufflewind.com/2016-12-30/reverse-mode-automatic-differentiation and https://github.com/Rufflewind/revad/blob/de509269fe878bc9d564775abc25c4fa663d8a5e/src/chain.rs
110 | 
111 |   while v.tape.len > 0:
112 |     let curNode = v.tape.nodes.pop
113 |     let curVar = curNode.child
114 | 
115 |     let diffs = curNode.gate.backward(curVar.grad)
116 | 
117 |     for i in 0 ..< curNode.gate.arity:
118 |       curNode.parents[i].grad += diffs[i]


--------------------------------------------------------------------------------