├── testing
    ├── run-tests.py
    └── Dockerfile
├── src
    ├── tenet.zig
    ├── reference_counter.zig
    ├── funcs.zig
    ├── optim.zig
    ├── module.zig
    ├── main.zig
    └── tensor.zig
├── LICENSE
├── scripts
    └── grad_example.py
├── bindings
    └── mkl.zig
└── README.md


/testing/run-tests.py:
--------------------------------------------------------------------------------
1 | import subprocess as sp
2 | 
3 | sp.run(["zig", "build", "test"], check=True)
4 | 


--------------------------------------------------------------------------------
/testing/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM christopherhesse/dockertest:v5
2 | 
3 | RUN curl -o zig.tar.xz https://ziglang.org/download/0.7.1/zig-linux-x86_64-0.7.1.tar.xz
4 | RUN mkdir /opt/zig
5 | RUN tar xvf zig.tar.xz -C /opt/zig --strip-components=1
6 | ENV PATH="${PATH}:/opt/zig"


--------------------------------------------------------------------------------
/src/tenet.zig:
--------------------------------------------------------------------------------
1 | pub const array = @import("array.zig");
2 | pub const Array = array.Array;
3 | pub const module = @import("module.zig");
4 | pub const tensor = @import("tensor.zig");
5 | pub const Tensor = tensor.Tensor;
6 | pub const optim = @import("optim.zig");
7 | pub const funcs = @import("funcs.zig");


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Christopher Hesse
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/reference_counter.zig:
--------------------------------------------------------------------------------
 1 | // Keep a reference count that is incremented and decremented atomically
 2 | 
 3 | const std = @import("std");
 4 | const builtin = @import("builtin");
 5 | 
 6 | 
 7 | pub const ReferenceCounter = struct {
 8 |     ref_count: usize,
 9 | 
10 |     const Self = @This();
11 | 
12 |     pub fn init() Self {
13 |         return Self{ .ref_count = 1 };
14 |     }
15 | 
16 |     pub fn increment(self: *Self) void {
17 |         // atomically increment the reference count
18 |         var ref_count = self.ref_count;
19 |         while (true) {
20 |             if (ref_count == 0) {
21 |                 // reference count could be zero briefly, but if it's zero, then it's about to be deallocated
22 |                 // and then it can have any value
23 |                 @panic("reference count is zero");
24 |             }
25 |             var new_ref_count = ref_count + 1;
26 |             var result = @cmpxchgWeak(usize, &self.ref_count, ref_count, new_ref_count, builtin.AtomicOrder.Monotonic, builtin.AtomicOrder.Monotonic);
27 |             if (result == null) {
28 |                 break;
29 |             }
30 |             ref_count = result.?;
31 |         }
32 |     }
33 | 
34 |     pub fn decrement(self: *Self) bool {
35 |         // atomically decrement the ref count
36 |         // return true if the ref count hit zero
37 |         var ref_count = self.ref_count;
38 |         while (true) {
39 |             var new_ref_count = ref_count - 1;
40 |             var result = @cmpxchgWeak(usize, &self.ref_count, ref_count, new_ref_count, builtin.AtomicOrder.Monotonic, builtin.AtomicOrder.Monotonic);
41 |             if (result == null) {
42 |                 // the exchange was a success
43 |                 return new_ref_count == 0;
44 |             }
45 |             ref_count = result.?;
46 |         }
47 |     }
48 | };


--------------------------------------------------------------------------------
/scripts/grad_example.py:
--------------------------------------------------------------------------------
 1 | def square(x):
 2 |     return x ** 2
 3 | 
 4 | 
 5 | def cube(x):
 6 |     return x ** 3
 7 | 
 8 | 
 9 | def multiply(x, y):
10 |     return x * y
11 | 
12 | 
13 | def f(x, y):
14 |     a = square(x)
15 |     b = cube(y)
16 |     c = multiply(a, b)
17 |     return c
18 | 
19 | 
20 | def backward_multiply(x, y, grad_out):
21 |     grad_in_x = y * grad_out
22 |     grad_in_y = x * grad_out
23 |     return grad_in_x, grad_in_y
24 | 
25 | 
26 | def backward_square(x, grad_out):
27 |     grad_in = 2 * x * grad_out
28 |     return grad_in
29 | 
30 | 
31 | def backward_cube(x, grad_out):
32 |     grad_in = 3 * x ** 2 * grad_out
33 |     return grad_in
34 | 
35 | 
36 | def backward_f(x, y, grad_z):
37 |     # we actually need the intermediate values to call the backward functions
38 |     # so re-calculate them here (normally we would just store them when running f() the first time)
39 |     a = square(x)
40 |     b = cube(y)
41 |     _c = multiply(a, b)
42 | 
43 |     grad_a, grad_b = backward_multiply(a, b, grad_z)
44 |     grad_y = backward_cube(y, grad_b)
45 |     grad_x = backward_square(x, grad_a)
46 |     return grad_x, grad_y
47 | 
48 | 
49 | # run the function normally
50 | x = 1.0
51 | y = 2.0
52 | z = f(x, y)
53 | print(f"f(x,y): {z}")
54 | 
55 | # run the backward function
56 | grad_z = 1.0  # the initial grad value is set to 1
57 | grad_x, grad_y = backward_f(x, y, grad_z)
58 | print(f"backward_f(x, y, grad_z): grad_x = {grad_x}, grad_y = {grad_y}")
59 | 
60 | # check the backward function using finite differences
61 | # by making small changes to each input to find how the output changes
62 | def finite_differences(x, y, f, epsilon=1e-6):
63 |     grad_x = (f(x + epsilon, y) - f(x - epsilon, y)) / (2 * epsilon)
64 |     grad_y = (f(x, y + epsilon) - f(x, y - epsilon)) / (2 * epsilon)
65 |     return grad_x, grad_y
66 | 
67 | 
68 | grad_x_fd, grad_y_fd = finite_differences(x, y, f)
69 | print(f"finite differences approximation: grad_x = {grad_x_fd}, grad_y = {grad_y_fd}")
70 | 


--------------------------------------------------------------------------------
/bindings/mkl.zig:
--------------------------------------------------------------------------------
 1 | // MKL bindings for fast CPU operations
 2 | 
 3 | const std = @import("std");
 4 | 
 5 | const C = @cImport({
 6 |     @cInclude("mkl.h");
 7 | });
 8 | 
 9 | pub fn cblas_sgemm(a: *f32, b: *f32, c: *f32, lda: u64, ldb: u64, ldc: u64, m: u64, n: u64, k: u64, alpha: f32, beta: f32) void {
10 |     var lda_int = @intCast(C.MKL_INT, lda);
11 |     var ldb_int = @intCast(C.MKL_INT, ldb);
12 |     var ldc_int = @intCast(C.MKL_INT, ldc);
13 |     var m_int = @intCast(C.MKL_INT, m);
14 |     var n_int = @intCast(C.MKL_INT, n);
15 |     var k_int = @intCast(C.MKL_INT, k);
16 |     var layout: C.CBLAS_LAYOUT = C.CBLAS_LAYOUT.CblasRowMajor;
17 |     var transa: C.CBLAS_TRANSPOSE = C.CBLAS_TRANSPOSE.CblasNoTrans;
18 |     var transb: C.CBLAS_TRANSPOSE = C.CBLAS_TRANSPOSE.CblasNoTrans;
19 |     C.cblas_sgemm(
20 |         layout,
21 |         transa,
22 |         transb,
23 |         m_int,
24 |         n_int,
25 |         k_int,
26 |         alpha,
27 |         a,
28 |         lda_int,
29 |         b,
30 |         ldb_int,
31 |         beta,
32 |         c,
33 |         ldc_int,
34 |     );
35 | }
36 | 
37 | pub fn cblas_dgemm(a: *f64, b: *f64, c: *f64, lda: u64, ldb: u64, ldc: u64, m: u64, n: u64, k: u64, alpha: f64, beta: f64) void {
38 |     var lda_int = @intCast(C.MKL_INT, lda);
39 |     var ldb_int = @intCast(C.MKL_INT, ldb);
40 |     var ldc_int = @intCast(C.MKL_INT, ldc);
41 |     var m_int = @intCast(C.MKL_INT, m);
42 |     var n_int = @intCast(C.MKL_INT, n);
43 |     var k_int = @intCast(C.MKL_INT, k);
44 |     var layout: C.CBLAS_LAYOUT = C.CBLAS_LAYOUT.CblasRowMajor;
45 |     var transa: C.CBLAS_TRANSPOSE = C.CBLAS_TRANSPOSE.CblasNoTrans;
46 |     var transb: C.CBLAS_TRANSPOSE = C.CBLAS_TRANSPOSE.CblasNoTrans;
47 |     C.cblas_dgemm(
48 |         layout,
49 |         transa,
50 |         transb,
51 |         m_int,
52 |         n_int,
53 |         k_int,
54 |         alpha,
55 |         a,
56 |         lda_int,
57 |         b,
58 |         ldb_int,
59 |         beta,
60 |         c,
61 |         ldc_int,
62 |     );
63 | }
64 | 


--------------------------------------------------------------------------------
/src/funcs.zig:
--------------------------------------------------------------------------------
 1 | const std = @import("std");
 2 | const array = @import("array.zig");
 3 | const Array = array.Array;
 4 | const tensor = @import("tensor.zig");
 5 | const Tensor = tensor.Tensor;
 6 | const expr = tensor.expr;
 7 | 
 8 | pub fn relu(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 9 |     return try expr(alc, "max(0, x)", .{ .x = x });
10 | }
11 | 
12 | pub fn logSoftmax(alc: *std.mem.Allocator, x: Tensor, dims: []const u64) !Tensor {
13 |     var dims_tensor = Tensor.flatFromBuffer(u64, @bitCast([]u64, dims));
14 |     // https://github.com/google/jax/pull/2260
15 |     var x_shifted = try expr(alc, "x - detach(keep_max(x, dims))", .{ .x = x, .dims = dims_tensor });
16 |     defer x_shifted.release();
17 |     return try expr(alc, "x_shifted - log(keep_sum(exp(x_shifted), dims))", .{ .x_shifted = x_shifted, .dims = dims_tensor });
18 | }
19 | 
20 | test "logSoftmax" {
21 |     var input = try Tensor.allocWithString(f32, std.testing.allocator, "[[1, 2, 3], [1, 20, 300], [0, -1, 1000], [0, -1, 1000]]", tensor.REQUIRES_GRAD);
22 |     defer input.release();
23 |     var dims = [_]u64{1};
24 |     var output = try logSoftmax(std.testing.allocator, input, &dims);
25 |     defer output.release();
26 |     var expected_output = try Array.allocWithString(f32, std.testing.allocator, "[[-2.4076e+00, -1.4076e+00, -4.0761e-01], [-2.9900e+02, -2.8000e+02, 0.0000e+00], [-1.0000e+03, -1.0010e+03, 0.0000e+00], [-1.0000e+03, -1.0010e+03, 0.0000e+00]]");
27 |     defer expected_output.release();
28 |     std.testing.expect(array.allclose(output.data, expected_output, 1e-5, 1e-8));
29 |     var grad_output = try tensor.onesLikeAlloc(std.testing.allocator, output, tensor.NO_FLAGS);
30 |     defer grad_output.release();
31 |     try tensor.backwardAlloc(std.testing.allocator, output, grad_output);
32 |     var expected_grad_input = try Array.allocWithString(f32, std.testing.allocator, "[[ 0.7299,  0.2658, -0.9957], [ 1.0000,  1.0000, -2.0000], [1.0000,  1.0000, -2.0000], [1.0000,  1.0000, -2.0000]]");
33 |     defer expected_grad_input.release();
34 |     std.testing.expect(array.allclose(input.grad.?, expected_grad_input, 1e-5, 1e-3));
35 | }
36 | 
37 | pub fn nllLoss(alc: *std.mem.Allocator, input: Tensor, target: Tensor) !Tensor {
38 |     if (input.data.ndim != 2) {
39 |         @panic("Input has wrong number of dimensions");
40 |     }
41 |     if (target.data.ndim != 1) {
42 |         @panic("Target has wrong number of dimensions");
43 |     }
44 |     if (!array.dtypeIsInteger(target.data.dtype)) {
45 |         @panic("Target dtype must be int");
46 |     }
47 |     var target_expanded = target.reshapeView(&[_]u64{target.data.numel, 1});
48 |     var dims = [_]u64{0,1};
49 |     return try expr(alc, "reduce_mean(-gather(input, 1, target), dims)", .{.input=input, .target=target_expanded, .dims=Tensor.flatFromBuffer(u64, &dims)});
50 | }
51 | 
52 | test "nllLoss" {
53 |     var input = try Tensor.allocWithString(f32, std.testing.allocator, "[[1, 2, 3], [1, 20, 300], [0, 1, 1000]]", tensor.REQUIRES_GRAD);
54 |     defer input.release();
55 |     var target = try Tensor.allocWithString(u64, std.testing.allocator, "[0, 1, 2]", tensor.NO_FLAGS);
56 |     defer target.release();
57 |     var output = try nllLoss(std.testing.allocator, input, target);
58 |     defer output.release();
59 |     var expected_output = try array.scalarAlloc(std.testing.allocator, .f32, -340.3333);
60 |     defer expected_output.release();
61 |     std.testing.expect(array.allclose(output.data, expected_output, 1e-05, 1e-08));
62 | }
63 | 
64 | /// Kaiming init for fan_in init with gain set for ReLU
65 | /// https://arxiv.org/abs/1502.01852
66 | pub fn kaimingUniform(alc: *std.mem.Allocator, dst: Array, r: *std.rand.Random) !void {
67 |     var high = try array.expr(alc, "(3.0 .^ 0.5) .* ((2.0 .^ 0.5) ./ (fan .^ 0.5))", .{.fan=dst.getShape()[0]});
68 |     defer high.release();
69 |     var low = try array.uminusAlloc(alc, high);
70 |     defer low.release();
71 |     array.fillUniform(dst, r, low, high);
72 | }


--------------------------------------------------------------------------------
/src/optim.zig:
--------------------------------------------------------------------------------
  1 | // Optimizer implementations, each takes a list of parameters, and has
  2 | // zeroGrad and step methods.
  3 | 
  4 | const std = @import("std");
  5 | const array = @import("array.zig");
  6 | const Array = array.Array;
  7 | const module = @import("module.zig");
  8 | 
  9 | pub const SGD = struct {
 10 |     parameters: []module.Parameter,
 11 |     momentums: []Array,
 12 |     momentum: f32,
 13 |     alc: *std.mem.Allocator,
 14 | 
 15 |     const Self = @This();
 16 | 
 17 |     pub fn init(alc: *std.mem.Allocator, parameters: []module.Parameter, momentum: f32) !Self {
 18 |         var momentums = try alc.alloc(Array, parameters.len);
 19 |         for (momentums) |_, index| {
 20 |             momentums[index] = try array.zerosLikeAlloc(alc, parameters[index].value.grad.?);
 21 |         }
 22 |         return Self{.parameters=parameters, .alc=alc, .momentum=momentum, .momentums=momentums};
 23 |     }
 24 | 
 25 |     pub fn deinit(self: *Self) void {
 26 |         for (self.momentums) |v| {
 27 |             v.release();
 28 |         }
 29 |         self.alc.free(self.momentums);
 30 |     }
 31 | 
 32 |     pub fn zeroGrad(self: *Self) !void {
 33 |         for (self.parameters) |param| {
 34 |             var grad = param.value.grad.?;
 35 |             var zero = try array.scalarAlloc(self.alc, grad.dtype, 0.0);
 36 |             defer zero.release();
 37 |             array.copy(zero, grad);
 38 |         }
 39 |     }
 40 | 
 41 |     pub fn step(self: *Self, lr: f32) !void {
 42 |         for (self.parameters) |param, index| {
 43 |             var grad = param.value.grad.?;
 44 | 
 45 |             if (self.momentum != 0) {
 46 |                 // update momentum
 47 |                 var v = self.momentums[index];
 48 |                 var new_v = try array.expr(self.alc, "m .* v .+ g", .{.m=self.momentum, .v=v, .g=grad});
 49 |                 defer new_v.release();
 50 |                 array.copy(new_v, v);
 51 |                 grad = v;
 52 |             }
 53 | 
 54 |             var update = try array.expr(self.alc, "-lr .* g", .{.lr=lr, .g=grad});
 55 |             defer update.release();
 56 |             var data = param.value.data;
 57 |             array.plus(data, update, data);
 58 |         }
 59 |     }
 60 | };
 61 | 
 62 | // Adam: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980
 63 | pub const Adam = struct {
 64 |     parameters: []module.Parameter,
 65 |     first_moments: []Array,
 66 |     second_moments: []Array,
 67 |     beta1: f32,
 68 |     beta2: f32,
 69 |     epsilon: f32,
 70 |     step_count: u64,
 71 |     alc: *std.mem.Allocator,
 72 | 
 73 |     const Self = @This();
 74 | 
 75 |     pub fn init(alc: *std.mem.Allocator, parameters: []module.Parameter, beta1: f32, beta2: f32, epsilon: f32) !Self {
 76 |         var first_moments = try alc.alloc(Array, parameters.len);
 77 |         var second_moments = try alc.alloc(Array, parameters.len);
 78 |         for (first_moments) |_, index| {
 79 |             first_moments[index] = try array.zerosLikeAlloc(alc, parameters[index].value.grad.?);
 80 |             second_moments[index] = try array.zerosLikeAlloc(alc, parameters[index].value.grad.?);
 81 |         }
 82 |         return Self{.parameters=parameters, .alc=alc, .beta1=beta1, .beta2=beta2, .epsilon=epsilon, .step_count=0, .first_moments=first_moments, .second_moments=second_moments};
 83 |     }
 84 | 
 85 |     pub fn deinit(self: *Self) void {
 86 |         for (self.first_moments) |v| {
 87 |             v.release();
 88 |         }
 89 |         self.alc.free(self.first_moments);
 90 |         for (self.second_moments) |v| {
 91 |             v.release();
 92 |         }
 93 |         self.alc.free(self.second_moments);
 94 |     }
 95 | 
 96 |     pub fn zeroGrad(self: *Self) !void {
 97 |         for (self.parameters) |param| {
 98 |             var grad = param.value.grad.?;
 99 |             var zero = try array.scalarAlloc(self.alc, grad.dtype, 0.0);
100 |             defer zero.release();
101 |             array.copy(zero, grad);
102 |         }
103 |     }
104 | 
105 |     pub fn step(self: *Self, lr: f32) !void {
106 |         self.step_count += 1;
107 |         for (self.parameters) |param, index| {
108 |             var grad = param.value.grad.?;
109 | 
110 |             var m = self.first_moments[index];
111 |             var new_m = try array.expr(self.alc, "beta1 .* m + (1 - beta1) .* g", .{.m=m, .beta1=self.beta1, .g=grad});
112 |             defer new_m.release();
113 |             array.copy(new_m, m);
114 | 
115 |             var v = self.second_moments[index];
116 |             var new_v = try array.expr(self.alc, "beta2 .* v + (1 - beta2) .* (g .* g)", .{.v=v, .beta2=self.beta2, .g=grad});
117 |             defer new_v.release();
118 |             array.copy(new_v, v);
119 | 
120 |             var m_hat = try array.expr(self.alc, "m ./ (1 - beta1 .^ step)", .{.m=m, .beta1=self.beta1, .step=self.step_count});
121 |             defer m_hat.release();
122 | 
123 |             var v_hat = try array.expr(self.alc, "v ./ (1 - beta2 .^ step)", .{.v=v, .beta2=self.beta2, .step=self.step_count});
124 |             defer v_hat.release();
125 | 
126 |             var update = try array.expr(self.alc, "-lr .* m_hat ./ (v_hat .^ 0.5 + epsilon)", .{.lr=lr, .m_hat=m_hat, .v_hat=v_hat, .epsilon=self.epsilon});
127 |             defer update.release();
128 |             var data = param.value.data;
129 |             array.plus(data, update, data);
130 |         }
131 |     }
132 | };


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # tenet
  2 | 
  3 | A [torch](https://github.com/pytorch/pytorch)-inspired automatic differentiation prototype for [Zig](https://ziglang.org/).
  4 | 
  5 | Imagine the [numpy](https://numpy.org/) NDArray, only you can also compute backward in time using inverted functions.  Well, not quite, but you *can* calculate derivatives with respect to the inputs of your computation.
  6 | 
  7 | ## Usage
  8 | 
  9 | The main struct is `Tensor`, an N-dimensional array of numbers, usually floating point numbers.  Here's a short example showing how to do a `+` operation along with a backward pass:
 10 | 
 11 | ```zig
 12 | const tenet = @import("tenet.zig");
 13 | const alc = std.testing.allocator;
 14 | var a = try tenet.Tensor.allocWithValue(f32, alc, &[_]u64{2, 3, 4}, 1.0, tenet.tensor.REQUIRES_GRAD);
 15 | defer a.release();
 16 | var b = try tenet.Tensor.allocWithValue(f32, alc, &[_]u64{2, 3, 4}, 2.0, tenet.tensor.REQUIRES_GRAD);
 17 | defer b.release();
 18 | var out = try tenet.tensor.plusAlloc(alc, a, b);
 19 | defer out.release();
 20 | var grad_out = try tenet.Tensor.allocWithValue(f32, alc, &[_]u64{2, 3, 4}, 4.0, 0);
 21 | defer grad_out.release();
 22 | try tenet.tensor.backwardAlloc(alc, out, grad_out);
 23 | std.testing.expect(tenet.array.equal(a.grad.?, grad_out.data));
 24 | std.testing.expect(tenet.array.equal(b.grad.?, grad_out.data));
 25 | ```
 26 | 
 27 | For a full example, look at the [MNIST example](src/main.zig).
 28 | 
 29 | ## Automatic Differentiation
 30 | 
 31 | If you have a function `z = f(x, y)` and you want to know how to change `x` and `y` to minimize `z`, how do you do find that out?  One way would be to increase and decrease `x` and `y` individually to see how much `z` changes, then move them in whichever direction is better.  That method is called ["finite differences"](https://en.wikipedia.org/wiki/Finite_difference#Relation_with_derivatives).
 32 | 
 33 | For a couple of input variables, this is fine, but it's not very efficient with a large number of input variables.  Instead of doing that, you can find the derivatives by constructing a sort of backward version of the computation graph of your function.  If the function `f` looked like this:
 34 | 
 35 | ```py
 36 | def square(x):
 37 |     return x ** 2
 38 | 
 39 | def cube(x):
 40 |     return x ** 3
 41 | 
 42 | def multiply(x, y):
 43 |     return x * y
 44 | 
 45 | def f(x, y):
 46 |     a = square(x)
 47 |     b = cube(y)
 48 |     c = multiply(a, b)
 49 |     return c
 50 | ```
 51 | 
 52 | You might have a backward function like this:
 53 | 
 54 | ```py
 55 | def backward_multiply(x, y, grad_out):
 56 |     grad_in_x = y * grad_out
 57 |     grad_in_y = x * grad_out
 58 |     return grad_in_x, grad_in_y
 59 | 
 60 | def backward_square(x, grad_out):
 61 |     grad_in = 2 * x * grad_out
 62 |     return grad_in
 63 | 
 64 | def backward_cube(x, grad_out):
 65 |     grad_in = 3 * x ** 2 * grad_out
 66 |     return grad_in
 67 | 
 68 | def backward_f(x, y, grad_z):
 69 |     # we actually need the intermediate values to call the backward functions
 70 |     # so re-calculate them here (normally we would just store them when running f() the first time)
 71 |     a = square(x)
 72 |     b = cube(y)
 73 |     _c = multiply(a, b)
 74 | 
 75 |     grad_a, grad_b = backward_multiply(a, b, grad_z)
 76 |     grad_y = backward_cube(y, grad_b)
 77 |     grad_x = backward_square(x, grad_a)
 78 |     return grad_x, grad_y
 79 | ```
 80 | 
 81 | Where the `backward_` functions are the derivatives of the original functions, using the chain rule to combine them together.  Each `backward_` function takes the original inputs to the normal function, plus an extra `grad_out` parameter, then returns `grad_in_<name>` for each of the original inputs.  You end up with the same information about how the output changes as you would get from changing each input variable individually, only with fewer calculations:
 82 | 
 83 | ```py
 84 | # run the function normally
 85 | x = 1.0
 86 | y = 2.0
 87 | z = f(x, y)
 88 | print(f"f(x,y): {z}")
 89 | 
 90 | # run the backward function
 91 | grad_z = 1.0  # the initial grad value is set to 1
 92 | grad_x, grad_y = backward_f(x, y, grad_z)
 93 | print(f"backward_f(x, y, grad_z): grad_x = {grad_x}, grad_y = {grad_y}")
 94 | 
 95 | # check the backward function using finite differences
 96 | # by making small changes to each input to find how the output changes
 97 | def finite_differences(x, y, f, epsilon=1e-6):
 98 |     grad_x = (f(x + epsilon, y) - f(x - epsilon, y)) / (2 * epsilon)
 99 |     grad_y = (f(x, y + epsilon) - f(x, y - epsilon)) / (2 * epsilon)
100 |     return grad_x, grad_y
101 | 
102 | grad_x_fd, grad_y_fd = finite_differences(x, y, f)
103 | print(f"finite differences approximation: grad_x = {grad_x_fd}, grad_y = {grad_y_fd}")
104 | ```
105 | 
106 | See [scripts/grad_example.py](scripts/grad_example.py) for the full script.  In the case where the inputs and outputs are matrices instead of scalars, `grad_out` will have the shape of the output, and each `grad_in_<name>` will have the shape of the corresponding input.
107 | 
108 | In automatic differentiation, you create `backward_f` automatically based on the operations done by `f`.  Like in torch, no explicit graph is defined when using this prototype.  Arrays in `tenet` track the series of operations used to create them, so when you do the backward pass, each `backward_` function is run for you, automatically.
109 | 
110 | ## Interesting Features
111 | 
112 | There's only one sort of interesting feature about this prototype.  Zig does not support operator overloading, but it would still be nice to write out equations.  Writing out the operations by hand is a bit of a pain:
113 | 
114 | ```zig
115 | // (x * y + z) ^ 2.0
116 | var a = try multiplyAlloc(alc, x, y);
117 | defer a.release();
118 | var b = try addAlloc(alc, a, z);
119 | defer b.release()
120 | var two = try Tensor.allocWithValue(f32, alc, &[_]u64{}, 2, tensor.NO_FLAGS);
121 | defer two.release();
122 | var c = try powerAlloc(alc, b, two);
123 | defer c.release();
124 | ```
125 | 
126 | The `expr` function does all the same stuff, but uses a string at compile time:
127 | 
128 | ```zig
129 | var c = try expr(alc, "(x .* y + z) .^ 2.0", .{.x=x, .y=y, .z=z});
130 | defer c.release();
131 | ```
132 | 
133 | Actually it only parses the expression at compile time, it doesn't fully unroll all the operations. I suspect the only thing keeping it from fully unrolling is some Zig compiler bug.
134 | 
135 | Because operator overloading is not used, the `expr` syntax has much fewer limitations.  For this prototype, it uses [MATLAB style operators](https://www.mathworks.com/help/matlab/matlab_prog/matlab-operators-and-special-characters.html).
136 | 
137 | ## Downsides
138 | 
139 | * Defining an explicit graph may be a better approach than this and is used in the [kann](https://github.com/attractivechaos/kann) library
140 | * Deallocating memory immediately is kind of annoying when you don't use `expr`.  If you use `defer`, it won't be deallocated until the end of the block
141 | * Performance is mediocre, there has been no tuning for performance beyond an option to use [Intel's MKL library](https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/onemkl.html#gs.zou9ms).  The option is `-Duse-mkl` when using `zig build`.
142 | * CPU only for now
143 | * Only tested on windows
144 | * Probably contains serious bugs
145 | * This is mostly a proof-of-concept, and will likely not be maintained as a generally useful library.
146 | 


--------------------------------------------------------------------------------
/src/module.zig:
--------------------------------------------------------------------------------
  1 | // Combine parameters and some operations on them into reusable components
  2 | //
  3 | // These are just structs, the only thing special about them are they have a
  4 | // collectParameters method so that optimizers can find all of the parameters
  5 | // of nested modules.
  6 | 
  7 | const std = @import("std");
  8 | const array = @import("array.zig");
  9 | const Array = array.Array;
 10 | const tensor = @import("tensor.zig");
 11 | const Tensor = tensor.Tensor;
 12 | const funcs = @import("funcs.zig");
 13 | const optim = @import("optim.zig");
 14 | 
 15 | pub const Parameter = struct {
 16 |     path: []const u8,
 17 |     value: Tensor,
 18 | };
 19 | 
 20 | pub const ParameterCollection = struct {
 21 |     data: []Parameter,
 22 |     count: u64,
 23 |     alc: *std.mem.Allocator,
 24 | 
 25 |     const Self = @This();
 26 | 
 27 |     fn init(alc: *std.mem.Allocator) !Self {
 28 |         var data = try alc.alloc(Parameter, 1);
 29 |         return Self{.data=data, .alc=alc, .count=0};
 30 |     }
 31 | 
 32 |     fn deinit(self: *Self) void {
 33 |         var i : usize = 0;
 34 |         while (i < self.count) : (i += 1) {
 35 |             self.alc.free(self.data[i].path);
 36 |         }
 37 |         self.alc.free(self.data);
 38 |     }
 39 | 
 40 |     fn append(self: *Self, path: []u8, value: Tensor) !void {
 41 |         if (self.data.len == self.count) {
 42 |             var new_data = try self.alc.alloc(Parameter, self.count*2);
 43 |             std.mem.copy(Parameter, new_data, self.data);
 44 |             self.alc.free(self.data);
 45 |             self.data = new_data;
 46 |         }
 47 |         var path_copy = try self.alc.alloc(u8, path.len);
 48 |         std.mem.copy(u8, path_copy, path);
 49 |         self.data[self.count] = Parameter{.path=path_copy, .value=value};
 50 |         self.count += 1;
 51 |     }
 52 | };
 53 | 
 54 | pub const ParameterCollector = struct {
 55 |     const Self = @This();
 56 | 
 57 |     prefix: []const u8,
 58 |     collection: *ParameterCollection,
 59 |     parent: ?*const Self,
 60 |     alc: *std.mem.Allocator,
 61 | 
 62 |     pub fn init(alc: *std.mem.Allocator) !Self {
 63 |         var collection = try alc.create(ParameterCollection);
 64 |         collection.* = try ParameterCollection.init(alc);
 65 |         return Self{.prefix="", .collection=collection, .alc=alc, .parent=null};
 66 |     }
 67 | 
 68 |     pub fn deinit(self: *Self) void {
 69 |         self.collection.deinit();
 70 |         self.alc.destroy(self.collection);
 71 |     }
 72 | 
 73 |     pub fn collectParameters(self: *const Self, obj: anytype, comptime name: []const u8) !void {
 74 |         try @field(obj, name).collectParameters(self.withPrefix(name));
 75 |     }
 76 | 
 77 |     pub fn collectSliceParameters(self: *const Self, obj: anytype, comptime name: []const u8) !void {
 78 |         var buf : [1024]u8 = undefined;
 79 |         var slice = @field(obj, name);
 80 |         for (slice) |item, index| {
 81 |             var prefix = try std.fmt.bufPrint(&buf, "{}[{}]", .{name, index});
 82 |             try item.collectParameters(self.withPrefix(prefix));
 83 |         }
 84 |     }
 85 | 
 86 |     pub fn addParameter(self: *const Self, obj: anytype, comptime name: []const u8) !void {
 87 |         var value = @field(obj, name);
 88 |         // traverse parent chain to build full path
 89 |         var cur : *const Self = self;
 90 |         var path_len : u64 = 0;
 91 |         while (cur.parent != null) {
 92 |             path_len += cur.prefix.len + 1;
 93 |             cur = cur.parent.?;
 94 |         }
 95 |         path_len += name.len;
 96 |         var path = try self.alc.alloc(u8, path_len);
 97 |         cur = self;
 98 |         var offset: u64 = path_len;
 99 |         offset -= name.len;
100 |         std.mem.copy(u8, path[offset..], name);
101 |         while (cur.parent != null) {
102 |             offset -= 1;
103 |             path[offset] = '.';
104 |             offset -= cur.prefix.len;
105 |             std.mem.copy(u8, path[offset..], cur.prefix);
106 |             cur = cur.parent.?;
107 |         }
108 |         if (offset != 0) {
109 |             @panic("Incorrect offset calculation");
110 |         }
111 |         try self.collection.append(path, value);
112 |         self.alc.free(path);
113 |     }
114 | 
115 |     pub fn getParameters(self: *const Self) []Parameter {
116 |         return self.collection.data[0..self.collection.count];
117 |     }
118 | 
119 |     pub fn withPrefix(self: *const Self, prefix: []const u8) Self {
120 |         return Self{.prefix=prefix, .collection=self.collection, .alc=self.alc, .parent=self};
121 |     }
122 | };
123 | 
124 | pub const Dense = struct {
125 |     weight: Tensor,
126 |     bias: Tensor,
127 | 
128 |     const Self = @This();
129 | 
130 |     pub fn init(alc: *std.mem.Allocator, rng: *std.rand.Random, in_features: u64, out_features: u64) !Self {
131 |         var weight = try tensor.zerosAlloc(alc, .f32, &[_]u64{in_features, out_features}, tensor.REQUIRES_GRAD);
132 |         var bias = try tensor.zerosAlloc(alc, .f32, &[_]u64{out_features}, tensor.REQUIRES_GRAD);
133 |         try funcs.kaimingUniform(alc, weight.data, rng);
134 |         var high = try array.expr(alc, "1 ./ (in_features .^ 0.5)", .{.in_features=in_features});
135 |         defer high.release();
136 |         var low = try array.uminusAlloc(alc, high);
137 |         defer low.release();
138 |         array.fillUniform(bias.data, rng, low, high);
139 |         return Self{.weight=weight, .bias=bias};
140 |     }
141 | 
142 |     pub fn deinit(self: *Self) void {
143 |         self.weight.release();
144 |         self.bias.release();
145 |     }
146 | 
147 |     pub fn collectParameters(self: Self, pc: ParameterCollector) !void {
148 |         try pc.addParameter(self, "weight");
149 |         try pc.addParameter(self, "bias");
150 |     }
151 | 
152 |     pub fn forward(self: *Self, alc: *std.mem.Allocator, x: Tensor) !Tensor {
153 |         return try tensor.expr(alc, "(x * weight) + bias", .{.x=x, .weight=self.weight, .bias=self.bias});
154 |     }
155 | };
156 | 
157 | pub const MLP = struct {
158 |     fc1: Dense,
159 |     fc2: Dense,
160 | 
161 |     const Self = @This();
162 | 
163 |     pub fn init(alc: *std.mem.Allocator, rng: *std.rand.Random, input_size: u64, hidden_size: u64, output_size: u64) !Self {
164 |         return Self{.fc1=try Dense.init(alc, rng, input_size, hidden_size), .fc2=try Dense.init(alc, rng, hidden_size, output_size)};
165 |     }
166 | 
167 |     pub fn deinit(self: *Self) void {
168 |         self.fc1.deinit();
169 |         self.fc2.deinit();
170 |     }
171 | 
172 |     pub fn collectParameters(self: Self, pc: ParameterCollector) !void {
173 |         try pc.collectParameters(self, "fc1");
174 |         try pc.collectParameters(self, "fc2");
175 |     }
176 | 
177 |     pub fn forward(self: *Self, alc: *std.mem.Allocator, x: Tensor) !Tensor {
178 |         var fc1_out = try self.fc1.forward(alc, x);
179 |         var fc1_act : Tensor = undefined;
180 |         {
181 |             defer fc1_out.release();
182 |             fc1_act = try funcs.relu(alc, fc1_out);
183 |         }
184 |         var fc2_out : Tensor = undefined;
185 |         {
186 |             defer fc1_act.release();
187 |             return try self.fc2.forward(alc, fc1_act);
188 |         }
189 |     }
190 | };
191 | 
192 | test "mlp" {
193 |     var in_features : u64 = 5;
194 |     var hidden_features : u64 = 2;
195 |     var out_features : u64 = 2;
196 |     var gen = std.rand.Xoroshiro128.init(0);
197 |     var mlp = try MLP.init(std.testing.allocator, &gen.random, in_features, hidden_features, out_features);
198 |     defer mlp.deinit();
199 | 
200 |     mlp.fc1.weight.release();
201 |     mlp.fc1.weight = try Tensor.allocWithRange(f32, std.testing.allocator, &[_]u64{in_features, hidden_features}, 0.0, 1.0, tensor.REQUIRES_GRAD);
202 |     mlp.fc1.bias.release();
203 |     mlp.fc1.bias = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{hidden_features}, 0, tensor.REQUIRES_GRAD);
204 | 
205 |     mlp.fc2.weight.release();
206 |     mlp.fc2.weight = try Tensor.allocWithRange(f32, std.testing.allocator, &[_]u64{hidden_features, out_features}, 0.0, 1.0, tensor.REQUIRES_GRAD);
207 |     mlp.fc2.bias.release();
208 |     mlp.fc2.bias = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{out_features}, 0, tensor.REQUIRES_GRAD);
209 | 
210 |     var input = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{4, 5}, 1.0, tensor.NO_FLAGS);
211 |     defer input.release();
212 |     var target = try Tensor.allocWithValue(u64, std.testing.allocator, &[_]u64{4}, 0, tensor.NO_FLAGS);
213 |     defer target.release();
214 |     var logits = try mlp.forward(std.testing.allocator, input);
215 |     defer logits.release();
216 |     var output = try funcs.logSoftmax(std.testing.allocator, logits, &[_]u64{1});
217 |     defer output.release();
218 |     std.testing.expect(output.data.get(f32, &[_]u64{0,0}) == -45.0);
219 |     std.testing.expect(output.data.get(f32, &[_]u64{0,1}) == 0.0);
220 |     var loss = try funcs.nllLoss(std.testing.allocator, output, target);
221 |     defer loss.release();
222 |     var grad_output = try tensor.onesLikeAlloc(std.testing.allocator, loss, tensor.NO_FLAGS);
223 |     defer grad_output.release();
224 |     var pc = try ParameterCollector.init(std.testing.allocator);
225 |     defer pc.deinit();
226 |     try mlp.collectParameters(pc);
227 |     var opt = try optim.SGD.init(std.testing.allocator, pc.getParameters(), 0.0);
228 |     defer opt.deinit();
229 |     try opt.zeroGrad();
230 |     try tensor.backwardAlloc(std.testing.allocator, loss, grad_output);
231 |     std.testing.expect(mlp.fc1.weight.grad.?.get(f32, &[_]u64{0,0}) == 1.0);
232 |     std.testing.expect(mlp.fc2.weight.grad.?.get(f32, &[_]u64{0,0}) == -20.0);
233 |     var before = mlp.fc2.weight.data.get(f32, &[_]u64{0,0});
234 |     try opt.step(2);
235 |     var after = mlp.fc2.weight.data.get(f32, &[_]u64{0,0});
236 |     std.testing.expect(after - before == 40.0);
237 | }


--------------------------------------------------------------------------------
/src/main.zig:
--------------------------------------------------------------------------------
  1 | // MNIST example based on:
  2 | //   https://github.com/milindmalshe/Fully-Connected-Neural-Network-MNIST-Classification-PyTorch/blob/master/FCN_MNIST_Classification_PyTorch.py
  3 | // Download dataset from http://yann.lecun.com/exdb/mnist/
  4 | // Run with `zig build -Drelease-fast run -- <path to folder with mnist files>`
  5 | // If you have MKL installed from https://software.intel.com/content/www/us/en/develop/tools/oneapi/base-toolkit/download.html
  6 | //   you can run with `zig build -Drelease-fast -Duse-mkl run -- <data path>` to use a much faster matrix multiply operation
  7 | //   note that paths are hardcoded for windows.
  8 | 
  9 | const std = @import("std");
 10 | 
 11 | const tenet = @import("tenet.zig");
 12 | const Array = tenet.Array;
 13 | const Tensor = tenet.Tensor;
 14 | 
 15 | fn readIdx(comptime T: type, alc: *std.mem.Allocator, dirpath: []const u8, filename: []const u8, magic_number: i32, comptime num_dims: comptime_int) !Tensor {
 16 |     // check for the already extracted file, if it is missing, extract it from the provided compressed file path
 17 |     if (!std.mem.eql(u8, filename[filename.len-3..], ".gz")) {
 18 |         std.debug.panic("Filename should end in .gz: {}", .{filename});
 19 |     }
 20 |     const extracted_filename = filename[0..filename.len - 3];
 21 |     var f : std.fs.File = undefined;
 22 |     var dir = try std.fs.cwd().openDir(dirpath, .{});
 23 |     defer dir.close();
 24 |     if (dir.openFile(extracted_filename, std.fs.File.OpenFlags{ .read = true })) |ok| {
 25 |         f = ok;
 26 |     } else |err| switch (err) {
 27 |         error.FileNotFound => {
 28 |             // extract the file
 29 |             {
 30 |                 var fw = try dir.createFile(extracted_filename, std.fs.File.CreateFlags{});
 31 |                 defer fw.close();
 32 |                 var fr = try dir.openFile(filename, std.fs.File.OpenFlags{ .read = true });
 33 |                 defer fr.close();
 34 |                 var s = try std.compress.gzip.gzipStream(alc, fr.reader());
 35 |                 defer s.deinit();
 36 |                 var buf : [4096]u8 = undefined;
 37 |                 var total_nbytes : u64 = 0;
 38 |                 while (true) {
 39 |                     var nbytes = try s.read(&buf);
 40 |                     if (nbytes == 0) {
 41 |                         break;
 42 |                     }
 43 |                     try fw.writeAll(buf[0..nbytes]);
 44 |                     total_nbytes += nbytes;
 45 |                 }
 46 |             }
 47 |             // open the extracted file
 48 |             f = try dir.openFile(extracted_filename, std.fs.File.OpenFlags{ .read = true });
 49 |         },
 50 |         else => {
 51 |             std.debug.panic("Failed to open file {}", .{err});
 52 |         },
 53 |     }
 54 |     defer f.close();
 55 |     var r = f.reader();
 56 | 
 57 |     var num = try r.readIntBig(i32);
 58 |     if (num != magic_number) {
 59 |         return error.InvalidFile;
 60 |     }
 61 |     var shape = [_]u64{0} ** num_dims;
 62 |     for (shape) |_, i| {
 63 |         shape[i] = @intCast(u64, try r.readIntBig(i32));
 64 |     }
 65 |     // create array, read into it
 66 |     var result = try Tensor.allocWithValue(T, alc, &shape, 0, tenet.tensor.NO_FLAGS);
 67 |     errdefer result.release();
 68 |     var data_buf = result.data.getBuffer(T);
 69 |     var nbytes = try r.read(data_buf);
 70 |     if (nbytes != data_buf.len) {
 71 |         return error.InvalidFile;
 72 |     }
 73 |     return result;
 74 | }
 75 | 
 76 | fn loadImageData(alc: *std.mem.Allocator, dirpath: []const u8, filename: []const u8) !Tensor {
 77 |     std.debug.print("reading {}/{}\n", .{dirpath, filename});
 78 |     return readIdx(u8, alc, dirpath, filename, 2051, 3);
 79 | }
 80 | 
 81 | fn loadLabelData(alc: *std.mem.Allocator, dirpath: []const u8, filename: []const u8) !Tensor {
 82 |     std.debug.print("reading {}/{}\n", .{dirpath, filename});
 83 |     return readIdx(u8, alc, dirpath, filename, 2049, 1);
 84 | }
 85 | 
 86 | fn preprocessImages(alc: *std.mem.Allocator, images: Tensor) !Tensor {
 87 |     return try tenet.tensor.expr(alc, "f32(images) ./ 255.0", .{.images=images});
 88 | }
 89 | 
 90 | const Model = struct {
 91 |     mlp: tenet.module.MLP,
 92 | 
 93 |     const Self = @This();
 94 | 
 95 |     fn init(alc: *std.mem.Allocator, rng: *std.rand.Random, input_size: u64, hidden_size: u64, output_size: u64) !Self {
 96 |         return Self{.mlp=try tenet.module.MLP.init(alc, rng, input_size, hidden_size, output_size)};
 97 |     }
 98 | 
 99 |     fn collectParameters(self: Self, pc: tenet.module.ParameterCollector) !void {
100 |         try pc.collectParameters(self, "mlp");
101 |     }
102 | 
103 |     fn forward(self: *Self, alc: *std.mem.Allocator, x: Tensor) !Tensor {
104 |         var logits = try self.mlp.forward(alc, x);
105 |         defer logits.release();
106 |         return try tenet.funcs.logSoftmax(alc, logits, &[_]u64{1});
107 |     }
108 | 
109 |     fn deinit(self: *Self) void {
110 |         self.mlp.deinit();
111 |     }
112 | };
113 | 
114 | pub fn main() !void {
115 |     var gpa = std.heap.GeneralPurposeAllocator(.{}){};
116 |     var alc = &gpa.allocator;
117 | 
118 |     var args = try std.process.argsAlloc(alc);
119 |     defer std.process.argsFree(alc, args);
120 | 
121 |     if (args.len != 2) {
122 |         @panic("Incorrect number of args, must provide path to MNIST data");
123 |     }
124 | 
125 |     var dataset_path = args[1];
126 | 
127 |     const batch_size : u64 = 100;
128 |     const in_features : u64 = 28*28;
129 |     const hidden_features : u64 = 500;
130 |     const out_features : u64 = 10;
131 |     const num_epochs : u64 = 5;
132 |     const learning_rate : f32 = 0.001;
133 | 
134 |     var gen = std.rand.Xoroshiro128.init(0);
135 |     var model = try Model.init(alc, &gen.random, in_features, hidden_features, out_features);
136 |     defer model.deinit();
137 |     
138 |     var train_images_raw = try loadImageData(alc, dataset_path, "train-images-idx3-ubyte.gz");
139 |     defer train_images_raw.release();
140 |     var train_images = try preprocessImages(alc, train_images_raw);
141 |     defer train_images.release();
142 |     var train_labels = try loadLabelData(alc, dataset_path, "train-labels-idx1-ubyte.gz");
143 |     defer train_labels.release();
144 |     
145 |     var test_images_raw = try loadImageData(alc, dataset_path, "t10k-images-idx3-ubyte.gz");
146 |     defer test_images_raw.release();
147 |     var test_images = try preprocessImages(alc, test_images_raw);
148 |     defer test_images.release();
149 |     var test_labels = try loadLabelData(alc, dataset_path, "t10k-labels-idx1-ubyte.gz");
150 |     defer test_labels.release();
151 | 
152 |     var pc = try tenet.module.ParameterCollector.init(alc);
153 |     defer pc.deinit();
154 |     try model.collectParameters(pc);
155 | 
156 |     var opt = try tenet.optim.SGD.init(alc, pc.getParameters(), 0.9);
157 |     // var opt = try tenet.optim.Adam.init(alc, pc.getParameters(), 0.9, 0.999, 1e-8);
158 |     defer opt.deinit();
159 | 
160 |     var epoch : u64 = 0;
161 | 
162 |     const num_train_batches = @divTrunc(train_images.data.getShape()[0], batch_size);
163 |     const num_test_batches = @divTrunc(test_images.data.getShape()[0], batch_size);
164 | 
165 |     while (epoch < num_epochs) : (epoch += 1) {
166 |         std.debug.print("epoch {}\n", .{epoch});
167 |         var batch_index : u64 = 0;
168 |         var start = std.time.milliTimestamp();
169 |         var image_count : u64 = 0;
170 | 
171 | 
172 |         while (batch_index < num_train_batches) : (batch_index += 1) {
173 |             var input = train_images.narrowView(&[_]u64{batch_size * batch_index, 0, 0}, &[_]u64{batch_size, 28, 28});
174 |             var input_flat = input.reshapeView(&[_]u64{batch_size, in_features});
175 |             var labels = train_labels.narrowView(&[_]u64{batch_size * batch_index}, &[_]u64{batch_size});
176 |             var logprobs = try model.forward(alc, input_flat);
177 |             defer logprobs.release();
178 |             var loss = try tenet.funcs.nllLoss(alc, logprobs, labels);
179 |             defer loss.release();
180 | 
181 |             try opt.zeroGrad();
182 |             try tenet.tensor.backwardScalarAlloc(alc, loss);
183 |             try opt.step(learning_rate);
184 | 
185 |             var end = std.time.milliTimestamp();
186 |             image_count += batch_size;
187 |             if (batch_index % 100 == 0) {
188 |                 var rate = @intToFloat(f32, image_count) / ((@intToFloat(f32, end - start) / 1000));
189 |                 std.debug.print("train step batch_index {} num_train_batches {} loss {} rate {}\n", .{batch_index, num_train_batches, loss.data.getItem(f32), @floatToInt(u64, rate)});
190 |             }
191 |         }
192 | 
193 |         batch_index = 0;
194 |         var total : u64 = 0;
195 |         var correct : u64 = 0;
196 |         while (batch_index < num_test_batches) : (batch_index += 1) {
197 |             var input = test_images.narrowView(&[_]u64{batch_size * batch_index, 0, 0}, &[_]u64{batch_size, 28, 28});
198 |             var input_flat = input.reshapeView(&[_]u64{batch_size, in_features});
199 |             var logprobs = try model.forward(alc, input_flat);
200 |             defer logprobs.release();
201 |             var labels = test_labels.narrowView(&[_]u64{batch_size * batch_index}, &[_]u64{batch_size});
202 |             var dims = [_]u64{0};
203 |             var correct_count = try tenet.array.expr(alc, "reduce_sum(reduce_arg_max(logprobs, 1) == labels, dims)", .{.logprobs=logprobs.data, .labels=labels.data, .dims=Array.flatFromBuffer(u64, &dims)});
204 |             defer correct_count.release();
205 |             total += input.data.getShape()[0];
206 |             correct += correct_count.getItem(u64);
207 |         }
208 |         var accuracy : f32 = @intToFloat(f32, correct) / @intToFloat(f32, total);
209 |         std.debug.print("test step correct {} total {} accuracy {}%\n", .{correct, total, @floatToInt(u64, accuracy * 100)});
210 |     }
211 | }
212 | 
213 | 


--------------------------------------------------------------------------------
/src/tensor.zig:
--------------------------------------------------------------------------------
   1 | // A wrapper for an N-dimensional array that keeps track of the computation graph
   2 | // so that gradients can be calculated with respect to the inputs of the graph.
   3 | //
   4 | // The underlying array is available as the .data property and, if REQUIRES_GRAD
   5 | // is set for the Tensor, the .grad property will contain a second array the same
   6 | // shape and dtype as the array. The gradient will be placed in that array
   7 | // when it is calculated using backwardAlloc().
   8 | //
   9 | // These are reference counted, so you can call .retain() and .release() on them.
  10 | //
  11 | // Tensors can be provided with the data and grad arrays, but the following
  12 | // things will still be allocated:
  13 | //
  14 | //   grad_record: these are allocated whenever the Tensor is created as the result of an operation
  15 | //                      and deallocated when the Tensor is deallocated
  16 | //   ref_counter: keeps track of the reference count of the Tensor, which is separate from the
  17 | //                      reference counts of the data and grad arrays.  When the reference count
  18 | //                      hits zero, the data and grad will have their reference counts decremented by 1.
  19 | 
  20 | const std = @import("std");
  21 | const array = @import("array.zig");
  22 | const Array = array.Array;
  23 | const DType = array.DType;
  24 | const reference_counter = @import("reference_counter.zig");
  25 | const ReferenceCounter = reference_counter.ReferenceCounter;
  26 | 
  27 | pub const NO_FLAGS = 0;
  28 | pub const REQUIRES_GRAD = 1;
  29 | pub const IS_BRANCH = 2;
  30 | 
  31 | const ForwardError = error{OutOfMemory};
  32 | 
  33 | const BackwardError = error{OutOfMemory};
  34 | 
  35 | const ForwardFn = fn (alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array;
  36 | 
  37 | const BackwardFn = fn (alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void;
  38 | 
  39 | const DeallocFn = fn (alc: *std.mem.Allocator, extra_args_ptr: u64) void;
  40 | 
  41 | const GradientRecord = struct {
  42 |     inputs: []Tensor,
  43 |     extra_args_ptr: u64,
  44 |     output: Tensor,
  45 |     grad_output: ?Array,
  46 |     backward_fn: BackwardFn,
  47 |     dealloc_fn: ?DeallocFn,
  48 |     alc: *std.mem.Allocator,
  49 | 
  50 |     const Self = @This();
  51 | 
  52 |     pub fn alloc(alc: *std.mem.Allocator, inputs: []Tensor, extra_args_ptr: u64, output: Tensor, backward_fn: BackwardFn, dealloc_fn: ?DeallocFn) !Self {
  53 |         var i = try alc.alloc(Tensor, inputs.len);
  54 |         errdefer alc.free(i);
  55 |         std.mem.copy(Tensor, i, inputs);
  56 |         for (i) |*t| {
  57 |             t.retain();
  58 |         }
  59 |         // we don't keep a reference to the output tensor, since the output tensor will
  60 |         // own this GradientRecord
  61 |         return Self{ .inputs = i, .extra_args_ptr = extra_args_ptr, .output = output, .grad_output = null, .alc = alc, .backward_fn = backward_fn, .dealloc_fn = dealloc_fn };
  62 |     }
  63 | 
  64 |     pub fn dealloc(self: *Self) void {
  65 |         for (self.inputs) |*t| {
  66 |             t.release();
  67 |         }
  68 |         if (self.grad_output != null) {
  69 |             @panic("grad_output present on GradientRecord");
  70 |         }
  71 |         self.alc.free(self.inputs);
  72 |         if (self.dealloc_fn) |dealloc_fn| {
  73 |             dealloc_fn(self.alc, self.extra_args_ptr);
  74 |         }
  75 |     }
  76 | 
  77 |     pub fn format(
  78 |         self: Self,
  79 |         comptime fmt: []const u8,
  80 |         options: std.fmt.FormatOptions,
  81 |         writer: anytype,
  82 |     ) !void {
  83 |         if (fmt.len != 0) {
  84 |             @compileError("Unknown format character: '" ++ f ++ "'");
  85 |         }
  86 |         try std.fmt.format(writer, "GradientRecord(num_inputs={}, has_grad_output={})", .{ self.inputs.len, self.grad_output != null });
  87 |     }
  88 | };
  89 | 
  90 | fn has_flag(flags: u64, flag: u64) bool {
  91 |     return flags & flag == flag;
  92 | }
  93 | 
  94 | pub const Tensor = struct {
  95 |     data: Array,
  96 |     grad: ?Array,
  97 |     requires_grad: bool,
  98 |     is_leaf: bool,
  99 |     grad_record: ?*GradientRecord,
 100 |     ref_counter: ?*ReferenceCounter,
 101 |     alc: ?*std.mem.Allocator,
 102 | 
 103 |     const Self = @This();
 104 | 
 105 |     fn alloc(alc: *std.mem.Allocator, data: Array, grad: ?Array, flags: u64) !Self {
 106 |         var requires_grad = has_flag(flags, REQUIRES_GRAD);
 107 |         var is_leaf = !has_flag(flags, IS_BRANCH);
 108 |         if (requires_grad and !(data.dtype == array.DType.f32 or data.dtype == array.DType.f64)) {
 109 |             @panic("grad requires floating point dtype");
 110 |         }
 111 |         var grad_array: ?Array = grad;
 112 |         if (is_leaf and requires_grad and grad == null) {
 113 |             grad_array = try array.zerosLikeAlloc(alc, data);
 114 |         }
 115 |         var ref_counter = try alc.create(ReferenceCounter);
 116 |         ref_counter.* = ReferenceCounter.init();
 117 |         return Self{ .data = data, .grad = grad_array, .requires_grad = requires_grad, .is_leaf = is_leaf, .grad_record = null, .ref_counter = ref_counter, .alc = alc };
 118 |     }
 119 | 
 120 |     pub fn allocWithValue(comptime T: type, alc: *std.mem.Allocator, shape: []const u64, value: T, flags: u64) !Self {
 121 |         var data = try Array.allocWithValue(T, alc, shape, value);
 122 |         return Self.alloc(alc, data, null, flags);
 123 |     }
 124 | 
 125 |     pub fn allocWithString(comptime T: type, alc: *std.mem.Allocator, str: []const u8, flags: u64) !Self {
 126 |         var data = try Array.allocWithString(T, alc, str);
 127 |         return Self.alloc(alc, data, null, flags);
 128 |     }
 129 | 
 130 |     pub fn allocWithRange(comptime T: type, alc: *std.mem.Allocator, shape: []const u64, start: T, step: T, flags: u64) !Self {
 131 |         var data = try Array.allocWithRange(T, alc, shape, start, step);
 132 |         return Self.alloc(alc, data, null, flags);
 133 |     }
 134 | 
 135 |     pub fn allocWithData(alc: *std.mem.Allocator, data: Array, flags: u64) !Self {
 136 |         data.retain();
 137 |         return Self.alloc(alc, data, null, flags);
 138 |     }
 139 | 
 140 |     pub fn allocWithDataAndGrad(alc: *std.mem.Allocator, data: Array, grad: Array, flags: u64) !Self {
 141 |         data.retain();
 142 |         grad.retain();
 143 |         if (!has_flag(flags, REQUIRES_GRAD)) {
 144 |             @panic("must require grad if grad is specified");
 145 |         }
 146 |         array.assertShapesAreTheSame(data, grad);
 147 |         array.assertTypesAreTheSame(data, grad);
 148 |         return Self.alloc(alc, data, grad, flags);
 149 |     }
 150 | 
 151 |     pub fn allocWithBuffers(comptime T: type, alc: *std.mem.Allocator, shape: []const u64, data_buf: []T, grad_buf: []T) !Self {
 152 |         var data = Array.fromBuffer(T, shape, data_buf);
 153 |         var grad = Array.fromBuffer(T, shape, grad_buf);
 154 |         return Self.alloc(alc, data, grad, REQUIRES_GRAD);
 155 |     }
 156 | 
 157 |     pub fn fromBuffer(comptime T: type, shape: []const u64, data_buf: []T) Self {
 158 |         var data = Array.fromBuffer(T, shape, data_buf);
 159 |         return Self{ .data = data, .grad = null, .requires_grad = false, .is_leaf = true, .grad_record = null, .ref_counter = null, .alc = null };
 160 |     }
 161 | 
 162 |     pub fn flatFromBuffer(comptime T: type, data_buf: []T) Self {
 163 |         return Self.fromBuffer(T, &[_]u64{data_buf.len}, data_buf);
 164 |     }
 165 | 
 166 |     pub fn scalarFromBuffer(comptime T: type, data_buf: []T) Self {
 167 |         return Self.fromBuffer(T, &[_]u64{}, data_buf);
 168 |     }
 169 | 
 170 |     pub fn getDType(self: Self) DType {
 171 |         return self.data.dtype;
 172 |     }
 173 | 
 174 |     pub fn narrowView(self: Self, pos: []const u64, shape: []const u64) Self {
 175 |         var grad = self.grad;
 176 |         if (grad != null) {
 177 |             grad = grad.?.narrowView(pos, shape);
 178 |         }
 179 |         return Self{ .data = self.data.narrowView(pos, shape), .grad = grad, .requires_grad = self.requires_grad, .is_leaf = self.is_leaf, .grad_record = self.grad_record, .ref_counter = self.ref_counter, .alc = self.alc };
 180 |     }
 181 | 
 182 |     pub fn reshapeView(self: Self, shape: []const u64) Self {
 183 |         var grad = self.grad;
 184 |         if (grad != null) {
 185 |             grad = grad.?.reshapeView(shape);
 186 |         }
 187 |         return Self{ .data = self.data.reshapeView(shape), .grad = grad, .requires_grad = self.requires_grad, .is_leaf = self.is_leaf, .grad_record = self.grad_record, .ref_counter = self.ref_counter, .alc = self.alc };
 188 |     }
 189 | 
 190 |     pub fn retain(self: Self) void {
 191 |         if (self.ref_counter) |ref_counter| {
 192 |             ref_counter.increment();
 193 |         }
 194 |     }
 195 | 
 196 |     pub fn release(self: Self) void {
 197 |         if (self.ref_counter) |ref_counter| {
 198 |             if (ref_counter.decrement()) {
 199 |                 self.data.release();
 200 |                 if (self.grad) |g| {
 201 |                     g.release();
 202 |                 }
 203 |                 var alc = self.alc.?;
 204 |                 alc.destroy(ref_counter);
 205 |                 // if the tensor has a grad record, it's owned by the tensor
 206 |                 if (self.grad_record) |gr| {
 207 |                     gr.dealloc();
 208 |                     alc.destroy(gr);
 209 |                 }
 210 |             }
 211 |         }
 212 |     }
 213 | 
 214 |     pub fn format(
 215 |         self: Self,
 216 |         comptime fmt: []const u8,
 217 |         options: std.fmt.FormatOptions,
 218 |         writer: anytype,
 219 |     ) !void {
 220 |         if (fmt.len != 0) {
 221 |             @compileError("Unknown format character: '" ++ f ++ "'");
 222 |         }
 223 |         try std.fmt.format(writer, "Tensor(is_leaf={}, requires_grad={}, data={}, grad={}, grad_record={})", .{ self.is_leaf, self.requires_grad, self.data, self.grad, self.grad_record });
 224 |     }
 225 | };
 226 | 
 227 | test "format_tensor" {
 228 |     var t = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3 }, 1.0, REQUIRES_GRAD);
 229 |     defer t.release();
 230 |     var t2 = try timesAlloc(std.testing.allocator, t, t);
 231 |     defer t2.release();
 232 |     std.debug.print("{}\n", .{t2});
 233 | }
 234 | 
 235 | pub fn zerosAlloc(alc: *std.mem.Allocator, dtype: DType, shape: []const u64, flags: u64) !Tensor {
 236 |     var data = try array.zerosAlloc(alc, dtype, shape);
 237 |     var t = try Tensor.allocWithData(alc, data, flags);
 238 |     data.release();
 239 |     return t;
 240 | }
 241 | 
 242 | pub fn zerosLikeAlloc(alc: *std.mem.Allocator, t: Tensor, flags: u64) !Tensor {
 243 |     return zerosAlloc(alc, t.getDType(), t.data.getShape(), flags);
 244 | }
 245 | 
 246 | pub fn onesAlloc(alc: *std.mem.Allocator, dtype: DType, shape: []const u64, flags: u64) !Tensor {
 247 |     var data = try array.onesAlloc(alc, dtype, shape);
 248 |     var t = try Tensor.allocWithData(alc, data, flags);
 249 |     data.release();
 250 |     return t;
 251 | }
 252 | 
 253 | pub fn onesLikeAlloc(alc: *std.mem.Allocator, t: Tensor, flags: u64) !Tensor {
 254 |     return onesAlloc(alc, t.getDType(), t.data.getShape(), flags);
 255 | }
 256 | 
 257 | pub fn detachAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 258 |     return try Tensor.allocWithData(alc, x.data, NO_FLAGS);
 259 | }
 260 | 
 261 | pub fn scalarAlloc(alc: *std.mem.Allocator, dtype: DType, value: f64) !Tensor {
 262 |     var data = try array.scalarAlloc(alc, dtype, value);
 263 |     var output = try Tensor.allocWithData(alc, data, 0);
 264 |     data.release();
 265 |     return output;
 266 | }
 267 | 
 268 | pub fn plusAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 269 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, plusForwardAlloc, plusBackwardAlloc, null);
 270 | }
 271 | 
 272 | pub fn plusForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 273 |     return try array.plusAlloc(alc, inputs[0], inputs[1]);
 274 | }
 275 | 
 276 | pub fn plusBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 277 |     for (grad_inputs_out) |maybe_grad_input| {
 278 |         if (maybe_grad_input) |grad_input| {
 279 |             array.bcastsum(grad_output, grad_input);
 280 |         }
 281 |     }
 282 | }
 283 | 
 284 | pub fn uplusAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 285 |     x.retain();
 286 |     return x;
 287 | }
 288 | 
 289 | pub fn minusAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 290 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, minusForwardAlloc, minusBackwardAlloc, null);
 291 | }
 292 | 
 293 | pub fn minusForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 294 |     return try array.minusAlloc(alc, inputs[0], inputs[1]);
 295 | }
 296 | 
 297 | pub fn minusBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 298 |     for (grad_inputs_out) |maybe_grad_input, i| {
 299 |         if (maybe_grad_input) |grad_input| {
 300 |             array.bcastsum(grad_output, grad_input);
 301 |             if (i == 1) {
 302 |                 var negative_one = try array.scalarAlloc(alc, grad_input.dtype, -1);
 303 |                 defer negative_one.release();
 304 |                 array.times(grad_input, negative_one, grad_input);
 305 |             }
 306 |         }
 307 |     }
 308 | }
 309 | 
 310 | pub fn logAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 311 |     return autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, 0, logForwardAlloc, logBackwardAlloc, null);
 312 | }
 313 | 
 314 | pub fn logForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 315 |     return try array.logAlloc(alc, inputs[0]);
 316 | }
 317 | 
 318 | pub fn logBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 319 |     if (grad_inputs_out[0]) |grad| {
 320 |         var x = inputs[0];
 321 |         var grad_input = try array.expr(alc, "g ./ x", .{ .x = x, .g = grad_output });
 322 |         defer grad_input.release();
 323 |         array.copy(grad_input, grad);
 324 |     }
 325 | }
 326 | 
 327 | test "log_gradcheck" {
 328 |     var a = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 3, 4 }, 1.0, 1.0, REQUIRES_GRAD);
 329 |     defer a.release();
 330 |     var inputs = [_]Tensor{a};
 331 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, logForwardAlloc, logBackwardAlloc, &inputs, 0));
 332 | }
 333 | 
 334 | pub fn log2Alloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 335 |     return autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, 0, log2ForwardAlloc, log2BackwardAlloc, null);
 336 | }
 337 | 
 338 | pub fn log2ForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 339 |     return try array.log2Alloc(alc, inputs[0]);
 340 | }
 341 | 
 342 | pub fn log2BackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 343 |     if (grad_inputs_out[0]) |grad| {
 344 |         var x = inputs[0];
 345 |         var grad_input = try array.expr(alc, "1.0 ./ (x .* log(2.0)) .* g", .{ .x = x, .g = grad_output });
 346 |         defer grad_input.release();
 347 |         array.copy(grad_input, grad);
 348 |     }
 349 | }
 350 | 
 351 | test "log2_gradcheck" {
 352 |     var a = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 3, 4 }, 1.0, 1.0, REQUIRES_GRAD);
 353 |     defer a.release();
 354 |     var inputs = [_]Tensor{a};
 355 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, log2ForwardAlloc, log2BackwardAlloc, &inputs, 0));
 356 | }
 357 | 
 358 | pub fn expAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 359 |     return autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, 0, expForwardAlloc, expBackwardAlloc, null);
 360 | }
 361 | 
 362 | pub fn expForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 363 |     return try array.expAlloc(alc, inputs[0]);
 364 | }
 365 | 
 366 | pub fn expBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 367 |     if (grad_inputs_out[0]) |grad| {
 368 |         var x = inputs[0];
 369 |         var grad_input = try array.expr(alc, "exp(x) .* g", .{ .x = x, .g = grad_output });
 370 |         defer grad_input.release();
 371 |         array.copy(grad_input, grad);
 372 |     }
 373 | }
 374 | 
 375 | test "exp_gradcheck" {
 376 |     var a = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 3, 4 }, 1.0, 1.0, REQUIRES_GRAD);
 377 |     defer a.release();
 378 |     var inputs = [_]Tensor{a};
 379 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, expForwardAlloc, expBackwardAlloc, &inputs, 0));
 380 | }
 381 | 
 382 | pub fn uminusAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 383 |     return autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, 0, uminusForwardAlloc, uminusBackwardAlloc, null);
 384 | }
 385 | 
 386 | pub fn uminusForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 387 |     return try array.uminusAlloc(alc, inputs[0]);
 388 | }
 389 | 
 390 | pub fn uminusBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 391 |     if (grad_inputs_out[0]) |grad_input| {
 392 |         array.copy(grad_output, grad_input);
 393 |         var negative_one = try array.scalarAlloc(alc, grad_input.dtype, -1);
 394 |         defer negative_one.release();
 395 |         array.times(grad_input, negative_one, grad_input);
 396 |     }
 397 | }
 398 | 
 399 | pub fn timesAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 400 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, timesForwardAlloc, timesBackwardAlloc, null);
 401 | }
 402 | 
 403 | pub fn timesForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 404 |     return try array.timesAlloc(alc, inputs[0], inputs[1]);
 405 | }
 406 | 
 407 | pub fn timesBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 408 |     if (inputs.len != 2) {
 409 |         @panic("invalid number of inputs");
 410 |     }
 411 | 
 412 |     var x = inputs[0];
 413 |     var y = inputs[1];
 414 | 
 415 |     var x_grad_to_sum = try array.timesAlloc(alc, grad_output, y);
 416 |     defer x_grad_to_sum.release();
 417 |     if (grad_inputs_out[0]) |x_grad| {
 418 |         array.bcastsum(x_grad_to_sum, x_grad);
 419 |     }
 420 | 
 421 |     var y_grad_to_sum = try array.timesAlloc(alc, grad_output, x);
 422 |     defer y_grad_to_sum.release();
 423 |     if (grad_inputs_out[1]) |y_grad| {
 424 |         array.bcastsum(y_grad_to_sum, y_grad);
 425 |     }
 426 | }
 427 | 
 428 | pub fn mtimesAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 429 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, mtimesForwardAlloc, mtimesBackwardAlloc, null);
 430 | }
 431 | 
 432 | pub fn mtimesForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 433 |     return try array.mtimesAlloc(alc, inputs[0], inputs[1]);
 434 | }
 435 | 
 436 | pub fn mtimesBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 437 |     if (inputs.len != 2) {
 438 |         @panic("invalid number of inputs");
 439 |     }
 440 | 
 441 |     var x = inputs[0];
 442 |     var y = inputs[1];
 443 | 
 444 |     if (grad_inputs_out[0]) |x_grad| {
 445 |         var x_grad_to_copy = try array.expr(alc, "g * y'", .{ .y = y, .g = grad_output });
 446 |         defer x_grad_to_copy.release();
 447 |         array.copy(x_grad_to_copy, x_grad);
 448 |     }
 449 | 
 450 |     if (grad_inputs_out[1]) |y_grad| {
 451 |         var y_grad_to_copy = try array.expr(alc, "x' * g", .{ .x = x, .g = grad_output });
 452 |         defer y_grad_to_copy.release();
 453 |         array.copy(y_grad_to_copy, y_grad);
 454 |     }
 455 | }
 456 | 
 457 | test "mtimes_gradcheck" {
 458 |     var a = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 3, 4 }, 0.0, 1.0, REQUIRES_GRAD);
 459 |     defer a.release();
 460 |     var b = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 4, 3 }, 1.0, 2.0, REQUIRES_GRAD);
 461 |     defer b.release();
 462 |     var inputs = [_]Tensor{ a, b };
 463 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, mtimesForwardAlloc, mtimesBackwardAlloc, &inputs, 0));
 464 | }
 465 | 
 466 | pub fn divideAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 467 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, divideForwardAlloc, divideBackwardAlloc, null);
 468 | }
 469 | 
 470 | pub fn divideForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 471 |     return try array.divideAlloc(alc, inputs[0], inputs[1]);
 472 | }
 473 | 
 474 | pub fn divideBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 475 |     if (inputs.len != 2) {
 476 |         @panic("invalid number of inputs");
 477 |     }
 478 | 
 479 |     var x = inputs[0];
 480 |     var y = inputs[1];
 481 | 
 482 |     if (grad_inputs_out[0]) |x_grad| {
 483 |         var x_grad_to_sum = try array.divideAlloc(alc, grad_output, y);
 484 |         defer x_grad_to_sum.release();
 485 |         array.bcastsum(x_grad_to_sum, x_grad);
 486 |     }
 487 | 
 488 |     if (grad_inputs_out[1]) |y_grad| {
 489 |         var y_grad_to_sum = try array.expr(alc, "-x ./ (y .* y) .* g", .{ .x = x, .y = y, .g = grad_output });
 490 |         defer y_grad_to_sum.release();
 491 |         array.bcastsum(y_grad_to_sum, y_grad);
 492 |     }
 493 | }
 494 | 
 495 | pub fn powerAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 496 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, powerForwardAlloc, powerBackwardAlloc, null);
 497 | }
 498 | 
 499 | pub fn powerForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 500 |     return try array.powerAlloc(alc, inputs[0], inputs[1]);
 501 | }
 502 | 
 503 | pub fn powerBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 504 |     if (inputs.len != 2) {
 505 |         @panic("invalid number of inputs");
 506 |     }
 507 | 
 508 |     var x = inputs[0];
 509 |     var y = inputs[1];
 510 | 
 511 |     if (grad_inputs_out[0]) |x_grad| {
 512 |         var x_grad_to_sum = try array.expr(alc, "y .* (x .^ (y-1)) .* g", .{ .x = x, .y = y, .g = grad_output });
 513 |         defer x_grad_to_sum.release();
 514 |         array.bcastsum(x_grad_to_sum, x_grad);
 515 |     }
 516 | 
 517 |     if (grad_inputs_out[1]) |y_grad| {
 518 |         var y_grad_to_sum = try array.expr(alc, "log(x) .* (x .^ y) .* g", .{ .x = x, .y = y, .g = grad_output });
 519 |         defer y_grad_to_sum.release();
 520 |         array.bcastsum(y_grad_to_sum, y_grad);
 521 |     }
 522 | }
 523 | 
 524 | pub fn maxAlloc(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
 525 |     return autogradAlloc(alc, &[_]Tensor{ x, y }, &[_]Array{ x.data, y.data }, 0, maxForwardAlloc, maxBackwardAlloc, null);
 526 | }
 527 | 
 528 | pub fn maxForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 529 |     return try array.maxAlloc(alc, inputs[0], inputs[1]);
 530 | }
 531 | 
 532 | pub fn maxBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 533 |     if (inputs.len != 2) {
 534 |         @panic("invalid number of inputs");
 535 |     }
 536 | 
 537 |     var x = inputs[0];
 538 |     var y = inputs[1];
 539 | 
 540 |     if (grad_inputs_out[0]) |x_grad| {
 541 |         var x_grad_to_sum = try array.expr(alc, "(x > y) .* g", .{ .x = x, .y = y, .g = grad_output });
 542 |         defer x_grad_to_sum.release();
 543 |         array.bcastsum(x_grad_to_sum, x_grad);
 544 |     }
 545 | 
 546 |     if (grad_inputs_out[1]) |y_grad| {
 547 |         var y_grad_to_sum = try array.expr(alc, "(y >= x) .* g", .{ .x = x, .y = y, .g = grad_output });
 548 |         defer y_grad_to_sum.release();
 549 |         array.bcastsum(y_grad_to_sum, y_grad);
 550 |     }
 551 | }
 552 | 
 553 | test "max_gradcheck" {
 554 |     // gradcheck doesn't work for max because it is discontinuous, so just check that the behavior is similar to pytorch
 555 |     // torch.max(a, b)
 556 |     // a.grad tensor([1., 0., 0.], dtype=torch.float64)
 557 |     // b.grad tensor([0., 1., 1.], dtype=torch.float64)
 558 |     // torch.max(b, a)
 559 |     // a.grad tensor([1., 0., 1.], dtype=torch.float64)
 560 |     // b.grad tensor([0., 1., 0.], dtype=torch.float64)
 561 |     {
 562 |         var a_buf = [_]f32{ 0.0, 0.0, 2.0 };
 563 |         var a_grad_buf = [_]f32{ 0.0, 0.0, 0.0 };
 564 |         const a = try Tensor.allocWithBuffers(f32, std.testing.allocator, &[_]u64{3}, &a_buf, &a_grad_buf);
 565 |         defer a.release();
 566 |         var b_buf = [_]f32{ -1.0, 1.0, 2.0 };
 567 |         var b_grad_buf = [_]f32{ 0.0, 0.0, 0.0 };
 568 |         const b = try Tensor.allocWithBuffers(f32, std.testing.allocator, &[_]u64{3}, &b_buf, &b_grad_buf);
 569 |         defer b.release();
 570 |         const c = try maxAlloc(std.testing.allocator, a, b);
 571 |         defer c.release();
 572 |         var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{3}, 3.0, 0);
 573 |         defer d.release();
 574 |         try backwardAlloc(std.testing.allocator, c, d);
 575 |         var a_grad_expected_buf = [_]f32{ 3.0, 0.0, 0.0 };
 576 |         const a_grad_expected = Array.fromBuffer(f32, &[_]u64{3}, &a_grad_expected_buf);
 577 |         var b_grad_expected_buf = [_]f32{ 0.0, 3.0, 3.0 };
 578 |         const b_grad_expected = Array.fromBuffer(f32, &[_]u64{3}, &b_grad_expected_buf);
 579 |         std.testing.expect(array.equal(a.grad.?, a_grad_expected));
 580 |         std.testing.expect(array.equal(b.grad.?, b_grad_expected));
 581 |     }
 582 | 
 583 |     {
 584 |         var a_buf = [_]f32{ 0.0, 0.0, 2.0 };
 585 |         var a_grad_buf = [_]f32{ 0.0, 0.0, 0.0 };
 586 |         const a = try Tensor.allocWithBuffers(f32, std.testing.allocator, &[_]u64{3}, &a_buf, &a_grad_buf);
 587 |         defer a.release();
 588 |         var b_buf = [_]f32{ -1.0, 1.0, 2.0 };
 589 |         var b_grad_buf = [_]f32{ 0.0, 0.0, 0.0 };
 590 |         const b = try Tensor.allocWithBuffers(f32, std.testing.allocator, &[_]u64{3}, &b_buf, &b_grad_buf);
 591 |         defer b.release();
 592 |         const c = try maxAlloc(std.testing.allocator, b, a);
 593 |         defer c.release();
 594 |         var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{3}, 3.0, 0);
 595 |         defer d.release();
 596 |         try backwardAlloc(std.testing.allocator, c, d);
 597 |         var a_grad_expected_buf = [_]f32{ 3.0, 0.0, 3.0 };
 598 |         const a_grad_expected = Array.fromBuffer(f32, &[_]u64{3}, &a_grad_expected_buf);
 599 |         var b_grad_expected_buf = [_]f32{ 0.0, 3.0, 0.0 };
 600 |         const b_grad_expected = Array.fromBuffer(f32, &[_]u64{3}, &b_grad_expected_buf);
 601 |         std.testing.expect(array.equal(a.grad.?, a_grad_expected));
 602 |         std.testing.expect(array.equal(b.grad.?, b_grad_expected));
 603 |     }
 604 | }
 605 | 
 606 | const CastArgs = struct {
 607 |     dtype: DType,
 608 | };
 609 | 
 610 | pub fn castAlloc(alc: *std.mem.Allocator, x: Tensor, dtype: DType) !Tensor {
 611 |     if (x.data.dtype == dtype) {
 612 |         x.retain();
 613 |         return x;
 614 |     }
 615 |     // we only need this temporarily for the autograd interface, the backward pass can figure the dtype out
 616 |     var cast_args = CastArgs{
 617 |         .dtype = dtype,
 618 |     };
 619 |     var result = autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, @ptrToInt(&cast_args), castForwardAlloc, castBackwardAlloc, null);
 620 |     return result;
 621 | }
 622 | 
 623 | pub fn castForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 624 |     var cast_args_ptr = @intToPtr(*CastArgs, extra_args_ptr);
 625 |     return try array.castAlloc(alc, inputs[0], cast_args_ptr.dtype);
 626 | }
 627 | 
 628 | pub fn castBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 629 |     if (inputs.len != 1) {
 630 |         @panic("invalid number of inputs");
 631 |     }
 632 | 
 633 |     if (grad_inputs_out[0]) |grad| {
 634 |         array.cast(grad_output, grad);
 635 |     }
 636 | }
 637 | 
 638 | test "cast_grad" {
 639 |     var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 1.0, REQUIRES_GRAD);
 640 |     defer a.release();
 641 |     var b = try castAlloc(std.testing.allocator, a, DType.f32);
 642 |     defer b.release();
 643 |     var c = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3 }, 3.0, 0);
 644 |     defer c.release();
 645 |     var d = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 3.0, 0);
 646 |     defer d.release();
 647 |     try backwardAlloc(std.testing.allocator, b, c);
 648 |     std.testing.expect(array.equal(a.grad.?, d.data));
 649 | }
 650 | 
 651 | const GatherArgs = struct {
 652 |     dim: u64,
 653 | };
 654 | 
 655 | pub fn gatherAlloc(alc: *std.mem.Allocator, x: Tensor, dim: u64, index: Tensor) !Tensor {
 656 |     var args = try alc.create(GatherArgs);
 657 |     args.* = GatherArgs{
 658 |         .dim = dim,
 659 |     };
 660 |     if (index.requires_grad) {
 661 |         @panic("Index cannot have requires_grad set");
 662 |     }
 663 |     return autogradAlloc(alc, &[_]Tensor{ x, index }, &[_]Array{ x.data, index.data }, @ptrToInt(args), gatherForwardAlloc, gatherBackwardAlloc, gatherDealloc);
 664 | }
 665 | 
 666 | pub fn gatherForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 667 |     var args_ptr = @intToPtr(*GatherArgs, extra_args_ptr);
 668 |     return try array.gatherAlloc(alc, inputs[0], args_ptr.dim, inputs[1]);
 669 | }
 670 | 
 671 | pub fn gatherBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 672 |     if (inputs.len != 2) {
 673 |         @panic("invalid number of inputs");
 674 |     }
 675 | 
 676 |     var index = inputs[1];
 677 |     var args_ptr = @intToPtr(*GatherArgs, extra_args_ptr);
 678 |     var dim = args_ptr.dim;
 679 |     if (grad_inputs_out[0]) |grad| {
 680 |         array.scatter(grad_output, grad, dim, index);
 681 |     }
 682 | }
 683 | 
 684 | fn gatherDealloc(alc: *std.mem.Allocator, extra_args_ptr: u64) void {
 685 |     var args = @intToPtr(*GatherArgs, extra_args_ptr);
 686 |     alc.destroy(args);
 687 | }
 688 | 
 689 | test "gather_grad" {
 690 |     const input = try Tensor.allocWithRange(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 0.0, 1.0, REQUIRES_GRAD);
 691 |     defer input.release();
 692 |     const index = try Tensor.allocWithString(u64, std.testing.allocator, "[[[0], [1], [2]], [[3], [2], [1]]]", NO_FLAGS);
 693 |     defer index.release();
 694 |     var output = try gatherAlloc(std.testing.allocator, input, 2, index);
 695 |     defer output.release();
 696 |     const expected_output = try Tensor.allocWithString(f32, std.testing.allocator, "[[[0], [5], [10]], [[15], [18], [21]]]", NO_FLAGS);
 697 |     defer expected_output.release();
 698 |     std.testing.expect(array.equal(output.data, expected_output.data));
 699 |     const grad_output = try Tensor.allocWithValue(f32, std.testing.allocator, output.data.getShape(), 3.0, NO_FLAGS);
 700 |     defer grad_output.release();
 701 |     try backwardAlloc(std.testing.allocator, output, grad_output);
 702 |     const expected_grad = try Tensor.allocWithString(f32, std.testing.allocator, "[[[3,0,0,0], [0,3,0,0], [0,0,3,0]], [[0,0,0,3], [0,0,3,0], [0,3,0,0]]]", NO_FLAGS);
 703 |     defer expected_grad.release();
 704 |     std.testing.expect(array.equal(input.grad.?, expected_grad.data));
 705 | }
 706 | 
 707 | const ReduceArgs = struct {
 708 |     dims: array.DimArray,
 709 |     keepdims: bool,
 710 |     op: array.ReduceOperation,
 711 | };
 712 | 
 713 | fn reduceAlloc(alc: *std.mem.Allocator, x: Tensor, dims: []const u64, keepdims: bool, op: array.ReduceOperation) !Tensor {
 714 |     var args = try alc.create(ReduceArgs);
 715 |     args.* = ReduceArgs{
 716 |         .dims = array.DimArray.init(dims),
 717 |         .keepdims = keepdims,
 718 |         .op = op,
 719 |     };
 720 |     var result = autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, @ptrToInt(args), reduceForwardAlloc, reduceBackwardAlloc, reduceDealloc);
 721 |     return result;
 722 | }
 723 | 
 724 | fn reduceDealloc(alc: *std.mem.Allocator, extra_args_ptr: u64) void {
 725 |     var args = @intToPtr(*ReduceArgs, extra_args_ptr);
 726 |     alc.destroy(args);
 727 | }
 728 | 
 729 | fn reduceForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 730 |     var args = @intToPtr(*ReduceArgs, extra_args_ptr);
 731 |     return try array.reduceAlloc(alc, inputs[0], args.dims.getSlice(), args.keepdims, args.op);
 732 | }
 733 | 
 734 | fn reduceBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 735 |     if (inputs.len != 1) {
 736 |         @panic("Invalid number of inputs");
 737 |     }
 738 | 
 739 |     var args = @intToPtr(*ReduceArgs, extra_args_ptr);
 740 |     var reduced_numel: u64 = 1;
 741 |     var expand_shape = array.DimArray.init(inputs[0].getShape());
 742 |     // if we reduced along a dimension, set its size to 1 in the expanded shape of grad_output
 743 |     for (args.dims.getSlice()) |d| {
 744 |         reduced_numel *= expand_shape.array[d];
 745 |         expand_shape.array[d] = 1;
 746 |     }
 747 |     var expanded_grad_output = grad_output.reshapeView(expand_shape.getSlice());
 748 |     if (grad_inputs_out[0]) |grad| {
 749 |         switch (args.op) {
 750 |             .sum => array.copy(expanded_grad_output, grad),
 751 |             .max => {
 752 |                 if (args.dims.ndim != 1) {
 753 |                     @panic("Too many dimensions for max");
 754 |                 }
 755 |                 var dim = args.dims.array[0];
 756 |                 var index = try array.keepArgMaxAlloc(alc, inputs[0], dim);
 757 |                 defer index.release();
 758 |                 array.scatter(expanded_grad_output, grad, dim, index);
 759 |             },
 760 |             .mean => {
 761 |                 array.copy(expanded_grad_output, grad);
 762 |                 var divisor = try array.scalarAlloc(alc, grad.dtype, @intToFloat(f64, reduced_numel));
 763 |                 defer divisor.release();
 764 |                 array.divide(grad, divisor, grad);
 765 |             },
 766 |         }
 767 |     }
 768 | }
 769 | 
 770 | pub fn reduceSumAlloc(alc: *std.mem.Allocator, x: Tensor, dims: []const u64) !Tensor {
 771 |     return reduceAlloc(alc, x, dims, false, .sum);
 772 | }
 773 | 
 774 | pub fn keepSumAlloc(alc: *std.mem.Allocator, x: Tensor, dims: []const u64) !Tensor {
 775 |     return reduceAlloc(alc, x, dims, true, .sum);
 776 | }
 777 | 
 778 | pub fn reduceMaxAlloc(alc: *std.mem.Allocator, x: Tensor, dim: u64) !Tensor {
 779 |     return reduceAlloc(alc, x, &[_]u64{dim}, false, .max);
 780 | }
 781 | 
 782 | pub fn keepMaxAlloc(alc: *std.mem.Allocator, x: Tensor, dim: u64) !Tensor {
 783 |     return reduceAlloc(alc, x, &[_]u64{dim}, true, .max);
 784 | }
 785 | 
 786 | pub fn reduceMeanAlloc(alc: *std.mem.Allocator, x: Tensor, dims: []const u64) !Tensor {
 787 |     return reduceAlloc(alc, x, dims, false, .mean);
 788 | }
 789 | 
 790 | pub fn keepMeanAlloc(alc: *std.mem.Allocator, x: Tensor, dims: []const u64) !Tensor {
 791 |     return reduceAlloc(alc, x, dims, true, .mean);
 792 | }
 793 | 
 794 | test "reduce_sum_grad" {
 795 |     const a = try Tensor.allocWithRange(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 0.0, 1.0, REQUIRES_GRAD);
 796 |     defer a.release();
 797 |     const b = try reduceSumAlloc(std.testing.allocator, a, &[_]u64{ 1, 2 });
 798 |     defer b.release();
 799 |     var c = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{2}, 3.0, 0);
 800 |     defer c.release();
 801 |     try backwardAlloc(std.testing.allocator, b, c);
 802 |     var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
 803 |     defer d.release();
 804 |     std.testing.expect(array.equal(a.grad.?, d.data));
 805 | }
 806 | 
 807 | test "keep_sum_grad" {
 808 |     const a = try Tensor.allocWithRange(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 0.0, 1.0, REQUIRES_GRAD);
 809 |     defer a.release();
 810 |     const b = try keepSumAlloc(std.testing.allocator, a, &[_]u64{ 1, 2 });
 811 |     defer b.release();
 812 |     var c = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 1, 1 }, 3.0, 0);
 813 |     defer c.release();
 814 |     try backwardAlloc(std.testing.allocator, b, c);
 815 |     var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
 816 |     defer d.release();
 817 |     std.testing.expect(array.equal(a.grad.?, d.data));
 818 | }
 819 | 
 820 | test "reduce_mean_grad" {
 821 |     const input = try Tensor.allocWithString(f32, std.testing.allocator, "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]", REQUIRES_GRAD);
 822 |     defer input.release();
 823 |     const output = try reduceMeanAlloc(std.testing.allocator, input, &[_]u64{1});
 824 |     defer output.release();
 825 |     var grad_output = try Tensor.allocWithValue(f32, std.testing.allocator, output.data.getShape(), 6.0, 0);
 826 |     defer grad_output.release();
 827 |     try backwardAlloc(std.testing.allocator, output, grad_output);
 828 |     const expected_grad_input = try Tensor.allocWithString(f32, std.testing.allocator, "[[2, 2, 2], [2, 2, 2], [2, 2, 2]]", NO_FLAGS);
 829 |     defer expected_grad_input.release();
 830 |     std.testing.expect(array.equal(input.grad.?, expected_grad_input.data));
 831 | }
 832 | 
 833 | test "reduce_max_grad" {
 834 |     const TestCase = struct {
 835 |         input: []const u8,
 836 |         dim: u64,
 837 |         expected_grad: []const u8,
 838 |     };
 839 |     var testcases = [_]TestCase{
 840 |         TestCase{
 841 |             .input = "[[[0,1], [1,0], [1,1]]]",
 842 |             .dim = 2,
 843 |             .expected_grad = "[[[0,3], [3,0], [3,0]]]",
 844 |         },
 845 |         TestCase{
 846 |             .input = "[[[0,1], [1,0], [1,1]]]",
 847 |             .dim = 1,
 848 |             .expected_grad = "[[[0,3], [3,0], [0,0]]]",
 849 |         },
 850 |     };
 851 | 
 852 |     for (testcases) |tc| {
 853 |         const input = try Tensor.allocWithString(f32, std.testing.allocator, tc.input, REQUIRES_GRAD);
 854 |         defer input.release();
 855 |         const output = try reduceMaxAlloc(std.testing.allocator, input, tc.dim);
 856 |         defer output.release();
 857 |         const grad_output = try Tensor.allocWithValue(f32, std.testing.allocator, output.data.getShape(), 3.0, 0);
 858 |         defer grad_output.release();
 859 |         try backwardAlloc(std.testing.allocator, output, grad_output);
 860 |         const expected_grad = try Tensor.allocWithString(f32, std.testing.allocator, tc.expected_grad, 0);
 861 |         defer expected_grad.release();
 862 |         std.testing.expect(array.equal(input.grad.?, expected_grad.data));
 863 |     }
 864 | }
 865 | 
 866 | pub fn reduceSumExprAlloc(alc: *std.mem.Allocator, x: Tensor, dims: Tensor) !Tensor {
 867 |     var dims_cast = try castAlloc(alc, dims, .u64);
 868 |     defer dims_cast.release();
 869 |     var dims_buf = dims_cast.data.getBuffer(u64);
 870 |     return try reduceSumAlloc(alc, x, dims_buf);
 871 | }
 872 | 
 873 | pub fn keepSumExprAlloc(alc: *std.mem.Allocator, x: Tensor, dims: Tensor) !Tensor {
 874 |     var dims_cast = try castAlloc(alc, dims, .u64);
 875 |     defer dims_cast.release();
 876 |     var dims_buf = dims_cast.data.getBuffer(u64);
 877 |     return try keepSumAlloc(alc, x, dims_buf);
 878 | }
 879 | 
 880 | pub fn reduceMaxExprAlloc(alc: *std.mem.Allocator, x: Tensor, dim: Tensor) !Tensor {
 881 |     var dims_cast = try castAlloc(alc, dim, .u64);
 882 |     defer dims_cast.release();
 883 |     return reduceMaxAlloc(alc, x, dims_cast.data.getItem(u64));
 884 | }
 885 | 
 886 | pub fn keepMaxExprAlloc(alc: *std.mem.Allocator, x: Tensor, dim: Tensor) !Tensor {
 887 |     var dims_cast = try castAlloc(alc, dim, .u64);
 888 |     defer dims_cast.release();
 889 |     return keepMaxAlloc(alc, x, dims_cast.data.getItem(u64));
 890 | }
 891 | 
 892 | pub fn reduceMeanExprAlloc(alc: *std.mem.Allocator, x: Tensor, dim: Tensor) !Tensor {
 893 |     var dims_cast = try castAlloc(alc, dim, .u64);
 894 |     defer dims_cast.release();
 895 |     var dims_buf = dims_cast.data.getBuffer(u64);
 896 |     return reduceMeanAlloc(alc, x, dims_buf);
 897 | }
 898 | 
 899 | pub fn keepMeanExprAlloc(alc: *std.mem.Allocator, x: Tensor, dim: Tensor) !Tensor {
 900 |     var dims_cast = try castAlloc(alc, dim, .u64);
 901 |     defer dims_cast.release();
 902 |     var dims_buf = dims_cast.data.getBuffer(u64);
 903 |     return keepMeanAlloc(alc, x, dims_buf);
 904 | }
 905 | 
 906 | pub fn gatherExprAlloc(alc: *std.mem.Allocator, x: Tensor, dim: Tensor, index: Tensor) !Tensor {
 907 |     var dims_cast = try castAlloc(alc, dim, .u64);
 908 |     defer dims_cast.release();
 909 |     var index_cast = try castAlloc(alc, index, .u64);
 910 |     defer index_cast.release();
 911 |     return gatherAlloc(alc, x, dims_cast.data.getItem(u64), index_cast);
 912 | }
 913 | 
 914 | pub fn transposeAlloc(alc: *std.mem.Allocator, x: Tensor) !Tensor {
 915 |     return autogradAlloc(alc, &[_]Tensor{x}, &[_]Array{x.data}, 0, transposeForwardAlloc, transposeBackwardAlloc, null);
 916 | }
 917 | 
 918 | pub fn transposeForwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64) ForwardError!Array {
 919 |     return array.transposeAlloc(alc, inputs[0]);
 920 | }
 921 | 
 922 | pub fn transposeBackwardAlloc(alc: *std.mem.Allocator, inputs: []Array, extra_args_ptr: u64, output: Array, grad_inputs_out: []?Array, grad_output: Array) BackwardError!void {
 923 |     if (inputs.len != 1) {
 924 |         @panic("invalid number of inputs");
 925 |     }
 926 | 
 927 |     if (grad_inputs_out[0]) |grad| {
 928 |         array.transpose(grad_output, grad);
 929 |     }
 930 | }
 931 | 
 932 | test "transpose_gradcheck" {
 933 |     var a = try Tensor.allocWithRange(f64, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 0.0, 1.0, REQUIRES_GRAD);
 934 |     defer a.release();
 935 |     var inputs = [_]Tensor{a};
 936 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, transposeForwardAlloc, transposeBackwardAlloc, &inputs, 0));
 937 | }
 938 | 
 939 | pub fn autogradAlloc(alc: *std.mem.Allocator, inputs: []Tensor, input_arrays: []Array, extra_args_ptr: u64, forwardFn: ForwardFn, backwardFn: BackwardFn, deallocFn: ?DeallocFn) !Tensor {
 940 |     var output_array = try forwardFn(alc, input_arrays, extra_args_ptr);
 941 |     var requires_grad = false;
 942 |     for (inputs) |input| {
 943 |         if (input.requires_grad) {
 944 |             requires_grad = true;
 945 |         }
 946 |     }
 947 |     var flags: u64 = IS_BRANCH;
 948 |     if (requires_grad) {
 949 |         flags |= REQUIRES_GRAD;
 950 |     }
 951 |     var output = try Tensor.allocWithData(alc, output_array, flags);
 952 |     output_array.release();
 953 |     if (requires_grad) {
 954 |         var gr = try alc.create(GradientRecord);
 955 |         gr.* = try GradientRecord.alloc(alc, inputs, extra_args_ptr, output, backwardFn, deallocFn);
 956 |         output.grad_record = gr;
 957 |     } else {
 958 |         if (deallocFn) |dealloc| {
 959 |             dealloc(alc, extra_args_ptr);
 960 |         }
 961 |     }
 962 |     return output;
 963 | }
 964 | 
 965 | const RecordQueue = struct {
 966 |     const Queue = std.TailQueue(*GradientRecord);
 967 | 
 968 |     queue: Queue,
 969 |     alc: *std.mem.Allocator,
 970 | 
 971 |     const Self = @This();
 972 | 
 973 |     pub fn init(alc: *std.mem.Allocator) Self {
 974 |         return Self{ .queue = Queue{}, .alc = alc };
 975 |     }
 976 | 
 977 |     pub fn empty(self: *Self) bool {
 978 |         return self.queue.len == 0;
 979 |     }
 980 | 
 981 |     pub fn pushNode(self: *Self, grad_record: *GradientRecord) !void {
 982 |         var queue_node = try self.alc.create(Queue.Node);
 983 |         queue_node.data = grad_record;
 984 |         self.queue.append(queue_node);
 985 |     }
 986 | 
 987 |     pub fn popNode(self: *Self) *GradientRecord {
 988 |         var maybe_queue_node = self.queue.popFirst();
 989 |         if (maybe_queue_node) |queue_node| {
 990 |             var gr = queue_node.data;
 991 |             self.alc.destroy(queue_node);
 992 |             return gr;
 993 |         } else {
 994 |             @panic("attempted to dequeue from empty queue");
 995 |         }
 996 |     }
 997 | };
 998 | 
 999 | fn Counter(comptime T: type) type {
1000 |     return struct {
1001 |         map: std.AutoHashMap(T, u64),
1002 | 
1003 |         const Self = @This();
1004 | 
1005 |         fn init(alc: *std.mem.Allocator) !Self {
1006 |             return Self{ .map = std.AutoHashMap(T, u64).init(alc) };
1007 |         }
1008 | 
1009 |         fn incr(self: *Self, key: T) !u64 {
1010 |             var count: u64 = 1;
1011 |             if (self.map.get(key)) |c| {
1012 |                 count += c;
1013 |             }
1014 |             try self.map.put(key, count);
1015 |             return count;
1016 |         }
1017 | 
1018 |         fn decr(self: *Self, key: T) !u64 {
1019 |             var count: u64 = 0;
1020 |             if (self.map.get(key)) |c| {
1021 |                 count = c - 1;
1022 |             }
1023 |             try self.map.put(key, count);
1024 |             return count;
1025 |         }
1026 | 
1027 |         fn deinit(self: *Self) void {
1028 |             self.map.deinit();
1029 |         }
1030 |     };
1031 | }
1032 | 
1033 | fn toposort(alc: *std.mem.Allocator, root: *GradientRecord, records: []*GradientRecord) !void {
1034 |     var incoming_edge_counter = try Counter(*GradientRecord).init(std.testing.allocator);
1035 |     defer incoming_edge_counter.deinit();
1036 | 
1037 |     for (records) |rec| {
1038 |         for (rec.inputs) |input| {
1039 |             if (input.grad_record) |input_rec| {
1040 |                 _ = try incoming_edge_counter.incr(input_rec);
1041 |             }
1042 |         }
1043 |     }
1044 | 
1045 |     var sorted_records = std.ArrayList(*GradientRecord).init(std.testing.allocator);
1046 |     defer sorted_records.deinit();
1047 |     var q = RecordQueue.init(std.testing.allocator);
1048 |     try q.pushNode(root);
1049 |     while (!q.empty()) {
1050 |         var rec = q.popNode();
1051 |         try sorted_records.append(rec);
1052 |         for (rec.inputs) |input| {
1053 |             if (input.grad_record) |input_rec| {
1054 |                 var count = try incoming_edge_counter.decr(input_rec);
1055 |                 if (count == 0) {
1056 |                     try q.pushNode(input_rec);
1057 |                 }
1058 |             }
1059 |         }
1060 |     }
1061 |     if (sorted_records.items.len != records.len) {
1062 |         @panic("Failed to sort graph");
1063 |     }
1064 |     std.mem.copy(*GradientRecord, records, sorted_records.items);
1065 | }
1066 | 
1067 | pub fn backwardScalarAlloc(alc: *std.mem.Allocator, output: Tensor) !void {
1068 |     if (output.data.ndim != 0) {
1069 |         std.debug.panic("Expected scalar, got ndim {}", .{output.data.ndim});
1070 |     }
1071 |     var grad_output = try onesLikeAlloc(alc, output, NO_FLAGS);
1072 |     defer grad_output.release();
1073 |     return backwardAlloc(alc, output, grad_output);
1074 | }
1075 | 
1076 | pub fn backwardAlloc(alc: *std.mem.Allocator, output: Tensor, grad_output: Tensor) !void {
1077 |     if (output.grad_record == null) {
1078 |         return;
1079 |     }
1080 | 
1081 |     if (!std.mem.eql(u64, output.data.getShape(), grad_output.data.getShape())) {
1082 |         @panic("output shape does not match grad_output shape");
1083 |     }
1084 | 
1085 |     grad_output.data.retain();
1086 |     output.grad_record.?.grad_output = grad_output.data;
1087 |     var root = output.grad_record.?;
1088 | 
1089 |     // find all gradient records
1090 |     var seen = std.AutoHashMap(*GradientRecord, bool).init(std.testing.allocator);
1091 |     defer seen.deinit();
1092 |     var q = RecordQueue.init(std.testing.allocator);
1093 |     var records = std.ArrayList(*GradientRecord).init(std.testing.allocator);
1094 |     defer records.deinit();
1095 |     try q.pushNode(root);
1096 |     while (!q.empty()) {
1097 |         var rec = q.popNode();
1098 |         if (seen.get(rec) != null) {
1099 |             continue;
1100 |         }
1101 |         try records.append(rec);
1102 |         try seen.put(rec, true);
1103 |         for (rec.inputs) |input| {
1104 |             if (input.grad_record) |grad_record| {
1105 |                 try q.pushNode(grad_record);
1106 |             }
1107 |         }
1108 |     }
1109 | 
1110 |     // sort the records
1111 |     try toposort(std.testing.allocator, root, records.items);
1112 | 
1113 |     // perform backward pass
1114 |     for (records.items) |rec| {
1115 |         var inputs = try alc.alloc(Array, rec.inputs.len);
1116 |         defer alc.free(inputs);
1117 |         var grad_inputs = try alc.alloc(?Array, rec.inputs.len);
1118 |         defer alc.free(grad_inputs);
1119 |         for (grad_inputs) |_, i| {
1120 |             if (rec.inputs[i].requires_grad) {
1121 |                 grad_inputs[i] = try array.zerosLikeAlloc(alc, rec.inputs[i].data);
1122 |             } else {
1123 |                 grad_inputs[i] = null;
1124 |             }
1125 |             inputs[i] = rec.inputs[i].data;
1126 |         }
1127 |         try rec.backward_fn(alc, inputs, rec.extra_args_ptr, rec.output.data, grad_inputs, rec.grad_output.?);
1128 |         rec.grad_output.?.release();
1129 |         rec.grad_output = null;
1130 |         for (grad_inputs) |maybe_grad_input, i| {
1131 |             if (maybe_grad_input == null) {
1132 |                 continue;
1133 |             }
1134 | 
1135 |             var grad_input = maybe_grad_input.?;
1136 |             var input = rec.inputs[i];
1137 |             defer grad_input.release();
1138 | 
1139 |             if (!input.requires_grad) {
1140 |                 @panic("Input does not require grad but we created a grad input for it");
1141 |             }
1142 | 
1143 |             if (input.is_leaf) {
1144 |                 if (input.grad == null) {
1145 |                     @panic("missing grad buffer on leaf variable");
1146 |                 }
1147 |                 if (input.grad_record != null) {
1148 |                     @panic("leaf variable has grad record");
1149 |                 }
1150 |             } else {
1151 |                 if (input.grad_record == null) {
1152 |                     @panic("non-leaf tensor requires grad but has no grad record");
1153 |                 }
1154 |             }
1155 | 
1156 |             if (input.grad) |input_grad| {
1157 |                 array.plus(input_grad, grad_input, input_grad);
1158 |             }
1159 | 
1160 |             if (input.grad_record) |input_grad_record| {
1161 |                 // enqueue a node to run backward on it
1162 |                 // this node now owns the grad_input array
1163 |                 if (input_grad_record.grad_output) |gradout| {
1164 |                     // there's already a grad output on this node, accumulate into it
1165 |                     array.plus(gradout, grad_input, gradout);
1166 |                 } else {
1167 |                     // there's no grad output, put this value there
1168 |                     grad_input.retain();
1169 |                     input_grad_record.grad_output = grad_input;
1170 |                 }
1171 |             }
1172 |         }
1173 |     }
1174 | }
1175 | 
1176 | fn gradCheck(comptime T: type, alc: *std.mem.Allocator, forwardFn: ForwardFn, backwardFn: BackwardFn, inputs: []Tensor, extra_args_ptr: u64) !bool {
1177 |     const epsilon = 1e-6;
1178 |     const rtol = 1e-3;
1179 |     const atol = 1e-5;
1180 | 
1181 |     for (inputs) |input| {
1182 |         if (!input.data.is_contiguous) {
1183 |             @panic("contiguous inputs required");
1184 |         }
1185 |     }
1186 |     var input_arrays = try alc.alloc(Array, inputs.len);
1187 |     defer alc.free(input_arrays);
1188 |     for (inputs) |input, i| {
1189 |         input_arrays[i] = input.data;
1190 |     }
1191 |     var output = try forwardFn(alc, input_arrays, extra_args_ptr);
1192 |     defer output.release();
1193 | 
1194 |     for (inputs) |input, input_index| {
1195 |         if (input.requires_grad) {
1196 |             var fd_jacobian = try Array.allocWithValue(T, alc, &[_]u64{ output.numel, input.data.numel }, 0.0);
1197 |             defer fd_jacobian.release();
1198 | 
1199 |             // use finite differences to build up the jacobian column by column
1200 |             var input_elem_index: u64 = 0;
1201 |             while (input_elem_index < input.data.numel) : (input_elem_index += 1) {
1202 |                 var buf = input.data.getBuffer(T);
1203 |                 var val = buf[input_elem_index];
1204 |                 buf[input_elem_index] = val + epsilon;
1205 |                 var plus_output = try forwardFn(alc, input_arrays, extra_args_ptr);
1206 |                 defer plus_output.release();
1207 |                 buf[input_elem_index] = val - epsilon;
1208 |                 var minus_output = try forwardFn(alc, input_arrays, extra_args_ptr);
1209 |                 defer minus_output.release();
1210 |                 buf[input_elem_index] = val;
1211 |                 var diff = try array.minusAlloc(alc, plus_output, minus_output);
1212 |                 defer diff.release();
1213 |                 var divisor = try Array.allocWithValue(T, alc, &[_]u64{}, 2.0 * epsilon);
1214 |                 defer divisor.release();
1215 |                 var jacobian_column = try array.divideAlloc(alc, diff, divisor);
1216 |                 defer jacobian_column.release();
1217 |                 var fd_column = fd_jacobian.narrowView(&[_]u64{ 0, input_elem_index }, &[_]u64{ output.numel, 1 });
1218 |                 array.copy(jacobian_column.flatView(), fd_column.flatView());
1219 |             }
1220 | 
1221 |             // use our backward functions to build up the jacobian row by row
1222 |             var backward_jacobian = try Array.allocWithValue(T, alc, &[_]u64{ output.numel, input.data.numel }, 0.0);
1223 |             defer backward_jacobian.release();
1224 | 
1225 |             var output_elem_index: u64 = 0;
1226 |             while (output_elem_index < output.numel) : (output_elem_index += 1) {
1227 |                 var grad_inputs = try alc.alloc(?Array, inputs.len);
1228 |                 defer alc.free(grad_inputs);
1229 |                 for (grad_inputs) |_, i| {
1230 |                     grad_inputs[i] = try array.zerosLikeAlloc(alc, input_arrays[i]);
1231 |                 }
1232 |                 var grad_output = try array.zerosLikeAlloc(alc, output);
1233 |                 var buf = grad_output.getBuffer(T);
1234 |                 buf[output_elem_index] = 1.0;
1235 |                 defer grad_output.release();
1236 |                 try backwardFn(alc, input_arrays, extra_args_ptr, output, grad_inputs, grad_output);
1237 |                 var jacobian_row = grad_inputs[input_index].?;
1238 |                 array.copy(jacobian_row.flatView(), backward_jacobian.narrowView(&[_]u64{ output_elem_index, 0 }, &[_]u64{ 1, input.data.numel }).flatView());
1239 |                 for (grad_inputs) |maybe_grad_input| {
1240 |                     if (maybe_grad_input) |grad_input| {
1241 |                         grad_input.release();
1242 |                     }
1243 |                 }
1244 |             }
1245 | 
1246 |             if (!array.allclose(fd_jacobian, backward_jacobian, rtol, atol)) {
1247 |                 std.debug.print("jacobian mismatch\n", .{});
1248 |                 std.debug.print("fd_jacobian {}\n", .{fd_jacobian});
1249 |                 std.debug.print("backward_jacobian {}\n", .{backward_jacobian});
1250 |                 return false;
1251 |             }
1252 |         }
1253 |     }
1254 |     return true;
1255 | }
1256 | 
1257 | test "plus_gradcheck" {
1258 |     var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 1.0, REQUIRES_GRAD);
1259 |     defer a.release();
1260 |     var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 2.0, REQUIRES_GRAD);
1261 |     defer b.release();
1262 |     var inputs = [_]Tensor{ a, b };
1263 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, plusForwardAlloc, plusBackwardAlloc, &inputs, 0));
1264 | }
1265 | 
1266 | test "plus_grad" {
1267 |     var a = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, REQUIRES_GRAD);
1268 |     defer a.release();
1269 |     var b = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 2.0, REQUIRES_GRAD);
1270 |     defer b.release();
1271 |     var out = try plusAlloc(std.testing.allocator, a, b);
1272 |     defer out.release();
1273 |     var grad_out = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 4.0, 0);
1274 |     defer grad_out.release();
1275 |     try backwardAlloc(std.testing.allocator, out, grad_out);
1276 |     std.testing.expect(array.equal(a.grad.?, grad_out.data));
1277 |     std.testing.expect(array.equal(b.grad.?, grad_out.data));
1278 | }
1279 | 
1280 | test "plus_grad_multiple_levels" {
1281 |     var a = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, REQUIRES_GRAD);
1282 |     defer a.release();
1283 |     var b = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 2.0, REQUIRES_GRAD);
1284 |     defer b.release();
1285 |     var c = try plusAlloc(std.testing.allocator, a, b);
1286 |     defer c.release();
1287 |     var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 2.0, REQUIRES_GRAD);
1288 |     defer d.release();
1289 |     var e = try plusAlloc(std.testing.allocator, c, d);
1290 |     defer e.release();
1291 |     var f = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
1292 |     defer f.release();
1293 |     try backwardAlloc(std.testing.allocator, e, f);
1294 |     std.testing.expect(array.equal(a.grad.?, f.data));
1295 |     std.testing.expect(array.equal(b.grad.?, f.data));
1296 | }
1297 | 
1298 | test "plus_grad_bcast" {
1299 |     var a = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, REQUIRES_GRAD);
1300 |     defer a.release();
1301 |     var b = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{4}, 2.0, REQUIRES_GRAD);
1302 |     defer b.release();
1303 |     var c = try plusAlloc(std.testing.allocator, a, b);
1304 |     defer c.release();
1305 |     var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
1306 |     defer d.release();
1307 |     try backwardAlloc(std.testing.allocator, c, d);
1308 |     std.testing.expect(array.equal(a.grad.?, d.data));
1309 |     var e = try Array.allocWithValue(f32, std.testing.allocator, &[_]u64{4}, 18.0);
1310 |     defer e.release();
1311 |     std.testing.expect(array.equal(b.grad.?, e));
1312 | }
1313 | 
1314 | test "plus_no_grad" {
1315 |     var a = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, 0);
1316 |     defer a.release();
1317 |     var b = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 2.0, 0);
1318 |     defer b.release();
1319 |     var c = try plusAlloc(std.testing.allocator, a, b);
1320 |     defer c.release();
1321 |     var d = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
1322 |     defer d.release();
1323 |     try backwardAlloc(std.testing.allocator, c, d);
1324 |     std.testing.expect(a.grad == null);
1325 |     std.testing.expect(b.grad == null);
1326 | }
1327 | 
1328 | test "minus_gradcheck" {
1329 |     {
1330 |         var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 1, 3 }, 1.0, REQUIRES_GRAD);
1331 |         defer a.release();
1332 |         var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 2.0, REQUIRES_GRAD);
1333 |         defer b.release();
1334 |         var inputs = [_]Tensor{ a, b };
1335 |         std.testing.expect(try gradCheck(f64, std.testing.allocator, minusForwardAlloc, minusBackwardAlloc, &inputs, 0));
1336 |     }
1337 | 
1338 |     {
1339 |         var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{3}, 1.0, REQUIRES_GRAD);
1340 |         defer a.release();
1341 |         var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 2.0, REQUIRES_GRAD);
1342 |         defer b.release();
1343 |         var inputs = [_]Tensor{ a, b };
1344 |         std.testing.expect(try gradCheck(f64, std.testing.allocator, minusForwardAlloc, minusBackwardAlloc, &inputs, 0));
1345 |     }
1346 | }
1347 | 
1348 | test "times_gradcheck" {
1349 |     var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{3}, 2.0, REQUIRES_GRAD);
1350 |     defer a.release();
1351 |     var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 3.0, REQUIRES_GRAD);
1352 |     defer b.release();
1353 |     var inputs = [_]Tensor{ a, b };
1354 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, timesForwardAlloc, timesBackwardAlloc, &inputs, 0));
1355 | }
1356 | 
1357 | test "divide_gradcheck" {
1358 |     var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{3}, 2.0, REQUIRES_GRAD);
1359 |     defer a.release();
1360 |     var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 3.0, REQUIRES_GRAD);
1361 |     defer b.release();
1362 |     var inputs = [_]Tensor{ a, b };
1363 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, divideForwardAlloc, divideBackwardAlloc, &inputs, 0));
1364 | }
1365 | 
1366 | test "power_gradcheck" {
1367 |     var a = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{3}, 2.0, REQUIRES_GRAD);
1368 |     defer a.release();
1369 |     var b = try Tensor.allocWithValue(f64, std.testing.allocator, &[_]u64{ 2, 3 }, 3.0, REQUIRES_GRAD);
1370 |     defer b.release();
1371 |     var inputs = [_]Tensor{ a, b };
1372 |     std.testing.expect(try gradCheck(f64, std.testing.allocator, powerForwardAlloc, powerBackwardAlloc, &inputs, 0));
1373 | }
1374 | 
1375 | fn getDType(t: Tensor) DType {
1376 |     return t.data.dtype;
1377 | }
1378 | 
1379 | pub fn binaryNotImplemented(alc: *std.mem.Allocator, x: Tensor, y: Tensor) !Tensor {
1380 |     @panic("operation not implemented");
1381 | }
1382 | 
1383 | pub fn unaryNotImplemented(alc: *std.mem.Allocator, x: Tensor) !Tensor {
1384 |     @panic("operation not implemented");
1385 | }
1386 | 
1387 | pub fn scalarNotImplemented(alc: *std.mem.Allocator, dtype: DType, value: f64) !Tensor {
1388 |     @panic("scalar not implemented");
1389 | }
1390 | 
1391 | pub fn expr(alc: *std.mem.Allocator, comptime exp: []const u8, args: anytype) !Tensor {
1392 |     comptime var opsTable = array.OpsTable(Tensor){
1393 |         .plus = plusAlloc,
1394 |         .minus = minusAlloc,
1395 |         .uplus = uplusAlloc,
1396 |         .uminus = uminusAlloc,
1397 |         .times = timesAlloc,
1398 |         .mtimes = mtimesAlloc,
1399 |         .divide = divideAlloc,
1400 |         .mdivide = binaryNotImplemented,
1401 |         .power = powerAlloc,
1402 |         .mpower = binaryNotImplemented,
1403 |         .eq = binaryNotImplemented,
1404 |         .gt = binaryNotImplemented,
1405 |         .gte = binaryNotImplemented,
1406 |         .lt = binaryNotImplemented,
1407 |         .lte = binaryNotImplemented,
1408 |         .transpose = transposeAlloc,
1409 |         .ctranspose = transposeAlloc,
1410 |         .scalar = scalarAlloc,
1411 |         .cast = castAlloc,
1412 |         .detach = detachAlloc,
1413 |         .log = logAlloc,
1414 |         .log2 = log2Alloc,
1415 |         .exp = expAlloc,
1416 |         .max = maxAlloc,
1417 |         .reduce_sum = reduceSumExprAlloc,
1418 |         .keep_sum = keepSumExprAlloc,
1419 |         .reduce_max = reduceMaxExprAlloc,
1420 |         .keep_max = keepMaxExprAlloc,
1421 |         .reduce_mean = reduceMeanExprAlloc,
1422 |         .keep_mean = keepMeanExprAlloc,
1423 |         .reduce_arg_max = binaryNotImplemented,
1424 |         .keep_arg_max = binaryNotImplemented,
1425 |         .gather = gatherExprAlloc,
1426 |         .get_dtype = getDType,
1427 |     };
1428 |     return try array.genericExpr(Tensor, opsTable, alc, exp, args);
1429 | }
1430 | 
1431 | test "expr" {
1432 |     {
1433 |         const a_data = try Array.allocWithRange(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, 1.0);
1434 |         defer a_data.release();
1435 |         const b_data = try Array.allocWithRange(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 1.0, 1.0);
1436 |         defer b_data.release();
1437 |         const a = try Tensor.allocWithData(std.testing.allocator, a_data, REQUIRES_GRAD);
1438 |         defer a.release();
1439 |         const b = try Tensor.allocWithData(std.testing.allocator, b_data, REQUIRES_GRAD);
1440 |         defer b.release();
1441 |         var c = try expr(std.testing.allocator, "a + b", .{ .a = a, .b = b });
1442 |         defer c.release();
1443 |         var d = try plusAlloc(std.testing.allocator, a, b);
1444 |         defer d.release();
1445 |         std.testing.expect(array.equal(c.data, d.data));
1446 |         var e = try Tensor.allocWithValue(f32, std.testing.allocator, &[_]u64{ 2, 3, 4 }, 3.0, 0);
1447 |         defer e.release();
1448 |         try backwardAlloc(std.testing.allocator, c, e);
1449 |         std.testing.expect(array.equal(a.grad.?, e.data));
1450 |         std.testing.expect(array.equal(b.grad.?, e.data));
1451 |     }
1452 | }


--------------------------------------------------------------------------------