├── .gitignore
├── cpp
│   ├── stl
│   │   ├── README.md
│   │   ├── Makefile
│   │   └── main.cpp
│   └── xtensor
│       ├── README.md
│       ├── CMakeLists.txt
│       └── src
│           └── main.cpp
├── README.md
├── julia
│   └── danisch
│       ├── benchmark.jl
│       └── solution.jl
└── nim
    └── nim_sol_mratsim.nim

/.gitignore:
--------------------------------------------------------------------------------
# For binaries while testing/benchmarking
build/
--------------------------------------------------------------------------------
/cpp/stl/README.md:
--------------------------------------------------------------------------------
# C++ submission to the Julia challenge

See the blog article by @wolfv:

https://medium.com/@wolfv/the-julia-challenge-in-c-21272d36c002
--------------------------------------------------------------------------------
/cpp/stl/Makefile:
--------------------------------------------------------------------------------
main.o : main.cpp
	$(CXX) -O3 -march=native -mtune=native -std=c++17 main.cpp -o main.o
clean :
	rm main.o
benchmark : main.o
	./main.o

--------------------------------------------------------------------------------
/cpp/xtensor/README.md:
--------------------------------------------------------------------------------
# C++ - xtensor submission to the Julia challenge

This requires xtensor, xsimd and a C++14 compiler.

```
mkdir build
cd build
cmake ..
make
./julia_challenge
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Julia Challenge

Repository to collect the code for:
https://nextjournal.com/sdanisch/the-julia-challenge


A submission should look like the following:

* place it into `language_name/authorname/solution.ext + benchmark.ext`
* a solution should implement an n-dimensional, n-argument lazy [broadcast](https://julia.guide/broadcasting) from scratch
* lazy means that one can aggregate recursive calls to a broadcasting operation and decide when and how to materialize the result (see the usage sketch below)
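To make the "lazy" requirement concrete, here is a minimal usage sketch of the reference Julia solution in `julia/danisch/`. It is only a sketch: the `include` path and the small array sizes are illustrative, while `@broadcast` and `materialize!` are the definitions from `solution.jl`.

```julia
include("julia/danisch/solution.jl")   # defines LazyBroadcast, @broadcast, materialize!

a = rand(5, 4)
b = rand(5)      # broadcasts along the first dimension of `a`
c = 1.0          # scalars are passed through unchanged

br = @broadcast a + b - sin(c)   # nothing is computed yet, `br` is a LazyBroadcast
out = similar(a)
materialize!(out, br)            # element-wise evaluation happens here
@assert out ≈ a .+ b .- sin(c)   # agrees with Julia's built-in broadcasting

sum(br)   # reductions can consume the lazy expression directly, as in benchmark.jl
```

Deciding when and how to materialize is exactly what a submission has to expose; `benchmark.jl` times both the materializing path and the reducing path.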
--------------------------------------------------------------------------------
/cpp/xtensor/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.1)

# Project Julia Challenge
project(julia-challenge)

# Require xtensor and xsimd
find_package(xtensor 0.18.1 REQUIRED)
find_package(xsimd 7.0.0 REQUIRED)

# Force build type to Release
message(STATUS "Forcing build type to Release")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)

# Compilation flags (march=native and the c++14 flag)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -std=c++14")
endif()

# Sources
set(SOURCES src/main.cpp)

# Enable XSIMD acceleration
add_definitions(-DXTENSOR_USE_XSIMD)

# Setup executable
set(CHALLENGE_TARGET julia_challenge)
add_executable(${CHALLENGE_TARGET} ${SOURCES})
target_link_libraries(${CHALLENGE_TARGET} xtensor)
--------------------------------------------------------------------------------
/julia/danisch/benchmark.jl:
--------------------------------------------------------------------------------
# Benchmarks

using BenchmarkTools

reference(out, a, b, c) = (out .= a .+ b .- sin.(c))

a = rand(1000, 1000);
b = rand(1000);
c = 1.0
out = similar(a);
br = @broadcast a + b - sin(c)

@btime materialize!($out, $br)
@btime reference($out, $a, $b, $c)

# Any library with N-vectors that specialize on the length will do!
using GeometryTypes
const Point3 = Point{3, Float32}

# the function needs to come from a different library
module LibraryB
# no odd stuff, no functors, no special lambda expression!
# this function needs to be a normal language function
# as can be found in the wild
super_custom_func(a, b) = sqrt(sum(a .* b))
end
# emulate that the function comes from a different library
using .LibraryB: super_custom_func

using BenchmarkTools

a = rand(Point3, 10^6)
b = rand(Point3, 10^6)
out = fill(0f0, 10^6)

@btime $out .= super_custom_func.($a, $b)
br = @broadcast super_custom_func(a, b)
@btime materialize!($out, $br)

@btime sum($br)
@btime sum($a .+ $b .- sin.($c))
--------------------------------------------------------------------------------
/julia/danisch/solution.jl:
--------------------------------------------------------------------------------
import Base: getindex, iterate, axes, eachindex, tail, @propagate_inbounds
struct LazyBroadcast{F, Args}
    f::F
    args::Args
end
@propagate_inbounds function br_getindex(A::AbstractArray, I)
    idx = ntuple(i-> ifelse(size(A, i) === 1, 1, I[i]), Val(ndims(A)))
    return A[CartesianIndex(idx)]
end
br_getindex(scalar, I) = scalar # scalars don't need to be indexed
@propagate_inbounds function br_getindex(x::LazyBroadcast, I)
    # this could be a map, but the current map in 1.0 has a perf problem
    return x.f(getindex_arg(x.args, I)...)
end
getindex_arg(args::Tuple{}, I) = () # recursion anchor
@propagate_inbounds function getindex_arg(args::NTuple{N, Any}, I) where N
    return (br_getindex(args[1], I), getindex_arg(tail(args), I)...)
end
@propagate_inbounds getindex(x::LazyBroadcast, I) = br_getindex(x, Tuple(I))
function materialize!(out::AbstractArray, x::LazyBroadcast)
    # an n-dimensional SIMD-accelerated loop
    @simd for i in CartesianIndices(axes(out))
        @inbounds out[i] = x[i]
    end
    return out
end
br_construct(x) = x
function br_construct(x::Expr)
    x.args .= br_construct.(x.args) # apply recursively
    if Meta.isexpr(x, :call) # replace calls to construct LazyBroadcasts
        x = :(LazyBroadcast($(x.args[1]), ($(x.args[2:end]...),)))
    end
    x
end
# macro to enable the syntax @broadcast a + b - sin(c) to construct our type
macro broadcast(call_expr)
    esc(br_construct(call_expr))
end
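# A concrete illustration (comment only, hygiene/escaping omitted): on an
# expression like `a + b - sin(c)`, `br_construct` rewrites every call node,
# so `@broadcast a + b - sin(c)` turns into
#     LazyBroadcast(-, (LazyBroadcast(+, (a, b)), LazyBroadcast(sin, (c,))))
# i.e. a lazy call tree that is only evaluated element-wise via `getindex`.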
# Simplified implementation to take the axes of the array with the largest
# dimensionality (axes -> the range an array iterates over)
biggest(a, b, c, rest...) = biggest(biggest(a, b), biggest(c, rest...))
biggest(a::NTuple{N1, Any}, b::NTuple{N2, Any}) where {N1, N2} =
    ifelse(N1 > N2, a, b)
biggest(a) = a
flatten_args(t::LazyBroadcast, rest...) =
    (flatten_args(t.args...)..., flatten_args(rest...)...)
flatten_args(t::Any, rest...) = (t, flatten_args(rest...)...)
flatten_args() = ()
# the indexing axes of our array
axes(br::LazyBroadcast) = biggest(map(axes, flatten_args(br))...)
# lazy view that can be used to index over all elements in br
eachindex(br::LazyBroadcast) = CartesianIndices(axes(br))
iterate(br::LazyBroadcast) = iterate(br, (eachindex(br),))
@propagate_inbounds function iterate(bc::LazyBroadcast, s)
    y = iterate(s...)
    y === nothing && return nothing
    i, newstate = y
    return (bc[i], (s[1], newstate))
end

--------------------------------------------------------------------------------
/cpp/xtensor/src/main.cpp:
--------------------------------------------------------------------------------
#include
#include
#include
#include
#include

#include
#include
#include
#include
#include

// Simple stopwatch object
class stopwatch
{
public:

    stopwatch() : m_start(clock_type::now())
    {
    }

    void reset()
    {
        m_start = clock_type::now();
    }

    std::size_t elapsed() const
    {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(clock_type::now() - m_start).count();
    }

private:

    typedef std::chrono::high_resolution_clock clock_type;
    std::chrono::time_point<clock_type> m_start;
};

template <class T>
using xpoint = xt::xtensor_fixed<T, xt::xshape<3>>;

template <class T>
auto sum_xpoint(const xpoint<T>& t)
{
    return std::accumulate(t.cbegin(), t.cend(), T());
}

// Simple test program
int main()
{
    // First Benchmark:
    //
    // a[1000x1000] + b[1000] - sin(c[])
    {
        xt::xtensor<double, 2> a = xt::random::rand<double>({1000, 1000});
        xt::xtensor<double, 1> b = xt::random::rand<double>({1000});
        double c = 1.0;

        // Un-evaluated broadcasting expression
        auto expr = a + b - std::sin(c);
        auto res = xt::xtensor<double, 2>::from_shape({1000, 1000});

        // Benchmark loop
        std::cout << "Benchmarking a[1000x1000] + b[1000] - sin(c[])" << std::endl;
        std::size_t min_time = 100000000;
        for (int i = 0; i < 200; ++i)
        {
            stopwatch timer;         // Create timer
            xt::noalias(res) = expr; // Evaluate the expression.
            auto elapsed = timer.elapsed(); // Nanoseconds
            if (elapsed < min_time)
            {
                min_time = elapsed;
            }
        }

        // Output results
        std::cout << "MIN TIME: " << min_time << " ns" << std::endl;
        std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl;
        std::cout << std::endl << std::endl;
    }

    // Second Benchmark:
    //
    // std::sqrt(sum(a * b));
    {
        constexpr std::size_t psz = 1000000;
        auto px = xt::xtensor<xpoint<float>, 1>({psz}, {0.5f, 2.1f, 3.2f}),
             py = xt::xtensor<xpoint<float>, 1>({psz}, {0.5f, 2.1f, 3.2f});
        auto res = xt::xtensor<float, 1>({psz});
        auto sum = xt::vectorize(sum_xpoint<float>);

        // Un-evaluated broadcasting expression
        auto expr = xt::sqrt(sum(px * py));

        // Benchmark loop
        std::cout << "Benchmarking sqrt(sum(a * b))" << std::endl;
        std::size_t min_time = 100000000;
        for (int i = 0; i < 200; ++i)
        {
            stopwatch timer;         // Create timer
            xt::noalias(res) = expr; // Evaluate the expression.
            auto elapsed = timer.elapsed(); // Nanoseconds
            if (elapsed < min_time)
            {
                min_time = elapsed;
            }
        }

        // Output results
        std::cout << "MIN TIME: " << min_time << " ns" << std::endl;
        std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl;
        std::cout << std::endl << std::endl;
    }
}

--------------------------------------------------------------------------------
/nim/nim_sol_mratsim.nim:
--------------------------------------------------------------------------------
# MIT License
# Copyright (c) 2018 Mamy André-Ratsimbazafy

## This file gives basic tensor library functionality, because yes we can
import strformat, macros, sequtils, random

type
  Tensor[Rank: static[int], T] = object
    ## Tensor data structure stored on Cpu
    ##   - ``shape``: Dimensions of the tensor
    ##   - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ##   - ``offset``: Offset to get the first item of the tensor. Note: offset can be negative, in particular for slices.
    ##   - ``storage``: A data storage for the tensor
    ##   - Rank is part of the type for optimization purposes
    ##
    ## Warning ⚠:
    ##   Assignment ```var a = b``` does not copy the data. Data modification on one tensor will be reflected on the other.
    ##   However modification on metadata (shape, strides or offset) will not affect the other tensor.
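    ##
    ## A concrete illustration (example values, matching the row-major strides
    ## computed in the `tensor` template below): a tensor of shape [2, 3] gets
    ## strides [3, 1], so element [i, j] is read from storage.data[offset + 3*i + j].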
19 | shape: array[Rank, int] 20 | strides: array[Rank, int] 21 | offset: int 22 | storage: CpuStorage[T] 23 | 24 | CpuStorage*{.shallow.}[T] = object 25 | ## Data storage for the tensor, copies are shallow by default 26 | data*: seq[T] 27 | 28 | template tensor(result: var Tensor, shape: array) = 29 | result.shape = shape 30 | 31 | var accum = 1 32 | for i in countdown(Rank - 1, 0): 33 | result.strides[i] = accum 34 | accum *= shape[i] 35 | 36 | func newTensor*[Rank: static[int], T](shape: array[Rank, int]): Tensor[Rank, T] = 37 | tensor(result, shape) 38 | result.storage.data = newSeq[T](shape.product) 39 | 40 | proc rand[T: object|tuple](max: T): T = 41 | ## A generic random function for any stack object or tuple 42 | ## that initialize all fields randomly 43 | result = max 44 | for field in result.fields: 45 | field = rand(field) 46 | 47 | proc randomTensor*[Rank: static[int], T](shape: array[Rank, int], max: T): Tensor[Rank, T] = 48 | tensor(result, shape) 49 | result.storage.data = newSeqWith(shape.product, T(rand(max))) 50 | 51 | func getIndex[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): int {.inline.} = 52 | ## Convert [i, j, k, l ...] to the memory location referred by the index 53 | result = t.offset 54 | for i in 0 ..< t.Rank: 55 | {.unroll.} # I'm sad this doesn't work yet 56 | result += t.strides[i] * idx[i] 57 | 58 | func `[]`[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): T {.inline.}= 59 | ## Index tensor 60 | t.storage.data[t.getIndex(idx)] 61 | 62 | func `[]=`[Rank, T](t: var Tensor[Rank, T], idx: array[Rank, int], val: T) {.inline.}= 63 | ## Index tensor 64 | t.storage.data[t.getIndex(idx)] = val 65 | 66 | template `[]`[T: SomeNumber](x: T, idx: varargs[int]): T = 67 | ## "Index" scalars 68 | x 69 | 70 | func shape(x: SomeNumber): array[1, int] = [1] 71 | 72 | func bcShape[R1, R2: static[int]](x: array[R1, int]; y: array[R2, int]): auto = 73 | when R1 > R2: 74 | result = x 75 | for i, idx in result.mpairs: 76 | if idx == 1 and y[i] != 1: 77 | idx = y[i] 78 | else: 79 | result = y 80 | for i, idx in result.mpairs: 81 | if idx == 1 and x[i] != 1: 82 | idx = x[i] 83 | 84 | macro getBroadcastShape(x: varargs[typed]): untyped = 85 | assert x.len >= 2 86 | result = nnkDotExpr.newTree(x[0], ident"shape") 87 | for i in 1 ..< x.len: 88 | let xi = x[i] 89 | result = quote do: bcShape(`result`, `xi`.shape) 90 | 91 | func bc[R1, R2: static[int], T](t: Tensor[R1, T], shape: array[R2, int]): Tensor[R2, T] = 92 | ## Broadcast tensors 93 | result.shape = shape 94 | for i in 0 ..< R1: 95 | if t.shape[i] == 1 and shape[i] != 1: 96 | result.strides[i] = 0 97 | else: 98 | result.strides[i] = t.strides[i] 99 | if t.shape[i] != result.shape[i]: 100 | raise newException(ValueError, "The broadcasted size of the tensor must match existing size for non-singleton dimension") 101 | result.offset = t.offset 102 | result.storage = t.storage 103 | 104 | func bc[Rank; T: SomeNumber](x: T, shape: array[Rank, int]): T {.inline.}= 105 | ## "Broadcast" scalars 106 | x 107 | 108 | func product(x: varargs[int]): int = 109 | result = 1 110 | for val in x: result *= val 111 | 112 | proc replaceNodes(ast: NimNode, values: NimNode, containers: NimNode): NimNode = 113 | # Args: 114 | # - The full syntax tree 115 | # - an array of replacement value 116 | # - an array of identifiers to replace 117 | proc inspect(node: NimNode): NimNode = 118 | case node.kind: 119 | of {nnkIdent, nnkSym}: 120 | for i, c in containers: 121 | if node.eqIdent($c): 122 | return values[i] 123 | return node 124 | of nnkEmpty: 
return node 125 | of nnkLiterals: return node 126 | else: 127 | var rTree = node.kind.newTree() 128 | for child in node: 129 | rTree.add inspect(child) 130 | return rTree 131 | result = inspect(ast) 132 | 133 | proc pop*(tree: var NimNode): NimNode = 134 | ## varargs[untyped] consumes all arguments so the actual value should be popped 135 | ## https://github.com/nim-lang/Nim/issues/5855 136 | result = tree[tree.len-1] 137 | tree.del(tree.len-1) 138 | 139 | func nb_elems[N: static[int], T](x: typedesc[array[N, T]]): static[int] = 140 | N 141 | 142 | macro broadcastImpl(output: untyped, inputs_body: varargs[untyped]): untyped = 143 | ## If output is empty node it will return a value 144 | ## otherwise, result will be assigned in-place to output 145 | let 146 | in_place = newLit output.kind != nnkEmpty 147 | 148 | var 149 | inputs = inputs_body 150 | body = inputs.pop() 151 | 152 | let 153 | shape = genSym(nskLet, "broadcast_shape__") 154 | coord = genSym(nskVar, "broadcast_coord__") 155 | 156 | var doBroadcast = newStmtList() 157 | var bcInputs = nnkArgList.newTree() 158 | for input in inputs: 159 | let broadcasted = genSym(nskLet, "broadcast_" & $input & "__") 160 | doBroadcast.add newLetStmt( 161 | broadcasted, 162 | newCall(ident"bc", input, shape) 163 | ) 164 | bcInputs.add nnkBracketExpr.newTree(broadcasted, coord) 165 | 166 | body = body.replaceNodes(bcInputs, inputs) 167 | 168 | result = quote do: 169 | block: 170 | let `shape` = getBroadcastShape(`inputs`) 171 | const rank = `shape`.type.nb_elems 172 | var `coord`: array[rank, int] # Current coordinates in the n-dimensional space 173 | `doBroadcast` 174 | 175 | when not `in_place`: 176 | var output = newTensor[rank, type(`body`)](`shape`) 177 | else: 178 | assert `output`.shape == `shape` 179 | 180 | for _ in 0 ..< `shape`.product: 181 | # Assign for the current iteration 182 | when not `in_place`: 183 | output[`coord`] = `body` 184 | else: 185 | `output`[`coord`] = `body` 186 | 187 | # Compute the next position 188 | for k in countdown(rank - 1, 0): 189 | if `coord`[k] < `shape`[k] - 1: 190 | `coord`[k] += 1 191 | break 192 | else: 193 | `coord`[k] = 0 194 | 195 | # Now return the value 196 | when not `in_place`: 197 | output 198 | 199 | macro broadcast(inputs_body: varargs[untyped]): untyped = 200 | getAST(broadcastImpl(newEmptyNode(), inputs_body)) 201 | 202 | macro materialize(output: var Tensor, inputs_body: varargs[untyped]): untyped = 203 | getAST(broadcastImpl(output, inputs_body)) 204 | 205 | ################################################################################# 206 | 207 | import math 208 | proc sanityChecks() = 209 | # Sanity checks 210 | 211 | let x = randomTensor([1, 2, 3], 10) 212 | let y = randomTensor([5, 2], 10) 213 | 214 | echo x # (shape: [1, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1, 10, 5, 5, 7, 3])) 215 | echo y # (shape: [5, 2], strides: [2, 1], offset: 0, storage: (data: @[8, 3, 7, 9, 3, 8, 5, 3, 7, 1])) 216 | 217 | block: # Simple assignation 218 | echo "\nSimple assignation" 219 | let a = broadcast(x, y): 220 | x * y 221 | 222 | echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[8, 80, 40, 15, 21, 9, 7, 70, 35, 45, 63, 27, 3, 30, 15, 40, 56, 24, 5, 50, 25, 15, 21, 9, 7, 70, 35, 5, 7, 3])) 223 | 224 | block: # In-place, similar to Julia impl 225 | echo "\nIn-place, similar to Julia impl" 226 | var a = newTensor[3, int]([5, 2, 3]) 227 | materialize(a, x, y): 228 | x * y 229 | 230 | echo a 231 | 232 | block: # Complex multi statement with type conversion 233 
| echo "\nComplex multi statement with type conversion" 234 | let a = broadcast(x, y): 235 | let c = cos x.float64 236 | let s = sin y.float64 237 | 238 | sqrt(c.pow(2) + s.pow(2)) 239 | 240 | echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1.12727828058919, 1.297255090978019, 1.029220081237957, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.5003057878335346, 0.859191628789455, 1.072346394223034, 0.5584276483137685, 0.8508559734652587, 0.3168265963213802, 1.029220081237957, 1.243864280886628, 1.399612404734566, 1.100664502137075, 1.274196529364651, 1.0, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.8879964266455946, 1.129797339073468, 1.299291561428286])) 241 | 242 | block: # Variadic number of types with proc declaration inside 243 | echo "\nVariadic number of types with proc declaration inside" 244 | var u, v, w, x, y, z = randomTensor([3, 3], 10) 245 | 246 | let c = 2 247 | 248 | let a = broadcast(u, v, w, x, y, z): 249 | # ((u * v * w) div c) mod (if not zero (x - y + z) else 42) 250 | 251 | proc ifNotZero(val, default: int): int = 252 | if val == 0: default 253 | else: val 254 | 255 | let uvw_divc = u * v * w div c 256 | let xmypz = x - y + z 257 | 258 | uvw_divc mod ifNotZero(xmypz, 42) 259 | 260 | echo a # (shape: [3, 3], strides: [3, 1], offset: 0, storage: (data: @[0, 0, 0, 7, 4, 0, 0, 2, 0])) 261 | 262 | block: # Simple broadcasted addition test 263 | echo "\nSimple broadcasted addition test" 264 | var a = newTensor[2, int]([2, 3]) 265 | a.storage.data = @[3, 2, 1, 1, 2, 3] # Ideally we should have arrays of arrays -> tensor conversion 266 | var b = newTensor[2, int]([1, 3]) 267 | b.storage.data = @[1, 2, 3] 268 | 269 | let c = broadcast(a, b): a + b 270 | doAssert c.storage.data == @[4, 4, 4, 2, 4, 6] 271 | echo "✓ Passed" 272 | 273 | ################################################################################# 274 | 275 | import math, random, times, stats, strformat 276 | proc mainBench(nb_samples: int) = 277 | ## Bench with standard lib 278 | block: # Warmup - make sure cpu is on max perf 279 | let start = cpuTime() 280 | var foo = 123 281 | for i in 0 ..< 100_000_000: 282 | foo += i*i mod 456 283 | foo = foo mod 789 284 | 285 | # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects 286 | let stop = cpuTime() 287 | echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)" 288 | 289 | let 290 | a = randomTensor([1000, 1000], 1.0) 291 | b = randomTensor([1000], 1.0) 292 | c = 1.0 293 | var output = newTensor[2, float64](a.shape) 294 | 295 | block: # Actual bench 296 | var stats: RunningStat 297 | for _ in 0 ..< nb_samples: 298 | let start = cpuTime() 299 | materialize(output, a, b, c): 300 | a + b - sin c 301 | let stop = cpuTime() 302 | stats.push stop - start 303 | 304 | echo &"\nTensors of Float64 bench" 305 | echo &"Collected {stats.n} samples" 306 | echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms" 307 | echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms" 308 | echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms" 309 | echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms" 310 | echo "\nDisplay output[[0,0]] to make sure it's not optimized away" 311 | echo output[[0, 0]] 312 | 313 | proc geometryBench(nb_samples: int) = 314 | type Point3 = object 315 | x, y, z: float32 316 | 317 | template 
liftBinary(op: untyped): untyped = 318 | func `op`(a, b: Point3): Point3 {.inline.}= 319 | result.x = `op`(a.x, b.x) 320 | result.y = `op`(a.y, b.y) 321 | result.z = `op`(a.z, b.z) 322 | func `op`(a: Point3, b: float32): Point3 {.inline.}= 323 | result.x = `op`(a.x, b) 324 | result.y = `op`(a.y, b) 325 | result.z = `op`(a.z, b) 326 | template liftReduce(opName, op: untyped): untyped = 327 | func `opName`(a: Point3): float32 {.inline.}= 328 | a.x.`op`(a.y).`op`(a.z) 329 | 330 | liftBinary(`+`) 331 | liftBinary(`*`) 332 | liftBinary(`-`) 333 | liftReduce(sum, `+`) 334 | 335 | let 336 | a = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100)) 337 | b = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100)) 338 | c = 1.0'f32 # Julia has Point3 has float32 but C has float64 339 | var output = newTensor[1, float32](a.shape) 340 | 341 | block: # Custom function sqrt(sum(a .* b)) 342 | func super_custom_func(a, b: Point3): float32 = sqrt sum(a * b) 343 | 344 | var stats: RunningStat 345 | for _ in 0 ..< nb_samples: 346 | let start = cpuTime() 347 | materialize(output, a, b): 348 | super_custom_func(a, b) 349 | let stop = cpuTime() 350 | stats.push stop - start 351 | 352 | echo &"\nTensor of 3D float32 points bench" 353 | echo &"Collected {stats.n} samples" 354 | echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms" 355 | echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms" 356 | echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms" 357 | echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms" 358 | echo "\nDisplay output[[0]] to make sure it's not optimized away" 359 | echo output[[0]] 360 | 361 | when isMainModule: 362 | sanityChecks() 363 | echo "\n###################" 364 | echo "Benchmark" 365 | # {.passC: "-march=native" .} # uncomment to enable full optim (AVX/AVX2, ...) 366 | # randomize(seed = 0) 367 | mainBench(1_000) 368 | geometryBench(1_000) 369 | 370 | # Compile with 371 | # nim c -d:release nim/nim_sol_mratsim.nim # for binary only 372 | # nim c -r -d:release nim/nim_sol_mratsim.nim # for binary + running 373 | -------------------------------------------------------------------------------- /cpp/stl/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | ////////////////////////////////////////////////////////////////////////// 12 | // Helper Stuff 13 | ////////////////////////////////////////////////////////////////////////// 14 | 15 | template 16 | void print_container(T& container) { 17 | for (auto el : container) 18 | std::cout << el << ", "; 19 | std::cout << "\n"; 20 | } 21 | 22 | template 23 | void for_each_impl(F&& f, T&& tuple, std::index_sequence) { 24 | (void) std::initializer_list{ 25 | (f(std::get(std::forward(tuple))), void(), int{})... 
26 | }; 27 | } 28 | 29 | template 30 | void for_each(F&& f, T&& tuple) { 31 | constexpr std::size_t N = std::tuple_size>::value; 32 | for_each_impl(std::forward(f), std::forward(tuple), 33 | std::make_index_sequence{}); 34 | } 35 | 36 | class Timer 37 | { 38 | public: 39 | Timer() : beg_(clock_::now()) {} 40 | void reset() { beg_ = clock_::now(); } 41 | size_t elapsed() const { 42 | return std::chrono::duration_cast 43 | (clock_::now() - beg_).count(); } 44 | 45 | private: 46 | typedef std::chrono::high_resolution_clock clock_; 47 | std::chrono::time_point beg_; 48 | }; 49 | 50 | ////////////////////////////////////////////////////////////////////////// 51 | // Array class 52 | ////////////////////////////////////////////////////////////////////////// 53 | 54 | enum layout { row_major, col_major }; 55 | 56 | template 57 | auto recursive_for(const std::tuple& x, Args... args) { 58 | if constexpr (I == N) 59 | for (std::size_t i = 0; i < std::get<0>(x).shape[I]; ++i) 60 | std::get<0>(x)(args..., i); 61 | else 62 | for (std::size_t i = 0; i < std::get<0>(x).shape[I]; ++i) { 63 | if constexpr (sizeof...(X) > N) 64 | std::get(x)(args..., i); 65 | recursive_for(x, args..., i); 66 | } 67 | } 68 | 69 | template 70 | struct qiterator 71 | { 72 | using index_type = typename T::shape_type; 73 | 74 | qiterator(const T& parent) 75 | : m_ref(parent), m_size(m_ref.size()), m_index{0} { 76 | } 77 | 78 | qiterator& operator++() { 79 | m_linear_idx++; 80 | if constexpr (L == row_major) 81 | { 82 | for (std::size_t i = std::tuple_size_v; ++m_index[i - 1], i > 0; --i) 83 | { 84 | if (m_index[i - 1] == m_ref.shape[i - 1]) { m_index[i - 1] = 0; } 85 | else { return *this; } 86 | } 87 | } 88 | else 89 | { 90 | for (std::size_t i = 0; ++m_index[i], i < std::tuple_size_v; ++i) 91 | { 92 | if (m_index[i] == m_ref.shape[i]) { m_index[i] = 0; } 93 | else { return *this; } 94 | } 95 | } 96 | return *this; 97 | } 98 | 99 | template 100 | auto deref_impl(std::index_sequence) { return m_ref(std::get(m_index)...); } 101 | template 102 | const auto deref_impl(std::index_sequence) const { return m_ref(std::get(m_index)...); } 103 | 104 | auto operator*() { return deref_impl(std::make_index_sequence>{}); } 105 | const auto operator*() const { return deref_impl(std::make_index_sequence>{}); } 106 | 107 | bool operator==(const qiterator&) { return m_linear_idx == m_size; } 108 | bool operator!=(const qiterator& end) { return !(*this == end); } 109 | 110 | const T& m_ref; 111 | index_type m_index; 112 | std::size_t m_linear_idx = 0, m_size = 0; 113 | }; 114 | 115 | template 116 | void container_resize(T& container, std::size_t sz) { container.resize(sz); }; 117 | template 118 | void container_resize(std::array& container, std::size_t sz) {}; 119 | 120 | template 121 | class qfunction; 122 | 123 | template 124 | class simple_array_view { 125 | public: 126 | 127 | using self_type = simple_array_view; 128 | using shape_type = std::array; 129 | using container_type = std::decay_t; 130 | using value_type = std::decay_t()[0])>; 131 | using container_reference = std::decay_t&; 132 | 133 | simple_array_view() = default; 134 | 135 | auto compute_strides() 136 | { 137 | ptrdiff_t data_size = 1; 138 | if constexpr (N > 0) { 139 | if constexpr (L == row_major) { 140 | strides[N - 1] = shape[N - 1] != 1 ? 1 : 0; 141 | for (std::ptrdiff_t i = N - 1; i > 0; --i) { 142 | data_size *= static_cast(shape[i]); 143 | strides[i - 1] = shape[i - 1] != 1 ? 
data_size : 0; 144 | } 145 | data_size *= shape[0]; 146 | } 147 | else { 148 | for (std::size_t i = 0; i < N; ++i) { 149 | strides[i] = data_size; 150 | data_size = strides[i] * static_cast(shape[i]); 151 | if (shape[i] == 1) { strides[i] = 0; } 152 | } 153 | } 154 | } 155 | return data_size; 156 | } 157 | 158 | auto constexpr compute_offset() const { return ptrdiff_t(0); }; 159 | 160 | template 161 | auto constexpr compute_offset(Arg a1, Args... args) const { 162 | if constexpr (sizeof...(Args) + 1 > N) 163 | return compute_offset(args...); 164 | else { 165 | std::array idx({static_cast(a1), static_cast(args)...}); 166 | ptrdiff_t offset = 0; 167 | for (std::size_t i = 0; i < N; ++i) { 168 | offset += strides[i] * idx[i]; 169 | } 170 | return offset; 171 | } 172 | } 173 | 174 | explicit simple_array_view(value_type data, const std::array& i_shape) : shape(i_shape) { 175 | container_resize(memory, compute_strides()); 176 | std::fill(memory.begin(), memory.end(), data); 177 | compute_strides(); 178 | } 179 | 180 | explicit simple_array_view(CT data, const std::array& i_shape, 181 | const std::array& i_strides) : memory(data), shape(i_shape), strides(i_strides) 182 | { 183 | } 184 | 185 | template 186 | explicit simple_array_view(T&& data, const std::array& i_shape) : memory(std::forward(data)), shape(i_shape) { 187 | compute_strides(); 188 | } 189 | 190 | explicit simple_array_view(const std::array& i_shape) : shape(i_shape) { 191 | container_resize(memory, compute_strides()); 192 | } 193 | 194 | template 195 | void assign_impl(T&& rhs) { 196 | auto assign_func = make_qfunc([](auto& lhs, auto rhs) { lhs = rhs; }, *this, rhs); 197 | recursive_for::value - 1>(std::make_tuple(std::move(assign_func))); 198 | } 199 | 200 | template 201 | simple_array_view(const qfunction& e) : shape(e.shape) { 202 | container_resize(memory, compute_strides()); 203 | assign_impl(e); 204 | } 205 | 206 | template 207 | self_type& operator=(const qfunction& e) { 208 | if (!std::equal(shape.begin(), shape.end(), e.shape.begin())) { 209 | std::copy(e.shape.begin(), e.shape.end(), shape.begin()); 210 | container_resize(memory, compute_strides()); 211 | } 212 | assign_impl(e); 213 | return *this; 214 | } 215 | 216 | auto begin() { return qiterator(*this); } 217 | auto end() { return qiterator(*this); } 218 | 219 | void fill(value_type val) { std::fill(memory.begin(), memory.end(), val); } 220 | 221 | template 222 | value_type& operator()(Args... args) { return memory[compute_offset(args...)]; } 223 | 224 | template 225 | const value_type& operator()(Args... args) const { return memory[compute_offset(args...)]; } 226 | 227 | container_reference data() { return memory; } 228 | size_t size() const { return memory.size(); }; 229 | 230 | CT memory; 231 | std::array shape; 232 | std::array strides; 233 | }; 234 | 235 | template 236 | constexpr auto max_dim() { 237 | constexpr auto arr = std::array{std::tuple_size::value...}; 238 | return *std::max_element(arr.begin(), arr.end()); 239 | } 240 | 241 | //////////////////////////////////////////////////////////////////////////////// 242 | // Lazy function 243 | //////////////////////////////////////////////////////////////////////////////// 244 | 245 | template 246 | class qfunction 247 | { 248 | public: 249 | 250 | using shape_type = std::array::shape_type...>()>; 251 | 252 | template 253 | qfunction(F f, Args&&... args) 254 | : m_f(f), m_args(std::forward(args)...) 
255 | { 256 | std::fill(shape.begin(), shape.end(), 1); 257 | 258 | auto broadcast_shape = [this](const auto& v) constexpr { 259 | std::size_t offset = this->shape.size() - v.shape.size(); 260 | for (std::size_t i = 0; i < v.shape.size(); ++i) { 261 | if (this->shape[offset + i] == 1) 262 | this->shape[offset + i] = v.shape[i]; 263 | else 264 | if (v.shape[i] != this->shape[offset + i] && v.shape[i] != 1) 265 | throw std::runtime_error("Broadcast error."); 266 | } 267 | return true; 268 | }; 269 | 270 | for_each(broadcast_shape, m_args); 271 | } 272 | 273 | template 274 | auto access_impl(std::index_sequence, Args... args) const { 275 | return m_f(std::get(m_args)(args...)...); 276 | } 277 | 278 | template 279 | auto operator()(Args... args) const { 280 | return access_impl(std::make_index_sequence(), args...); 281 | } 282 | 283 | auto begin() { return qiterator(*this); } 284 | auto end() { return qiterator(*this); } 285 | auto begin() const { return qiterator(*this); } 286 | auto end() const { return qiterator(*this); } 287 | 288 | size_t size() const { return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>{}); } 289 | 290 | F m_f; 291 | shape_type shape; 292 | std::tuple m_args; 293 | }; 294 | 295 | template 296 | using qarray = simple_array_view, N>; 297 | 298 | template 299 | using qview = simple_array_view; 300 | 301 | template 302 | decltype(auto) wrap_if_scalar(T&& arg) 303 | { 304 | if constexpr (std::is_scalar_v>) 305 | return qarray<0>({arg}, {}); 306 | else 307 | return std::forward(arg); 308 | } 309 | 310 | template 311 | struct closure_type { 312 | using underlying_type = std::conditional_t>::value, 313 | const std::decay_t, 314 | std::decay_t>; 315 | using type = typename std::conditional::value, 316 | underlying_type&, 317 | underlying_type>::type; 318 | }; 319 | 320 | template 321 | using closure_type_t = typename closure_type::type; 322 | 323 | template 324 | auto make_qfunc(L func, Args&&... 
args) { 325 | return qfunction...>(func, std::forward(args)...); 326 | } 327 | 328 | #define BINARY_OP(OP) \ 329 | template \ 330 | auto operator OP (A&& a, B&& b) { \ 331 | return make_qfunc([](auto x, auto y) { return x OP y; }, \ 332 | wrap_if_scalar(std::forward(a)), wrap_if_scalar(std::forward(b))); \ 333 | } 334 | 335 | #define UNARY_FUNC(FUNC) \ 336 | template \ 337 | auto FUNC (A&& a) { \ 338 | return make_qfunc([](auto x) { \ 339 | using namespace std; return FUNC (x); }, wrap_if_scalar(std::forward(a))); \ 340 | } 341 | 342 | BINARY_OP(+); BINARY_OP(-); BINARY_OP(*); BINARY_OP(/); 343 | UNARY_FUNC(sin); UNARY_FUNC(cos); 344 | 345 | #include 346 | template 347 | void qprint(T&& t, O& os = std::cout) 348 | { 349 | using shape_type = typename std::decay_t::shape_type; 350 | auto print_func = make_qfunc([&os](auto el) { 351 | if constexpr (std::is_scalar_v) os << std::fixed << std::setprecision(2) << std::setw(8) << el << ", "; 352 | else qprint(el); 353 | }, 354 | std::forward(t)); 355 | recursive_for::value - 1>(std::make_tuple(print_func, [&os](auto) { os << "\n"; })); 356 | os << "\n"; 357 | } 358 | 359 | using Point3 = simple_array_view, 1>; 360 | 361 | template 362 | constexpr auto compute_size(const C& cnt) 363 | { 364 | return std::accumulate(cnt.cbegin(), cnt.cend(), size_t(1), std::multiplies()); 365 | } 366 | 367 | template 368 | auto sum_axis(const T& t, const I (&axes)[N]) 369 | { 370 | auto res_shape = t.shape; 371 | for (auto el : axes) 372 | res_shape[el] = 1; 373 | auto data = std::vector(compute_size(res_shape)); 374 | qarray<2, float> res(data, res_shape); 375 | 376 | auto sum_func = make_qfunc([](auto& lhs, auto rhs) { 377 | lhs += rhs; 378 | }, res, t); 379 | recursive_for::value - 1>(std::make_tuple(sum_func)); 380 | return res; 381 | } 382 | 383 | // auto sum(const Point3& a) 384 | // { 385 | // float result = 0; 386 | // for (std::size_t i = 0; i < a.size(); ++i) 387 | // { 388 | // result += a.memory[i]; 389 | // } 390 | // return result; 391 | // } 392 | 393 | template 394 | auto cumsum(const T& t, std::ptrdiff_t ax) 395 | { 396 | std::array index = {0, 0}; 397 | index[ax] = 1; // first elem 398 | auto offset = std::inner_product(index.begin(), index.end(), t.strides.begin(), 0); 399 | 400 | auto view_shape = t.shape; 401 | view_shape[ax] -= 1; 402 | 403 | qarray<2, double> res = t; // copy (copy only first "line") 404 | 405 | // pick first element, e.g. 
res(0, 1) but indexing with std array not possible r 406 | qview<2, double> rhs_v(&res(0, 0), view_shape, t.strides); 407 | qview<2, double> lhs_v(&res.memory[offset], view_shape, t.strides); 408 | auto sum_func = make_qfunc([](auto& lhs, auto rhs) { 409 | lhs += rhs; 410 | }, lhs_v, rhs_v); 411 | 412 | recursive_for::value - 1>(std::make_tuple(sum_func)); 413 | return res; 414 | } 415 | 416 | 417 | template 418 | auto sum(const T& t) 419 | { 420 | return std::accumulate(t.begin(), t.end(), 0.f); 421 | } 422 | 423 | auto super_custom_func(const Point3& a, const Point3& b) 424 | { 425 | return std::sqrt(sum(a * b)); 426 | } 427 | 428 | int main() 429 | { 430 | // Just to show that broadcasting works 431 | auto b1 = std::vector(25); 432 | std::iota(b1.begin(), b1.end(), 0.0); 433 | auto b2 = std::vector(5); 434 | std::iota(b2.begin(), b2.end(), 0.0); 435 | auto b3 = 2.5; 436 | auto bc_func = qarray<2>(b1, {5, 5}) + qarray<1>(b2, {5}) * b3; 437 | qprint(bc_func); 438 | 439 | auto q1 = qarray<2>(b1, {5, 5}); 440 | 441 | qprint(cumsum(q1, 0)); 442 | qprint(cumsum(q1, 1)); 443 | 444 | // std::cout << sum(bc_func) << std::endl; 445 | 446 | auto d1 = std::vector(1000 * 1000, 0.1); 447 | auto d2 = std::vector(1000, 0.232); 448 | auto dres = std::vector(1000 * 1000, 0); 449 | double c = 1.0; 450 | 451 | simple_array_view, 2> a(d1, {1000, 1000}); 452 | simple_array_view, 1> b(d2, {1000}); 453 | simple_array_view, 2> res(dres, {1000, 1000}); 454 | 455 | auto sqfunc = a + b - sin(c); 456 | 457 | std::size_t min_time = 100000000; 458 | 459 | qprint(cumsum(q1, 1)); 460 | std::cout << "\n\nBenchmarking cumsum(a[5x5], 1)\n===============================\n"; 461 | 462 | for (int i = 0; i < 10'000; ++i) 463 | { 464 | Timer timer; 465 | res = cumsum(q1, 1); 466 | auto elapsed = timer.elapsed(); // nanoseconds 467 | if (elapsed < min_time) min_time = elapsed; 468 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 469 | } 470 | 471 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 472 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 473 | 474 | min_time = 100000000; 475 | 476 | std::cout << "\n\nBenchmarking sum_axis(a[5x5], {0})\n===============================\n"; 477 | 478 | for (int i = 0; i < 10'000; ++i) 479 | { 480 | Timer timer; 481 | auto s_res = sum_axis(q1, {0}); 482 | auto elapsed = timer.elapsed(); // nanoseconds 483 | if (elapsed < min_time) min_time = elapsed; 484 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 485 | } 486 | 487 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 488 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 489 | 490 | min_time = 100000000; 491 | std::cout << "\n\nBenchmarking a[1000x1000] + b[1000] - sin(c[])\n===================================\n"; 492 | for (int i = 0; i < 200; ++i) 493 | { 494 | Timer timer; 495 | res = sqfunc; 496 | auto elapsed = timer.elapsed(); // nanoseconds 497 | if (elapsed < min_time) min_time = elapsed; 498 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 499 | } 500 | 501 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 502 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 503 | 504 | 505 | constexpr std::ptrdiff_t psz = 1'000'000; 506 | 507 | simple_array_view, 1> px(std::vector(psz, Point3(std::array{0.5, 2.1, 3.2}, {3})), {psz}); 508 | simple_array_view, 1> py(std::vector(psz, Point3(std::array{0.5, 2.1, 3.2}, {3})), {psz}); 
509 | simple_array_view, 1> pres(std::vector(psz), {psz}); 510 | 511 | auto pointfunc = make_qfunc(super_custom_func, px, py); 512 | 513 | pres = make_qfunc(super_custom_func, px, py); 514 | auto bc_super_custom_func = make_qfunc(super_custom_func, px, py); 515 | 516 | min_time = 100000000; 517 | 518 | std::cout << "\n\nBenchmarking super_custom_func\n===============================\n"; 519 | 520 | for (int i = 0; i < 200; ++i) 521 | { 522 | Timer timer; 523 | pres = bc_super_custom_func; 524 | auto elapsed = timer.elapsed(); // nanoseconds 525 | if (elapsed < min_time) min_time = elapsed; 526 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 527 | } 528 | 529 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 530 | std::cout << " = " << (double) min_time / (double) 1000 << " μs\n" << std::endl; 531 | } --------------------------------------------------------------------------------