├── .gitignore
├── cpp
│   ├── stl
│   │   ├── README.md
│   │   ├── Makefile
│   │   └── main.cpp
│   └── xtensor
│       ├── README.md
│       ├── CMakeLists.txt
│       └── src
│           └── main.cpp
├── README.md
├── julia
│   └── danisch
│       ├── benchmark.jl
│       └── solution.jl
└── nim
    └── nim_sol_mratsim.nim

/.gitignore:
--------------------------------------------------------------------------------
# For binaries while testing/benchmarking
build/
--------------------------------------------------------------------------------
/cpp/stl/README.md:
--------------------------------------------------------------------------------
# C++ submission to the Julia challenge

See the blog article by @wolfv:

https://medium.com/@wolfv/the-julia-challenge-in-c-21272d36c002
--------------------------------------------------------------------------------
/cpp/stl/Makefile:
--------------------------------------------------------------------------------
main.o : main.cpp
	$(CXX) -O3 -march=native -mtune=native -std=c++17 main.cpp -o main.o
clean :
	rm main.o
benchmark : main.o
	./main.o

--------------------------------------------------------------------------------
/cpp/xtensor/README.md:
--------------------------------------------------------------------------------
# C++ - xtensor submission to the Julia challenge

This requires xtensor, xsimd and a C++14 compiler.

```
mkdir build
cd build
cmake ..
make
./julia_challenge
```

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Julia Challenge

Repository to collect the code for:
https://nextjournal.com/sdanisch/the-julia-challenge


A submission should look like the following:

* place it into `language_name/authorname/solution.ext + benchmark.ext`
* a solution should implement an n-dimensional, n-argument lazy [broadcast](https://julia.guide/broadcasting) from scratch
* lazy means that one can aggregate recursive calls to a broadcasting operation and decide when and how to materialize the result (see the usage sketch below)
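To make the "lazy" requirement concrete, here is a minimal usage sketch of the reference Julia solution in `julia/danisch/`. It is only a sketch: the `include` path and the small array sizes are illustrative, while `@broadcast` and `materialize!` are the definitions from `solution.jl`.

```julia
include("julia/danisch/solution.jl")   # defines LazyBroadcast, @broadcast, materialize!

a = rand(5, 4)
b = rand(5)      # broadcasts along the first dimension of `a`
c = 1.0          # scalars are passed through unchanged

br = @broadcast a + b - sin(c)   # nothing is computed yet, `br` is a LazyBroadcast
out = similar(a)
materialize!(out, br)            # element-wise evaluation happens here
@assert out ≈ a .+ b .- sin(c)   # agrees with Julia's built-in broadcasting

sum(br)   # reductions can consume the lazy expression directly, as in benchmark.jl
```

Deciding when and how to materialize is exactly what a submission has to expose; `benchmark.jl` times both the materializing path and the reducing path.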
--------------------------------------------------------------------------------
/cpp/xtensor/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.1)

# Project Julia Challenge
project(julia-challenge)

# Require xtensor and xsimd
find_package(xtensor 0.18.1 REQUIRED)
find_package(xsimd 7.0.0 REQUIRED)

# Force build type to Release
message(STATUS "Forcing build type to Release")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)

# Compilation flags (march=native and the c++14 flag)
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -std=c++14")
endif()

# Sources
set(SOURCES src/main.cpp)

# Enable XSIMD acceleration
add_definitions(-DXTENSOR_USE_XSIMD)

# Setup executable
set(CHALLENGE_TARGET julia_challenge)
add_executable(${CHALLENGE_TARGET} ${SOURCES})
target_link_libraries(${CHALLENGE_TARGET} xtensor)
--------------------------------------------------------------------------------
/julia/danisch/benchmark.jl:
--------------------------------------------------------------------------------
# Benchmarks

using BenchmarkTools

reference(out, a, b, c) = (out .= a .+ b .- sin.(c))

a = rand(1000, 1000);
b = rand(1000);
c = 1.0
out = similar(a);
br = @broadcast a + b - sin(c)

@btime materialize!($out, $br)
@btime reference($out, $a, $b, $c)

# Any library with N-vectors that specialize on the length will do!
using GeometryTypes
const Point3 = Point{3, Float32}

# the function needs to come from a different library
module LibraryB
# no odd stuff, no functors, no special lambda expression!
# this function needs to be a normal language function
# as can be found in the wild
super_custom_func(a, b) = sqrt(sum(a .* b))
end
# emulate that the function comes from a different library
using .LibraryB: super_custom_func

using BenchmarkTools

a = rand(Point3, 10^6)
b = rand(Point3, 10^6)
out = fill(0f0, 10^6)

@btime $out .= super_custom_func.($a, $b)
br = @broadcast super_custom_func(a, b)
@btime materialize!($out, $br)

@btime sum($br)
@btime sum($a .+ $b .- sin.($c))
--------------------------------------------------------------------------------
/julia/danisch/solution.jl:
--------------------------------------------------------------------------------
import Base: getindex, iterate, axes, eachindex, tail, @propagate_inbounds
struct LazyBroadcast{F, Args}
    f::F
    args::Args
end
@propagate_inbounds function br_getindex(A::AbstractArray, I)
    idx = ntuple(i-> ifelse(size(A, i) === 1, 1, I[i]), Val(ndims(A)))
    return A[CartesianIndex(idx)]
end
br_getindex(scalar, I) = scalar # scalars don't need to be indexed
@propagate_inbounds function br_getindex(x::LazyBroadcast, I)
    # this could be a map, but the current map in 1.0 has a perf problem
    return x.f(getindex_arg(x.args, I)...)
end
getindex_arg(args::Tuple{}, I) = () # recursion anchor
@propagate_inbounds function getindex_arg(args::NTuple{N, Any}, I) where N
    return (br_getindex(args[1], I), getindex_arg(tail(args), I)...)
end
@propagate_inbounds getindex(x::LazyBroadcast, I) = br_getindex(x, Tuple(I))
function materialize!(out::AbstractArray, x::LazyBroadcast)
    # an n-dimensional SIMD-accelerated loop
    @simd for i in CartesianIndices(axes(out))
        @inbounds out[i] = x[i]
    end
    return out
end
br_construct(x) = x
function br_construct(x::Expr)
    x.args .= br_construct.(x.args) # apply recursively
    if Meta.isexpr(x, :call) # replace calls to construct LazyBroadcasts
        x = :(LazyBroadcast($(x.args[1]), ($(x.args[2:end]...),)))
    end
    x
end
# macro to enable the syntax @broadcast a + b - sin(c) to construct our type
macro broadcast(call_expr)
    esc(br_construct(call_expr))
end
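# A concrete illustration (comment only, hygiene/escaping omitted): on an
# expression like `a + b - sin(c)`, `br_construct` rewrites every call node,
# so `@broadcast a + b - sin(c)` turns into
#     LazyBroadcast(-, (LazyBroadcast(+, (a, b)), LazyBroadcast(sin, (c,))))
# i.e. a lazy call tree that is only evaluated element-wise via `getindex`.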
# Simplified implementation to take the axes of the array with the largest
# dimensionality (axes -> the range an array iterates over)
biggest(a, b, c, rest...) = biggest(biggest(a, b), biggest(c, rest...))
biggest(a::NTuple{N1, Any}, b::NTuple{N2, Any}) where {N1, N2} =
    ifelse(N1 > N2, a, b)
biggest(a) = a
flatten_args(t::LazyBroadcast, rest...) =
    (flatten_args(t.args...)..., flatten_args(rest...)...)
flatten_args(t::Any, rest...) = (t, flatten_args(rest...)...)
flatten_args() = ()
# the indexing axes of our array
axes(br::LazyBroadcast) = biggest(map(axes, flatten_args(br))...)
# lazy view that can be used to index over all elements in br
eachindex(br::LazyBroadcast) = CartesianIndices(axes(br))
iterate(br::LazyBroadcast) = iterate(br, (eachindex(br),))
@propagate_inbounds function iterate(bc::LazyBroadcast, s)
    y = iterate(s...)
    y === nothing && return nothing
    i, newstate = y
    return (bc[i], (s[1], newstate))
end

--------------------------------------------------------------------------------
/cpp/xtensor/src/main.cpp:
--------------------------------------------------------------------------------
#include
#include
#include
#include
#include

#include
#include
#include
#include
#include

// Simple stopwatch object
class stopwatch
{
public:

    stopwatch() : m_start(clock_type::now())
    {
    }

    void reset()
    {
        m_start = clock_type::now();
    }

    std::size_t elapsed() const
    {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(clock_type::now() - m_start).count();
    }

private:

    typedef std::chrono::high_resolution_clock clock_type;
    std::chrono::time_point<clock_type> m_start;
};

template <class T>
using xpoint = xt::xtensor_fixed<T, xt::xshape<3>>;

template <class T>
auto sum_xpoint(const xpoint<T>& t)
{
    return std::accumulate(t.cbegin(), t.cend(), T());
}

// Simple test program
int main()
{
    // First Benchmark:
    //
    // a[1000x1000] + b[1000] - sin(c[])
    {
        xt::xtensor<double, 2> a = xt::random::rand<double>({1000, 1000});
        xt::xtensor<double, 1> b = xt::random::rand<double>({1000});
        double c = 1.0;

        // Un-evaluated broadcasting expression
        auto expr = a + b - std::sin(c);
        auto res = xt::xtensor<double, 2>::from_shape({1000, 1000});

        // Benchmark loop
        std::cout << "Benchmarking a[1000x1000] + b[1000] - sin(c[])" << std::endl;
        std::size_t min_time = 100000000;
        for (int i = 0; i < 200; ++i)
        {
            stopwatch timer;         // Create timer
            xt::noalias(res) = expr; // Evaluate the expression.
            auto elapsed = timer.elapsed(); // Nanoseconds
            if (elapsed < min_time)
            {
                min_time = elapsed;
            }
        }

        // Output results
        std::cout << "MIN TIME: " << min_time << " ns" << std::endl;
        std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl;
        std::cout << std::endl << std::endl;
    }

    // Second Benchmark:
    //
    // std::sqrt(sum(a * b));
    {
        constexpr std::size_t psz = 1000000;
        auto px = xt::xtensor<xpoint<float>, 1>({psz}, {0.5f, 2.1f, 3.2f}),
             py = xt::xtensor<xpoint<float>, 1>({psz}, {0.5f, 2.1f, 3.2f});
        auto res = xt::xtensor<float, 1>({psz});
        auto sum = xt::vectorize(sum_xpoint<float>);

        // Un-evaluated broadcasting expression
        auto expr = xt::sqrt(sum(px * py));

        // Benchmark loop
        std::cout << "Benchmarking sqrt(sum(a * b))" << std::endl;
        std::size_t min_time = 100000000;
        for (int i = 0; i < 200; ++i)
        {
            stopwatch timer;         // Create timer
            xt::noalias(res) = expr; // Evaluate the expression.
            auto elapsed = timer.elapsed(); // Nanoseconds
            if (elapsed < min_time)
            {
                min_time = elapsed;
            }
        }

        // Output results
        std::cout << "MIN TIME: " << min_time << " ns" << std::endl;
        std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl;
        std::cout << std::endl << std::endl;
    }
}

--------------------------------------------------------------------------------
/nim/nim_sol_mratsim.nim:
--------------------------------------------------------------------------------
# MIT License
# Copyright (c) 2018 Mamy André-Ratsimbazafy

## This file gives basic tensor library functionality, because yes we can
import strformat, macros, sequtils, random

type
  Tensor[Rank: static[int], T] = object
    ## Tensor data structure stored on Cpu
    ##   - ``shape``: Dimensions of the tensor
    ##   - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ##   - ``offset``: Offset to get the first item of the tensor. Note: offset can be negative, in particular for slices.
    ##   - ``storage``: A data storage for the tensor
    ##   - Rank is part of the type for optimization purposes
    ##
    ## Warning ⚠:
    ##   Assignment ```var a = b``` does not copy the data. Data modification on one tensor will be reflected on the other.
    ##   However modification on metadata (shape, strides or offset) will not affect the other tensor.
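    ##
    ## A concrete illustration (example values, matching the row-major strides
    ## computed in the `tensor` template below): a tensor of shape [2, 3] gets
    ## strides [3, 1], so element [i, j] is read from storage.data[offset + 3*i + j].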
19 | shape: array[Rank, int] 20 | strides: array[Rank, int] 21 | offset: int 22 | storage: CpuStorage[T] 23 | 24 | CpuStorage*{.shallow.}[T] = object 25 | ## Data storage for the tensor, copies are shallow by default 26 | data*: seq[T] 27 | 28 | template tensor(result: var Tensor, shape: array) = 29 | result.shape = shape 30 | 31 | var accum = 1 32 | for i in countdown(Rank - 1, 0): 33 | result.strides[i] = accum 34 | accum *= shape[i] 35 | 36 | func newTensor*[Rank: static[int], T](shape: array[Rank, int]): Tensor[Rank, T] = 37 | tensor(result, shape) 38 | result.storage.data = newSeq[T](shape.product) 39 | 40 | proc rand[T: object|tuple](max: T): T = 41 | ## A generic random function for any stack object or tuple 42 | ## that initialize all fields randomly 43 | result = max 44 | for field in result.fields: 45 | field = rand(field) 46 | 47 | proc randomTensor*[Rank: static[int], T](shape: array[Rank, int], max: T): Tensor[Rank, T] = 48 | tensor(result, shape) 49 | result.storage.data = newSeqWith(shape.product, T(rand(max))) 50 | 51 | func getIndex[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): int {.inline.} = 52 | ## Convert [i, j, k, l ...] to the memory location referred by the index 53 | result = t.offset 54 | for i in 0 ..< t.Rank: 55 | {.unroll.} # I'm sad this doesn't work yet 56 | result += t.strides[i] * idx[i] 57 | 58 | func `[]`[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): T {.inline.}= 59 | ## Index tensor 60 | t.storage.data[t.getIndex(idx)] 61 | 62 | func `[]=`[Rank, T](t: var Tensor[Rank, T], idx: array[Rank, int], val: T) {.inline.}= 63 | ## Index tensor 64 | t.storage.data[t.getIndex(idx)] = val 65 | 66 | template `[]`[T: SomeNumber](x: T, idx: varargs[int]): T = 67 | ## "Index" scalars 68 | x 69 | 70 | func shape(x: SomeNumber): array[1, int] = [1] 71 | 72 | func bcShape[R1, R2: static[int]](x: array[R1, int]; y: array[R2, int]): auto = 73 | when R1 > R2: 74 | result = x 75 | for i, idx in result.mpairs: 76 | if idx == 1 and y[i] != 1: 77 | idx = y[i] 78 | else: 79 | result = y 80 | for i, idx in result.mpairs: 81 | if idx == 1 and x[i] != 1: 82 | idx = x[i] 83 | 84 | macro getBroadcastShape(x: varargs[typed]): untyped = 85 | assert x.len >= 2 86 | result = nnkDotExpr.newTree(x[0], ident"shape") 87 | for i in 1 ..< x.len: 88 | let xi = x[i] 89 | result = quote do: bcShape(`result`, `xi`.shape) 90 | 91 | func bc[R1, R2: static[int], T](t: Tensor[R1, T], shape: array[R2, int]): Tensor[R2, T] = 92 | ## Broadcast tensors 93 | result.shape = shape 94 | for i in 0 ..< R1: 95 | if t.shape[i] == 1 and shape[i] != 1: 96 | result.strides[i] = 0 97 | else: 98 | result.strides[i] = t.strides[i] 99 | if t.shape[i] != result.shape[i]: 100 | raise newException(ValueError, "The broadcasted size of the tensor must match existing size for non-singleton dimension") 101 | result.offset = t.offset 102 | result.storage = t.storage 103 | 104 | func bc[Rank; T: SomeNumber](x: T, shape: array[Rank, int]): T {.inline.}= 105 | ## "Broadcast" scalars 106 | x 107 | 108 | func product(x: varargs[int]): int = 109 | result = 1 110 | for val in x: result *= val 111 | 112 | proc replaceNodes(ast: NimNode, values: NimNode, containers: NimNode): NimNode = 113 | # Args: 114 | # - The full syntax tree 115 | # - an array of replacement value 116 | # - an array of identifiers to replace 117 | proc inspect(node: NimNode): NimNode = 118 | case node.kind: 119 | of {nnkIdent, nnkSym}: 120 | for i, c in containers: 121 | if node.eqIdent($c): 122 | return values[i] 123 | return node 124 | of nnkEmpty: 
return node 125 | of nnkLiterals: return node 126 | else: 127 | var rTree = node.kind.newTree() 128 | for child in node: 129 | rTree.add inspect(child) 130 | return rTree 131 | result = inspect(ast) 132 | 133 | proc pop*(tree: var NimNode): NimNode = 134 | ## varargs[untyped] consumes all arguments so the actual value should be popped 135 | ## https://github.com/nim-lang/Nim/issues/5855 136 | result = tree[tree.len-1] 137 | tree.del(tree.len-1) 138 | 139 | func nb_elems[N: static[int], T](x: typedesc[array[N, T]]): static[int] = 140 | N 141 | 142 | macro broadcastImpl(output: untyped, inputs_body: varargs[untyped]): untyped = 143 | ## If output is empty node it will return a value 144 | ## otherwise, result will be assigned in-place to output 145 | let 146 | in_place = newLit output.kind != nnkEmpty 147 | 148 | var 149 | inputs = inputs_body 150 | body = inputs.pop() 151 | 152 | let 153 | shape = genSym(nskLet, "broadcast_shape__") 154 | coord = genSym(nskVar, "broadcast_coord__") 155 | 156 | var doBroadcast = newStmtList() 157 | var bcInputs = nnkArgList.newTree() 158 | for input in inputs: 159 | let broadcasted = genSym(nskLet, "broadcast_" & $input & "__") 160 | doBroadcast.add newLetStmt( 161 | broadcasted, 162 | newCall(ident"bc", input, shape) 163 | ) 164 | bcInputs.add nnkBracketExpr.newTree(broadcasted, coord) 165 | 166 | body = body.replaceNodes(bcInputs, inputs) 167 | 168 | result = quote do: 169 | block: 170 | let `shape` = getBroadcastShape(`inputs`) 171 | const rank = `shape`.type.nb_elems 172 | var `coord`: array[rank, int] # Current coordinates in the n-dimensional space 173 | `doBroadcast` 174 | 175 | when not `in_place`: 176 | var output = newTensor[rank, type(`body`)](`shape`) 177 | else: 178 | assert `output`.shape == `shape` 179 | 180 | for _ in 0 ..< `shape`.product: 181 | # Assign for the current iteration 182 | when not `in_place`: 183 | output[`coord`] = `body` 184 | else: 185 | `output`[`coord`] = `body` 186 | 187 | # Compute the next position 188 | for k in countdown(rank - 1, 0): 189 | if `coord`[k] < `shape`[k] - 1: 190 | `coord`[k] += 1 191 | break 192 | else: 193 | `coord`[k] = 0 194 | 195 | # Now return the value 196 | when not `in_place`: 197 | output 198 | 199 | macro broadcast(inputs_body: varargs[untyped]): untyped = 200 | getAST(broadcastImpl(newEmptyNode(), inputs_body)) 201 | 202 | macro materialize(output: var Tensor, inputs_body: varargs[untyped]): untyped = 203 | getAST(broadcastImpl(output, inputs_body)) 204 | 205 | ################################################################################# 206 | 207 | import math 208 | proc sanityChecks() = 209 | # Sanity checks 210 | 211 | let x = randomTensor([1, 2, 3], 10) 212 | let y = randomTensor([5, 2], 10) 213 | 214 | echo x # (shape: [1, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1, 10, 5, 5, 7, 3])) 215 | echo y # (shape: [5, 2], strides: [2, 1], offset: 0, storage: (data: @[8, 3, 7, 9, 3, 8, 5, 3, 7, 1])) 216 | 217 | block: # Simple assignation 218 | echo "\nSimple assignation" 219 | let a = broadcast(x, y): 220 | x * y 221 | 222 | echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[8, 80, 40, 15, 21, 9, 7, 70, 35, 45, 63, 27, 3, 30, 15, 40, 56, 24, 5, 50, 25, 15, 21, 9, 7, 70, 35, 5, 7, 3])) 223 | 224 | block: # In-place, similar to Julia impl 225 | echo "\nIn-place, similar to Julia impl" 226 | var a = newTensor[3, int]([5, 2, 3]) 227 | materialize(a, x, y): 228 | x * y 229 | 230 | echo a 231 | 232 | block: # Complex multi statement with type conversion 233 
| echo "\nComplex multi statement with type conversion" 234 | let a = broadcast(x, y): 235 | let c = cos x.float64 236 | let s = sin y.float64 237 | 238 | sqrt(c.pow(2) + s.pow(2)) 239 | 240 | echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1.12727828058919, 1.297255090978019, 1.029220081237957, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.5003057878335346, 0.859191628789455, 1.072346394223034, 0.5584276483137685, 0.8508559734652587, 0.3168265963213802, 1.029220081237957, 1.243864280886628, 1.399612404734566, 1.100664502137075, 1.274196529364651, 1.0, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.8879964266455946, 1.129797339073468, 1.299291561428286])) 241 | 242 | block: # Variadic number of types with proc declaration inside 243 | echo "\nVariadic number of types with proc declaration inside" 244 | var u, v, w, x, y, z = randomTensor([3, 3], 10) 245 | 246 | let c = 2 247 | 248 | let a = broadcast(u, v, w, x, y, z): 249 | # ((u * v * w) div c) mod (if not zero (x - y + z) else 42) 250 | 251 | proc ifNotZero(val, default: int): int = 252 | if val == 0: default 253 | else: val 254 | 255 | let uvw_divc = u * v * w div c 256 | let xmypz = x - y + z 257 | 258 | uvw_divc mod ifNotZero(xmypz, 42) 259 | 260 | echo a # (shape: [3, 3], strides: [3, 1], offset: 0, storage: (data: @[0, 0, 0, 7, 4, 0, 0, 2, 0])) 261 | 262 | block: # Simple broadcasted addition test 263 | echo "\nSimple broadcasted addition test" 264 | var a = newTensor[2, int]([2, 3]) 265 | a.storage.data = @[3, 2, 1, 1, 2, 3] # Ideally we should have arrays of arrays -> tensor conversion 266 | var b = newTensor[2, int]([1, 3]) 267 | b.storage.data = @[1, 2, 3] 268 | 269 | let c = broadcast(a, b): a + b 270 | doAssert c.storage.data == @[4, 4, 4, 2, 4, 6] 271 | echo "✓ Passed" 272 | 273 | ################################################################################# 274 | 275 | import math, random, times, stats, strformat 276 | proc mainBench(nb_samples: int) = 277 | ## Bench with standard lib 278 | block: # Warmup - make sure cpu is on max perf 279 | let start = cpuTime() 280 | var foo = 123 281 | for i in 0 ..< 100_000_000: 282 | foo += i*i mod 456 283 | foo = foo mod 789 284 | 285 | # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects 286 | let stop = cpuTime() 287 | echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)" 288 | 289 | let 290 | a = randomTensor([1000, 1000], 1.0) 291 | b = randomTensor([1000], 1.0) 292 | c = 1.0 293 | var output = newTensor[2, float64](a.shape) 294 | 295 | block: # Actual bench 296 | var stats: RunningStat 297 | for _ in 0 ..< nb_samples: 298 | let start = cpuTime() 299 | materialize(output, a, b, c): 300 | a + b - sin c 301 | let stop = cpuTime() 302 | stats.push stop - start 303 | 304 | echo &"\nTensors of Float64 bench" 305 | echo &"Collected {stats.n} samples" 306 | echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms" 307 | echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms" 308 | echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms" 309 | echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms" 310 | echo "\nDisplay output[[0,0]] to make sure it's not optimized away" 311 | echo output[[0, 0]] 312 | 313 | proc geometryBench(nb_samples: int) = 314 | type Point3 = object 315 | x, y, z: float32 316 | 317 | template 
liftBinary(op: untyped): untyped = 318 | func `op`(a, b: Point3): Point3 {.inline.}= 319 | result.x = `op`(a.x, b.x) 320 | result.y = `op`(a.y, b.y) 321 | result.z = `op`(a.z, b.z) 322 | func `op`(a: Point3, b: float32): Point3 {.inline.}= 323 | result.x = `op`(a.x, b) 324 | result.y = `op`(a.y, b) 325 | result.z = `op`(a.z, b) 326 | template liftReduce(opName, op: untyped): untyped = 327 | func `opName`(a: Point3): float32 {.inline.}= 328 | a.x.`op`(a.y).`op`(a.z) 329 | 330 | liftBinary(`+`) 331 | liftBinary(`*`) 332 | liftBinary(`-`) 333 | liftReduce(sum, `+`) 334 | 335 | let 336 | a = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100)) 337 | b = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100)) 338 | c = 1.0'f32 # Julia has Point3 has float32 but C has float64 339 | var output = newTensor[1, float32](a.shape) 340 | 341 | block: # Custom function sqrt(sum(a .* b)) 342 | func super_custom_func(a, b: Point3): float32 = sqrt sum(a * b) 343 | 344 | var stats: RunningStat 345 | for _ in 0 ..< nb_samples: 346 | let start = cpuTime() 347 | materialize(output, a, b): 348 | super_custom_func(a, b) 349 | let stop = cpuTime() 350 | stats.push stop - start 351 | 352 | echo &"\nTensor of 3D float32 points bench" 353 | echo &"Collected {stats.n} samples" 354 | echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms" 355 | echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms" 356 | echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms" 357 | echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms" 358 | echo "\nDisplay output[[0]] to make sure it's not optimized away" 359 | echo output[[0]] 360 | 361 | when isMainModule: 362 | sanityChecks() 363 | echo "\n###################" 364 | echo "Benchmark" 365 | # {.passC: "-march=native" .} # uncomment to enable full optim (AVX/AVX2, ...) 366 | # randomize(seed = 0) 367 | mainBench(1_000) 368 | geometryBench(1_000) 369 | 370 | # Compile with 371 | # nim c -d:release nim/nim_sol_mratsim.nim # for binary only 372 | # nim c -r -d:release nim/nim_sol_mratsim.nim # for binary + running 373 | -------------------------------------------------------------------------------- /cpp/stl/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | ////////////////////////////////////////////////////////////////////////// 12 | // Helper Stuff 13 | ////////////////////////////////////////////////////////////////////////// 14 | 15 | template 16 | void print_container(T& container) { 17 | for (auto el : container) 18 | std::cout << el << ", "; 19 | std::cout << "\n"; 20 | } 21 | 22 | template 23 | void for_each_impl(F&& f, T&& tuple, std::index_sequence) { 24 | (void) std::initializer_list{ 25 | (f(std::get(std::forward(tuple))), void(), int{})... 
26 | }; 27 | } 28 | 29 | template 30 | void for_each(F&& f, T&& tuple) { 31 | constexpr std::size_t N = std::tuple_size>::value; 32 | for_each_impl(std::forward(f), std::forward(tuple), 33 | std::make_index_sequence{}); 34 | } 35 | 36 | class Timer 37 | { 38 | public: 39 | Timer() : beg_(clock_::now()) {} 40 | void reset() { beg_ = clock_::now(); } 41 | size_t elapsed() const { 42 | return std::chrono::duration_cast 43 | (clock_::now() - beg_).count(); } 44 | 45 | private: 46 | typedef std::chrono::high_resolution_clock clock_; 47 | std::chrono::time_point beg_; 48 | }; 49 | 50 | ////////////////////////////////////////////////////////////////////////// 51 | // Array class 52 | ////////////////////////////////////////////////////////////////////////// 53 | 54 | enum layout { row_major, col_major }; 55 | 56 | template 57 | auto recursive_for(const std::tuple& x, Args... args) { 58 | if constexpr (I == N) 59 | for (std::size_t i = 0; i < std::get<0>(x).shape[I]; ++i) 60 | std::get<0>(x)(args..., i); 61 | else 62 | for (std::size_t i = 0; i < std::get<0>(x).shape[I]; ++i) { 63 | if constexpr (sizeof...(X) > N) 64 | std::get(x)(args..., i); 65 | recursive_for(x, args..., i); 66 | } 67 | } 68 | 69 | template 70 | struct qiterator 71 | { 72 | using index_type = typename T::shape_type; 73 | 74 | qiterator(const T& parent) 75 | : m_ref(parent), m_size(m_ref.size()), m_index{0} { 76 | } 77 | 78 | qiterator& operator++() { 79 | m_linear_idx++; 80 | if constexpr (L == row_major) 81 | { 82 | for (std::size_t i = std::tuple_size_v; ++m_index[i - 1], i > 0; --i) 83 | { 84 | if (m_index[i - 1] == m_ref.shape[i - 1]) { m_index[i - 1] = 0; } 85 | else { return *this; } 86 | } 87 | } 88 | else 89 | { 90 | for (std::size_t i = 0; ++m_index[i], i < std::tuple_size_v; ++i) 91 | { 92 | if (m_index[i] == m_ref.shape[i]) { m_index[i] = 0; } 93 | else { return *this; } 94 | } 95 | } 96 | return *this; 97 | } 98 | 99 | template 100 | auto deref_impl(std::index_sequence) { return m_ref(std::get(m_index)...); } 101 | template 102 | const auto deref_impl(std::index_sequence) const { return m_ref(std::get(m_index)...); } 103 | 104 | auto operator*() { return deref_impl(std::make_index_sequence>{}); } 105 | const auto operator*() const { return deref_impl(std::make_index_sequence>{}); } 106 | 107 | bool operator==(const qiterator&) { return m_linear_idx == m_size; } 108 | bool operator!=(const qiterator& end) { return !(*this == end); } 109 | 110 | const T& m_ref; 111 | index_type m_index; 112 | std::size_t m_linear_idx = 0, m_size = 0; 113 | }; 114 | 115 | template 116 | void container_resize(T& container, std::size_t sz) { container.resize(sz); }; 117 | template 118 | void container_resize(std::array& container, std::size_t sz) {}; 119 | 120 | template 121 | class qfunction; 122 | 123 | template 124 | class simple_array_view { 125 | public: 126 | 127 | using self_type = simple_array_view; 128 | using shape_type = std::array; 129 | using container_type = std::decay_t; 130 | using value_type = std::decay_t()[0])>; 131 | using container_reference = std::decay_t&; 132 | 133 | simple_array_view() = default; 134 | 135 | auto compute_strides() 136 | { 137 | ptrdiff_t data_size = 1; 138 | if constexpr (N > 0) { 139 | if constexpr (L == row_major) { 140 | strides[N - 1] = shape[N - 1] != 1 ? 1 : 0; 141 | for (std::ptrdiff_t i = N - 1; i > 0; --i) { 142 | data_size *= static_cast(shape[i]); 143 | strides[i - 1] = shape[i - 1] != 1 ? 
data_size : 0; 144 | } 145 | data_size *= shape[0]; 146 | } 147 | else { 148 | for (std::size_t i = 0; i < N; ++i) { 149 | strides[i] = data_size; 150 | data_size = strides[i] * static_cast(shape[i]); 151 | if (shape[i] == 1) { strides[i] = 0; } 152 | } 153 | } 154 | } 155 | return data_size; 156 | } 157 | 158 | auto constexpr compute_offset() const { return ptrdiff_t(0); }; 159 | 160 | template 161 | auto constexpr compute_offset(Arg a1, Args... args) const { 162 | if constexpr (sizeof...(Args) + 1 > N) 163 | return compute_offset(args...); 164 | else { 165 | std::array idx({static_cast(a1), static_cast(args)...}); 166 | ptrdiff_t offset = 0; 167 | for (std::size_t i = 0; i < N; ++i) { 168 | offset += strides[i] * idx[i]; 169 | } 170 | return offset; 171 | } 172 | } 173 | 174 | explicit simple_array_view(value_type data, const std::array& i_shape) : shape(i_shape) { 175 | container_resize(memory, compute_strides()); 176 | std::fill(memory.begin(), memory.end(), data); 177 | compute_strides(); 178 | } 179 | 180 | explicit simple_array_view(CT data, const std::array& i_shape, 181 | const std::array& i_strides) : memory(data), shape(i_shape), strides(i_strides) 182 | { 183 | } 184 | 185 | template 186 | explicit simple_array_view(T&& data, const std::array& i_shape) : memory(std::forward(data)), shape(i_shape) { 187 | compute_strides(); 188 | } 189 | 190 | explicit simple_array_view(const std::array& i_shape) : shape(i_shape) { 191 | container_resize(memory, compute_strides()); 192 | } 193 | 194 | template 195 | void assign_impl(T&& rhs) { 196 | auto assign_func = make_qfunc([](auto& lhs, auto rhs) { lhs = rhs; }, *this, rhs); 197 | recursive_for::value - 1>(std::make_tuple(std::move(assign_func))); 198 | } 199 | 200 | template 201 | simple_array_view(const qfunction& e) : shape(e.shape) { 202 | container_resize(memory, compute_strides()); 203 | assign_impl(e); 204 | } 205 | 206 | template 207 | self_type& operator=(const qfunction& e) { 208 | if (!std::equal(shape.begin(), shape.end(), e.shape.begin())) { 209 | std::copy(e.shape.begin(), e.shape.end(), shape.begin()); 210 | container_resize(memory, compute_strides()); 211 | } 212 | assign_impl(e); 213 | return *this; 214 | } 215 | 216 | auto begin() { return qiterator(*this); } 217 | auto end() { return qiterator(*this); } 218 | 219 | void fill(value_type val) { std::fill(memory.begin(), memory.end(), val); } 220 | 221 | template 222 | value_type& operator()(Args... args) { return memory[compute_offset(args...)]; } 223 | 224 | template 225 | const value_type& operator()(Args... args) const { return memory[compute_offset(args...)]; } 226 | 227 | container_reference data() { return memory; } 228 | size_t size() const { return memory.size(); }; 229 | 230 | CT memory; 231 | std::array shape; 232 | std::array strides; 233 | }; 234 | 235 | template 236 | constexpr auto max_dim() { 237 | constexpr auto arr = std::array{std::tuple_size::value...}; 238 | return *std::max_element(arr.begin(), arr.end()); 239 | } 240 | 241 | //////////////////////////////////////////////////////////////////////////////// 242 | // Lazy function 243 | //////////////////////////////////////////////////////////////////////////////// 244 | 245 | template 246 | class qfunction 247 | { 248 | public: 249 | 250 | using shape_type = std::array::shape_type...>()>; 251 | 252 | template 253 | qfunction(F f, Args&&... args) 254 | : m_f(f), m_args(std::forward(args)...) 
255 | { 256 | std::fill(shape.begin(), shape.end(), 1); 257 | 258 | auto broadcast_shape = [this](const auto& v) constexpr { 259 | std::size_t offset = this->shape.size() - v.shape.size(); 260 | for (std::size_t i = 0; i < v.shape.size(); ++i) { 261 | if (this->shape[offset + i] == 1) 262 | this->shape[offset + i] = v.shape[i]; 263 | else 264 | if (v.shape[i] != this->shape[offset + i] && v.shape[i] != 1) 265 | throw std::runtime_error("Broadcast error."); 266 | } 267 | return true; 268 | }; 269 | 270 | for_each(broadcast_shape, m_args); 271 | } 272 | 273 | template 274 | auto access_impl(std::index_sequence, Args... args) const { 275 | return m_f(std::get(m_args)(args...)...); 276 | } 277 | 278 | template 279 | auto operator()(Args... args) const { 280 | return access_impl(std::make_index_sequence(), args...); 281 | } 282 | 283 | auto begin() { return qiterator(*this); } 284 | auto end() { return qiterator(*this); } 285 | auto begin() const { return qiterator(*this); } 286 | auto end() const { return qiterator(*this); } 287 | 288 | size_t size() const { return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>{}); } 289 | 290 | F m_f; 291 | shape_type shape; 292 | std::tuple m_args; 293 | }; 294 | 295 | template 296 | using qarray = simple_array_view, N>; 297 | 298 | template 299 | using qview = simple_array_view; 300 | 301 | template 302 | decltype(auto) wrap_if_scalar(T&& arg) 303 | { 304 | if constexpr (std::is_scalar_v>) 305 | return qarray<0>({arg}, {}); 306 | else 307 | return std::forward(arg); 308 | } 309 | 310 | template 311 | struct closure_type { 312 | using underlying_type = std::conditional_t>::value, 313 | const std::decay_t, 314 | std::decay_t>; 315 | using type = typename std::conditional::value, 316 | underlying_type&, 317 | underlying_type>::type; 318 | }; 319 | 320 | template 321 | using closure_type_t = typename closure_type::type; 322 | 323 | template 324 | auto make_qfunc(L func, Args&&... 
args) { 325 | return qfunction...>(func, std::forward(args)...); 326 | } 327 | 328 | #define BINARY_OP(OP) \ 329 | template \ 330 | auto operator OP (A&& a, B&& b) { \ 331 | return make_qfunc([](auto x, auto y) { return x OP y; }, \ 332 | wrap_if_scalar(std::forward(a)), wrap_if_scalar(std::forward(b))); \ 333 | } 334 | 335 | #define UNARY_FUNC(FUNC) \ 336 | template \ 337 | auto FUNC (A&& a) { \ 338 | return make_qfunc([](auto x) { \ 339 | using namespace std; return FUNC (x); }, wrap_if_scalar(std::forward(a))); \ 340 | } 341 | 342 | BINARY_OP(+); BINARY_OP(-); BINARY_OP(*); BINARY_OP(/); 343 | UNARY_FUNC(sin); UNARY_FUNC(cos); 344 | 345 | #include 346 | template 347 | void qprint(T&& t, O& os = std::cout) 348 | { 349 | using shape_type = typename std::decay_t::shape_type; 350 | auto print_func = make_qfunc([&os](auto el) { 351 | if constexpr (std::is_scalar_v) os << std::fixed << std::setprecision(2) << std::setw(8) << el << ", "; 352 | else qprint(el); 353 | }, 354 | std::forward(t)); 355 | recursive_for::value - 1>(std::make_tuple(print_func, [&os](auto) { os << "\n"; })); 356 | os << "\n"; 357 | } 358 | 359 | using Point3 = simple_array_view, 1>; 360 | 361 | template 362 | constexpr auto compute_size(const C& cnt) 363 | { 364 | return std::accumulate(cnt.cbegin(), cnt.cend(), size_t(1), std::multiplies()); 365 | } 366 | 367 | template 368 | auto sum_axis(const T& t, const I (&axes)[N]) 369 | { 370 | auto res_shape = t.shape; 371 | for (auto el : axes) 372 | res_shape[el] = 1; 373 | auto data = std::vector(compute_size(res_shape)); 374 | qarray<2, float> res(data, res_shape); 375 | 376 | auto sum_func = make_qfunc([](auto& lhs, auto rhs) { 377 | lhs += rhs; 378 | }, res, t); 379 | recursive_for::value - 1>(std::make_tuple(sum_func)); 380 | return res; 381 | } 382 | 383 | // auto sum(const Point3& a) 384 | // { 385 | // float result = 0; 386 | // for (std::size_t i = 0; i < a.size(); ++i) 387 | // { 388 | // result += a.memory[i]; 389 | // } 390 | // return result; 391 | // } 392 | 393 | template 394 | auto cumsum(const T& t, std::ptrdiff_t ax) 395 | { 396 | std::array index = {0, 0}; 397 | index[ax] = 1; // first elem 398 | auto offset = std::inner_product(index.begin(), index.end(), t.strides.begin(), 0); 399 | 400 | auto view_shape = t.shape; 401 | view_shape[ax] -= 1; 402 | 403 | qarray<2, double> res = t; // copy (copy only first "line") 404 | 405 | // pick first element, e.g. 
res(0, 1) but indexing with std array not possible r 406 | qview<2, double> rhs_v(&res(0, 0), view_shape, t.strides); 407 | qview<2, double> lhs_v(&res.memory[offset], view_shape, t.strides); 408 | auto sum_func = make_qfunc([](auto& lhs, auto rhs) { 409 | lhs += rhs; 410 | }, lhs_v, rhs_v); 411 | 412 | recursive_for::value - 1>(std::make_tuple(sum_func)); 413 | return res; 414 | } 415 | 416 | 417 | template 418 | auto sum(const T& t) 419 | { 420 | return std::accumulate(t.begin(), t.end(), 0.f); 421 | } 422 | 423 | auto super_custom_func(const Point3& a, const Point3& b) 424 | { 425 | return std::sqrt(sum(a * b)); 426 | } 427 | 428 | int main() 429 | { 430 | // Just to show that broadcasting works 431 | auto b1 = std::vector(25); 432 | std::iota(b1.begin(), b1.end(), 0.0); 433 | auto b2 = std::vector(5); 434 | std::iota(b2.begin(), b2.end(), 0.0); 435 | auto b3 = 2.5; 436 | auto bc_func = qarray<2>(b1, {5, 5}) + qarray<1>(b2, {5}) * b3; 437 | qprint(bc_func); 438 | 439 | auto q1 = qarray<2>(b1, {5, 5}); 440 | 441 | qprint(cumsum(q1, 0)); 442 | qprint(cumsum(q1, 1)); 443 | 444 | // std::cout << sum(bc_func) << std::endl; 445 | 446 | auto d1 = std::vector(1000 * 1000, 0.1); 447 | auto d2 = std::vector(1000, 0.232); 448 | auto dres = std::vector(1000 * 1000, 0); 449 | double c = 1.0; 450 | 451 | simple_array_view, 2> a(d1, {1000, 1000}); 452 | simple_array_view, 1> b(d2, {1000}); 453 | simple_array_view, 2> res(dres, {1000, 1000}); 454 | 455 | auto sqfunc = a + b - sin(c); 456 | 457 | std::size_t min_time = 100000000; 458 | 459 | qprint(cumsum(q1, 1)); 460 | std::cout << "\n\nBenchmarking cumsum(a[5x5], 1)\n===============================\n"; 461 | 462 | for (int i = 0; i < 10'000; ++i) 463 | { 464 | Timer timer; 465 | res = cumsum(q1, 1); 466 | auto elapsed = timer.elapsed(); // nanoseconds 467 | if (elapsed < min_time) min_time = elapsed; 468 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 469 | } 470 | 471 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 472 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 473 | 474 | min_time = 100000000; 475 | 476 | std::cout << "\n\nBenchmarking sum_axis(a[5x5], {0})\n===============================\n"; 477 | 478 | for (int i = 0; i < 10'000; ++i) 479 | { 480 | Timer timer; 481 | auto s_res = sum_axis(q1, {0}); 482 | auto elapsed = timer.elapsed(); // nanoseconds 483 | if (elapsed < min_time) min_time = elapsed; 484 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 485 | } 486 | 487 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 488 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 489 | 490 | min_time = 100000000; 491 | std::cout << "\n\nBenchmarking a[1000x1000] + b[1000] - sin(c[])\n===================================\n"; 492 | for (int i = 0; i < 200; ++i) 493 | { 494 | Timer timer; 495 | res = sqfunc; 496 | auto elapsed = timer.elapsed(); // nanoseconds 497 | if (elapsed < min_time) min_time = elapsed; 498 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 499 | } 500 | 501 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 502 | std::cout << " = " << (double) min_time / (double) 1000 << " μs" << std::endl; 503 | 504 | 505 | constexpr std::ptrdiff_t psz = 1'000'000; 506 | 507 | simple_array_view, 1> px(std::vector(psz, Point3(std::array{0.5, 2.1, 3.2}, {3})), {psz}); 508 | simple_array_view, 1> py(std::vector(psz, Point3(std::array{0.5, 2.1, 3.2}, {3})), {psz}); 
509 | simple_array_view, 1> pres(std::vector(psz), {psz}); 510 | 511 | auto pointfunc = make_qfunc(super_custom_func, px, py); 512 | 513 | pres = make_qfunc(super_custom_func, px, py); 514 | auto bc_super_custom_func = make_qfunc(super_custom_func, px, py); 515 | 516 | min_time = 100000000; 517 | 518 | std::cout << "\n\nBenchmarking super_custom_func\n===============================\n"; 519 | 520 | for (int i = 0; i < 200; ++i) 521 | { 522 | Timer timer; 523 | pres = bc_super_custom_func; 524 | auto elapsed = timer.elapsed(); // nanoseconds 525 | if (elapsed < min_time) min_time = elapsed; 526 | // std::cout << "TIME: " << (double) elapsed / (double) 1000 << " μs" << std::endl; 527 | } 528 | 529 | std::cout << "\nMIN TIME: " << min_time << " ns\n"; 530 | std::cout << " = " << (double) min_time / (double) 1000 << " μs\n" << std::endl; 531 | } --------------------------------------------------------------------------------