├── .gitignore
├── README.md
├── build
├── config.nims
├── cuda.nim
├── demo1
    ├── compile
    ├── config.nims
    ├── cpugpuarray.nim
    ├── demo1.rst
    ├── ex1.nim
    ├── ex2.nim
    ├── gpuarray.nim
    └── linalg.nim
├── demo2
    ├── .gitignore
    ├── coalesced.nim
    ├── compile
    ├── config.nims
    ├── cpugpuarray.nim
    ├── demo1.rst
    ├── ex1.nim
    ├── ex2.nim
    ├── gpuarray.nim
    └── linalg.nim
├── demo3
    ├── .gitignore
    ├── bench
    ├── bench_cpu
    ├── ccwrapper
    ├── coalesced.nim
    ├── compile
    ├── compile_cpu
    ├── config.nims
    ├── cpugpuarray.nim
    ├── doc
    │   ├── PP-Nim-metaprogramming-DOE-COE-PP-2017.pdf
    │   ├── bandwidth_knl.gp
    │   ├── bandwidth_knl.pdf
    │   ├── bandwidth_knl.tex
    │   ├── bandwidth_p100.gp
    │   ├── bandwidth_p100.pdf
    │   ├── bandwidth_p100.tex
    │   ├── readme.html
    │   └── readme.org
    ├── ex1.nim
    ├── ex2.nim
    ├── gpuarray.nim
    ├── linalg.nim
    ├── llbits.h
    ├── out
    │   ├── bludhaven.ex2
    │   ├── bludhaven.info
    │   ├── kingly.ex2
    │   ├── kingly.info
    │   ├── neddy.ftm.alcf.anl.gov.ex2
    │   └── neddy.ftm.alcf.anl.gov.info
    ├── qexLite
    │   ├── alignedMem.nim
    │   ├── comms
    │   │   ├── comms.nim
    │   │   ├── commsQmp.nim
    │   │   └── qmp.nim
    │   ├── config.nims
    │   ├── layout.nim
    │   ├── layout
    │   │   ├── layoutX.nim
    │   │   ├── qgather.c
    │   │   ├── qlayout.c
    │   │   ├── qlayout.h
    │   │   ├── qshifts.c
    │   │   └── shifts.nim
    │   ├── metaUtils.nim
    │   ├── omp.nim
    │   ├── simd.nim
    │   ├── simd
    │   │   ├── simdArray.nim
    │   │   ├── simdAvx.cnim
    │   │   ├── simdAvx.nim
    │   │   ├── simdAvx512.cnim
    │   │   ├── simdAvx512.nim
    │   │   ├── simdQpx.nim
    │   │   ├── simdSse.cnim
    │   │   ├── simdSse.nim
    │   │   ├── simdX86.nim
    │   │   ├── simdX86Ops.nim
    │   │   ├── simdX86Ops1.nim
    │   │   └── simdX86Types.nim
    │   ├── stdUtils.nim
    │   └── threading.nim
    ├── timing.nim
    └── vectorized.nim
├── expr.nim
├── genkernel.nim
├── inline.nim
├── opts.c2nim
├── runc2nim
└── test
    ├── config.nims
    ├── test
    ├── tinline000.nim
    ├── tinline001.nim
    ├── tinline002.nim
    ├── tinline003.nim
    ├── tinline004.nim
    ├── tinline005.nim
    ├── tinline006.nim
    ├── tinline007.nim
    ├── tinline008.nim
    ├── tinline009.nim
    ├── tinline010.nim
    ├── tinline011.nim
    ├── tinline012.nim
    ├── tinline013.nim
    └── tinline014.nim


/.gitignore:
--------------------------------------------------------------------------------
1 | /cuda
2 | /expr
3 | /genkernel
4 | /inline
5 | nimcache/
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cudanim
2 | CUDA for Nim
3 | 
4 | initial proof of concept
5 | 


--------------------------------------------------------------------------------
/build:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | #nvcc="/usr/local/cuda/bin/nvcc"
 4 | #$nvcc vectorAdd.cu
 5 | 
 6 | #nim cpp -c -d:release cuda
 7 | 
 8 | #/usr/local/cuda/bin/nvcc -I$HOME/tmp/nim/Nim/lib nimcache/cuda.cpp
 9 | 
10 | #nim cpp -d:release cuda
11 | 
12 | #nim cpp -d:release vectorAdd
13 | nim cpp -d:release genkernel
14 | 


--------------------------------------------------------------------------------
/config.nims:
--------------------------------------------------------------------------------
1 | switch("cc", "gcc")
2 | switch("gcc.cpp.exe", "/usr/local/cuda/bin/nvcc")
3 | switch("gcc.cpp.linkerexe", "/usr/local/cuda/bin/nvcc")
4 | #switch("gcc.cpp.options.always", "--x cu -ccbin=gcc-5")
5 | switch("gcc.cpp.options.always", "--x cu")
6 | switch("gcc.cpp.options.speed", "-O3")
7 | 


--------------------------------------------------------------------------------
/cuda.nim:
--------------------------------------------------------------------------------
  1 | import macros
  2 | import inline
  3 | import expr
  4 | 
  5 | #macro dumpType(x:typed): auto =
  6 | #  result = newEmptyNode()
  7 | #  echo x.getType.treerepr
  8 | proc addChildrenFrom*(dst,src: NimNode): NimNode =
  9 |   for c in src: dst.add(c)
 10 |   result = dst
 11 | macro procInst*(p: typed): auto =
 12 |   #echo "begin procInst:"
 13 |   #echo p.treerepr
 14 |   result = p[0]
 15 | macro makeCall*(p: proc, x: tuple): NimNode =
 16 |   result = newCall(p).addChildrenFrom(x)
 17 | 
 18 | type
 19 |   CudaDim3* {.importc:"dim3",header:"cuda_runtime.h".} = object
 20 |     x*, y*, z*: cint
 21 |   cudaError_t* {.importc,header:"cuda_runtime.h".} = object
 22 |   cudaMemcpyKind* {.importc,header:"cuda_runtime.h".} = object
 23 | var
 24 |   cudaSuccess*{.importC,header:"cuda_runtime.h".}: cudaError_t
 25 |   cudaErrorNotSupported*{.importC,header:"cuda_runtime.h".}: cudaError_t
 26 |   cudaMemcpyHostToDevice*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind
 27 |   cudaMemcpyDeviceToHost*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind
 28 | 
 29 | #template toPointer*(x: pointer): pointer = x
 30 | #template toPointer*[T](x: ptr T): pointer = pointer(x)
 31 | #template toPointer*(x: seq): pointer = toPointer(x[0])
 32 | #template toPointer*(x: not (pointer|seq)): pointer = pointer(unsafeAddr(x))
 33 | template toPointer*(x: typed): pointer =
 34 |   #dumpType: x
 35 |   when x is pointer: x
 36 |   elif x is ptr: x
 37 |   elif x is seq: toPointer(x[0])
 38 |   else: pointer(unsafeAddr(x))
 39 | template dataAddr*(x: typed): pointer =
 40 |   #dumpType: x
 41 |   when x is seq: dataAddr(x[0])
 42 |   elif x is array: dataAddr(x[0])
 43 |   #elif x is ptr: x
 44 |   else: pointer(unsafeAddr(x))
 45 |   #else: x
 46 | 
 47 | proc cudaGetLastError*(): cudaError_t
 48 |   {.importC,header:"cuda_runtime.h".}
 49 | proc cudaGetErrorStringX*(error: cudaError_t): ptr char
 50 |   {.importC:"cudaGetErrorString",header:"cuda_runtime.h".}
 51 | proc cudaGetErrorString*(error: cudaError_t): cstring =
 52 |   var s {.codegendecl:"const $# $#".} = cudaGetErrorStringX(error)
 53 |   result = s
 54 | proc `$`*(error: cudaError_t): string =
 55 |   let s = cudaGetErrorString(error)
 56 |   result = $s
 57 | converter toBool*(e: cudaError_t): bool =
 58 |   cast[cint](e) != cast[cint](cudaSuccess)
 59 | 
 60 | proc cudaMalloc*(p:ptr pointer, size: csize): cudaError_t
 61 |   {.importC,header:"cuda_runtime.h".}
 62 | template cudaMalloc*(p:pointer, size: csize): cudaError_t =
 63 |   cudaMalloc((ptr pointer)(p.addr), size)
 64 | proc cudaFree*(p: pointer): cudaError_t
 65 |   {.importC,header:"cuda_runtime.h".}
 66 | proc cudaMallocManaged*(p: ptr pointer, size: csize): cudaError_t
 67 |   {.importC,header:"cuda_runtime.h".}
 68 | 
 69 | proc cudaMemcpyX*(dst,src: pointer, count: csize, kind: cudaMemcpyKind):
 70 |   cudaError_t {.importC:"cudaMemcpy",header:"cuda_runtime.h".}
 71 | template cudaMemcpy*(dst,src: typed, count: csize,
 72 |                      kind: cudaMemcpyKind): cudaError_t =
 73 |   let pdst = toPointer(dst)
 74 |   let psrc = toPointer(src)
 75 |   cudaMemcpyX(pdst, psrc, count, kind)
 76 | 
 77 | proc cudaLaunchKernel(p:pointer, gd,bd: CudaDim3, args: ptr pointer):
 78 |   cudaError_t {.importC,header:"cuda_runtime.h".}
 79 | 
 80 | proc cudaDeviceReset*(): cudaError_t
 81 |   {.importC,header:"cuda_runtime.h".}
 82 | proc cudaDeviceSynchronize*(): cudaError_t
 83 |   {.importC,header:"cuda_runtime.h".}
 84 | 
 85 | #proc printf*(fmt:cstring):cint {.importc,varargs,header:"<stdio.h>",discardable.}
 86 | #proc fprintf*(stream:ptr FILE,fmt:cstring):cint {.importc,varargs,header:"<stdio.h>".}
 87 | #proc malloc*(size: csize):pointer {.importc,header:"<stdlib.h>".}
 88 | 
 89 | template cudaDefs(body: untyped): untyped {.dirty.} =
 90 |   var gridDim{.global,importC,noDecl.}: CudaDim3
 91 |   var blockIdx{.global,importC,noDecl.}: CudaDim3
 92 |   var blockDim{.global,importC,noDecl.}: CudaDim3
 93 |   var threadIdx{.global,importC,noDecl.}: CudaDim3
 94 |   template getGridDim: untyped {.used.} = gridDim
 95 |   template getBlockIdx: untyped {.used.} = blockIdx
 96 |   template getBlockDim: untyped {.used.} = blockDim
 97 |   template getThreadIdx: untyped {.used.} = threadIdx
 98 |   template getThreadNum: untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x
 99 |   template getNumThreads: untyped {.used.} = gridDim.x * blockDim.x
100 |   bind inlineProcs
101 |   inlineProcs:
102 |     body
103 | 
104 | template cudaLaunch*(p: proc; blocksPerGrid,threadsPerBlock: SomeInteger;
105 |                      arg: varargs[pointer,dataAddr]) =
106 |   var pp: proc = p
107 |   var gridDim, blockDim: CudaDim3
108 |   gridDim.x = blocksPerGrid
109 |   gridDim.y = 1
110 |   gridDim.z = 1
111 |   blockDim.x = threadsPerBlock
112 |   blockDim.y = 1
113 |   blockDim.z = 1
114 |   var args: array[arg.len, pointer]
115 |   for i in 0..<arg.len: args[i] = arg[i]
116 |   #echo "really launching kernel"
117 |   let err = cudaLaunchKernel(pp, gridDim, blockDim, addr args[0])
118 |   if err:
119 |     echo err
120 |     quit cast[cint](err)
121 | 
122 | #macro `<<`*(x:varargs[untyped]): auto =
123 | #  result = newEmptyNode()
124 | #  echo x.treerepr
125 | template `<<`*(p: proc, x: tuple): untyped = (p,x)
126 | template getInst*(p: untyped): untyped =
127 |   #when compiles((var t=p; t)): p
128 |   #else:
129 |   procInst(p)
130 |     #var t =
131 |     #t
132 | macro `>>`*(px: tuple, y: any): auto =
133 |   #echo "begin >>:"
134 |   #echo px.treerepr
135 |   #echo "kernel type:"
136 |   #echo px[0].getTypeImpl.treerepr
137 |   #echo "kernel args:"
138 |   #echo y.treerepr
139 |   #var a = y
140 |   #if y.kind != nnkPar: a = newNimNode(nnkPar).addChildrenFrom(y)
141 |   result = newCall(ident("cudaLaunch"))
142 |   let krnl = newCall(px[0]).addChildrenFrom(y)
143 |   #echo "kernel inst call:"
144 |   #echo krnl.treerepr
145 |   result.add getAst(getInst(krnl))[0]
146 |   result.add px[1][0]
147 |   result.add px[1][1]
148 |   for c in y:
149 |     result.add c
150 |   #echo "kernel launch body:"
151 |   #echo result.treerepr
152 | 
153 | macro cuda*(s,p: untyped): auto =
154 |   #echo "begin cuda:"
155 |   #echo s.treerepr
156 |   let ss = s.strVal
157 |   #echo "proc:"
158 |   #echo p.treerepr
159 |   p.expectKind nnkProcDef
160 |   result = p
161 |   # if p.kind == nnkProcDef:
162 |   #   result = p
163 |   # else:
164 |   #   result = p[0]
165 |   result.addPragma parseExpr("{.codegenDecl:\""&ss&" $# $#$#\".}")[0]
166 |   result.body = getAst(cudaDefs(result.body))
167 |   var sl = newStmtList()
168 |   sl.add( quote do:
169 |     {.push checks: off.}
170 |     {.push stacktrace: off.} )
171 |   sl.add result
172 |   result = sl
173 |   #echo "end cuda:"
174 |   #echo result.treerepr
175 | template cudaGlobal*(p: untyped): auto = cuda("__global__",p)
176 | 
177 | template onGpu*(nn,tpb: untyped, body: untyped): untyped =  # run `body` as a CUDA kernel: at least `nn` threads, `tpb` per block
178 |   block:  # fresh scope so each expansion declares its own `kern`
179 |     var v = packVars(body, getGpuPtr)
180 |     type ByCopy {.bycopy.} [T] = object
181 |       d: T
182 |     proc kern(xx: ByCopy[type(v)]) {.cudaGlobal.} =
183 |       template deref(k: int): untyped = xx.d[k]
184 |       substVars(body, deref)
185 |     let ni = nn.int32
186 |     let threadsPerBlock = tpb.int32
187 |     let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock  # ceil(ni / threadsPerBlock)
188 |     cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
189 |     let syncErr = cudaDeviceSynchronize()
190 |     if syncErr: echo syncErr  # report a failed synchronize instead of silently discarding its status
191 | template onGpu*(nn: untyped, body: untyped): untyped = onGpu(nn, 64, body)
192 | template onGpu*(body: untyped): untyped = onGpu(512*64, 64, body)
193 | 
194 | when isMainModule:
195 |   type FltArr = array[0,float32]
196 |   proc vectorAdd*(A: FltArr; B: FltArr; C: var FltArr; n: int32)
197 |     {.cudaGlobal.} =
198 |     var i = blockDim.x * blockIdx.x + threadIdx.x
199 |     if i < n:
200 |       C[i] = A[i] + B[i]
201 | 
202 |   proc test =
203 |     var n = 50000.cint
204 |     var
205 |       a = newSeq[float32](n)
206 |       b = newSeq[float32](n)
207 |       c = newSeq[float32](n)
208 |     var threadsPerBlock: cint = 256
209 |     var blocksPerGrid: cint = (n + threadsPerBlock - 1) div threadsPerBlock
210 | 
211 |     cudaLaunch(vectorAdd, blocksPerGrid, threadsPerBlock, a, b, c, n)
212 | 
213 |     template getGpuPtr(x: int): untyped = x
214 |     template getGpuPtr[T](x: seq[T]): untyped = addr(x[0])
215 |     template `[]`(x: ptr SomeNumber, i: SomeInteger): untyped {.used.} =
216 |       cast[ptr array[0,type(x[])]](x)[][i]
217 |     template `[]=`(x: ptr SomeNumber, i: SomeInteger, y:untyped): untyped {.used.} =
218 |       cast[ptr array[0,type(x[])]](x)[][i] = y
219 |     onGpu(n):
220 |       let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
221 |       if i < n:
222 |         c[i] = a[i] + b[i]
223 | 
224 |   test()
225 | 


--------------------------------------------------------------------------------
/demo1/compile:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Compile one demo source with `nim cpp -d:release`; defaults to ex1.nim when no argument is given.
3 | f="$1"
4 | if [ -z "$f" ]; then
5 |   f="ex1.nim"
6 | fi
7 | # Quote the path so names containing spaces or glob characters reach nim as a single argument.
8 | nim cpp -d:release "$f"
9 | 


--------------------------------------------------------------------------------
/demo1/config.nims:
--------------------------------------------------------------------------------
 1 | switch("cc", "gcc")
 2 | switch("gcc.cpp.exe", "/usr/local/cuda/bin/nvcc")
 3 | switch("gcc.cpp.linkerexe", "/usr/local/cuda/bin/nvcc")
 4 | switch("gcc.cpp.options.always", "--x cu")
 5 | switch("gcc.cpp.options.speed", "-O3 -Xcompiler -march=native,-fPIC")
 6 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -mcpu=native,-mtune=native,-fPIC")
 7 | # switch("gcc.cpp.options.always", "--x cu -ccbin=g++-4.9")
 8 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -march=haswell,-fPIC")
 9 | 
10 | #switch("gcc.cpp.options.speed", "-O3 -march=haswell")
11 | 


--------------------------------------------------------------------------------
/demo1/cpugpuarray.nim:
--------------------------------------------------------------------------------
  1 | import gpuarray
  2 | export gpuarray
  3 | import macros
  4 | import ../cuda
  5 | export cuda
  6 | import ../expr
  7 | import linalg
  8 | export linalg
  9 | include system/ansi_c
 10 | 
 11 | #template onGpu*(x: untyped): untyped = x
 12 | #template onGpu*(a,b,x: untyped): untyped = x
 13 | 
 14 | type
 15 |   ArrayObj*[T] = object
 16 |     p*: ptr array[0,T]
 17 |     n*: int
 18 |     g*: GpuArrayObj[T]
 19 |     lastOnGpu*: bool
 20 |     unifiedMem*: bool
 21 |   ArrayRef*[T] = ref ArrayObj[T]
 22 |   Array*[T] = ArrayRef[T]
 23 |   Arrays* = ArrayObj | ArrayRef
 24 |   Arrays2* = ArrayObj | ArrayRef
 25 |   Arrays3* = ArrayObj | ArrayRef
 26 | 
 27 | proc init[T](r: var ArrayObj[T], n: int) =
 28 |   var p: ptr T
 29 |   r.unifiedMem = true
 30 |   if r.unifiedMem:
 31 |     let err = cudaMallocManaged(cast[ptr pointer](addr p), n*sizeof(T))
 32 |   else:
 33 |     p = createSharedU(T, n)
 34 |   r.n = n
 35 |   r.p = cast[type(r.p)](p)
 36 | proc init[T](r: var ArrayRef[T], n: int) =
 37 |   r.new
 38 |   r[].init(n)
 39 | 
 40 | proc newArrayObj[T](r: var ArrayObj[T], n: int) =
 41 |   r.init(n)
 42 | proc newArrayObj[T](n: int): ArrayObj[T] =
 43 |   result.init(n)
 44 | 
 45 | proc newArrayRef*[T](r: var ArrayRef[T], n: int) =
 46 |   r.init(n)
 47 | proc newArrayRef*[T](n: int): ArrayRef[T] =
 48 |   result.init(n)
 49 | 
 50 | proc toGpu*(x: var Arrays) =
 51 |   if x.unifiedMem:
 52 |     if x.g.n==0:
 53 |       x.g.n = x.n
 54 |       x.g.p = cast[type(x.g.p)](x.p)
 55 |   else:
 56 |     if not x.lastOnGpu:
 57 |       x.lastOnGpu = true
 58 |       if x.g.n==0: x.g.init(x.n)
 59 |       let err = cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice)
 60 |       if err: echo err
 61 | 
 62 | proc toCpu*(x: var Arrays) =
 63 |   if not x.unifiedMem:
 64 |     if x.lastOnGpu:
 65 |       x.lastOnGpu = false
 66 |       let err = cudaMemcpy(x.p, x.g.p, x.n*sizeof(x.T), cudaMemcpyDeviceToHost)
 67 |       if err: echo err
 68 | 
 69 | template getGpuPtr*(x: var Arrays): untyped =
 70 |   toGpu(x)
 71 |   x.g
 72 | 
 73 | template indexArray*(x: Arrays, i: SomeInteger): untyped =
 74 |   x.p[][i]
 75 | #template `[]=`(x: ArrayObj, i: SomeInteger, y: untyped): untyped =
 76 | #  x.p[][i] = y
 77 | 
 78 | macro indexArray*(x: Arrays{call}, y: SomeInteger): untyped =
 79 |   #echo "call[", y.repr, "]"
 80 |   #echo x.treerepr
 81 |   #if siteLocalsField.contains($x[0]):
 82 |   result = newCall(ident($x[0]))
 83 |   for i in 1..<x.len:
 84 |     let xi = x[i]
 85 |     result.add( quote do:
 86 |       indexArray(`xi`,`y`) )
 87 |   #else:
 88 |   #  result = quote do:
 89 |   #    let tt = `x`
 90 |   #    tt.d[`y`]
 91 |   #echo result.treerepr
 92 |   #echo result.repr
 93 | 
 94 | template `[]`*(x: ArrayObj, i: SomeInteger): untyped = indexArray(x, i)
 95 | #template `[]=`(x: ArrayObj, i: SomeInteger, y: untyped): untyped =
 96 | #  x.p[][i] = y
 97 | 
 98 | template `[]`*(x: ArrayRef, i: SomeInteger): untyped = indexArray(x, i)
 99 | #template `[]=`(x: ArrayRef, i: SomeInteger, y: untyped): untyped =
100 | #  x.p[][i] = y
101 | 
102 | var threadNum* = 0
103 | var numThreads* = 1
104 | template getThreadNum*: untyped = threadNum
105 | template getNumThreads*: untyped = numThreads
106 | template `:=`*(x: Arrays, y: Arrays2) =
107 |   ## Elementwise `x[i] := y[i]`: each thread starts at its own index and strides by the thread count.
108 |   packVarsStmt((x,y), toCpu)
109 |   mixin getThreadNum, getNumThreads  # look these up where the template is expanded, as the `+=` templates do
110 |   let tid = getThreadNum()
111 |   let nid = getNumThreads()
112 |   var i = tid
113 |   while i<x.n:
114 |     x[i] := y[i]
115 |     i += nid
116 | 
117 | template `:=`*(x: Arrays, y: SomeNumber) =
118 |   ## Assign the scalar `y` to every `x[i]`: each thread starts at its own index and strides by the thread count.
119 |   packVarsStmt(x, toCpu)
120 |   mixin getThreadNum, getNumThreads  # look these up where the template is expanded, as the `+=` templates do
121 |   let tid = getThreadNum()
122 |   let nid = getNumThreads()
123 |   var i = tid
124 |   while i<x.n:
125 |     x[i] := y
126 |     i += nid
127 | 
128 | template `+=`*(x: Arrays, y: SomeNumber) =
129 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
130 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
131 |   packVarsStmt((x,y), toCpu)
132 |   mixin getThreadNum, getNumThreads
133 |   let tid = getThreadNum()
134 |   let nid = getNumThreads()
135 |   var i = tid
136 |   while i<x.n:
137 |     x[i] += y
138 |     i += nid
139 | 
140 | template `+=`*(x: Arrays, y: Arrays2) =
141 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
142 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
143 |   packVarsStmt((x,y), toCpu)
144 |   mixin getThreadNum, getNumThreads
145 |   let tid = getThreadNum()
146 |   let nid = getNumThreads()
147 |   var i = tid
148 |   while i<x.n:
149 |     x[i] += y[i]
150 |     i += nid
151 | 
152 | proc `+`*(x: Arrays, y: Arrays2): auto =
153 |   when x is ArrayObj:
154 |     var r: ArrayObj[type(x[0]+y[0])]
155 |   else:
156 |     var r: ArrayRef[type(x[0]+y[0])]
157 |   echo "+\n"
158 |   r
159 | proc `*`*(x: Arrays, y: Arrays2): auto =
160 |   when x is ArrayObj:
161 |     var r: ArrayObj[type(x[0]*y[0])]
162 |   else:
163 |     var r: ArrayRef[type(x[0]*y[0])]
164 |   echo "*\n"
165 |   r
166 | 
167 | template newColorMatrixArray*(n: int): untyped =
168 |   newArrayRef[Colmat[float32]](n)
169 | template newComplexArray*(n: int): untyped =
170 |   newArrayRef[Complex[float32]](n)
171 | template newFloatArray*(n: int): untyped =
172 |   newArrayRef[float32](n)
173 | 
174 | proc printf*(frmt: cstring): cint {.
175 |   importc: "printf", header: "<stdio.h>", varargs, discardable.}
176 | 
177 | when isMainModule:
178 |   var N = 100
179 | 
180 |   proc testfloat =
181 |     var x = newArrayRef[float32](N)
182 |     var y = newArrayRef[float32](N)
183 |     var z = newArrayRef[float32](N)
184 |     x := 1
185 |     y := 2
186 |     z := 3
187 |     x += y * z
188 |     if (x.n-1) mod getNumThreads() == getThreadNum():
189 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
190 |       cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
191 |     onGpu(1,32):
192 |       x += y * z
193 |       if (x.n-1) mod getNumThreads() == getThreadNum():
194 |         cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
195 |         cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
196 |     x.toCpu
197 |     if (x.n-1) mod getNumThreads() == getThreadNum():
198 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
199 |       cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
200 |   testfloat()
201 | 
202 |   proc testcomplex =
203 |     var x = newArrayRef[Complex[float32]](N)
204 |     var y = newArrayRef[Complex[float32]](N)
205 |     var z = newArrayRef[Complex[float32]](N)
206 |     x := 1
207 |     y := 2
208 |     z := 3
209 |     x += y * z
210 |     if (x.n-1) mod getNumThreads() == getThreadNum():
211 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
212 |       cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re)
213 | 
214 |     onGpu:
215 |       x += y * z
216 |       x += 1
217 | 
218 |     x += y * z
219 |     if (x.n-1) mod getNumThreads() == getThreadNum():
220 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
221 |       cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re)
222 |   testcomplex()
223 | 
224 |   proc testcolmat =
225 |     var x = newArrayRef[Colmat[float32]](N)
226 |     var y = newArrayRef[Colmat[float32]](N)
227 |     var z = newArrayRef[Colmat[float32]](N)
228 |     x := 1
229 |     y := 2
230 |     z := 3
231 |     x += y * z
232 |     if (x.n-1) mod getNumThreads() == getThreadNum():
233 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
234 |       cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re)
235 | 
236 |     onGpu(N):
237 |       x += y * z
238 | 
239 |     x += y * z
240 |     if (x.n-1) mod getNumThreads() == getThreadNum():
241 |       cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
242 |       cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re)
243 |   testcolmat()
244 | 


--------------------------------------------------------------------------------
/demo1/demo1.rst:
--------------------------------------------------------------------------------
  1 | ===========================
  2 | Portable expressions in Nim
  3 | ===========================
  4 | 
  5 | :Author: James C. Osborn
  6 | 
  7 | .. contents::
  8 | 
  9 | Preliminaries
 10 | =============
 11 | 
 12 | This document was created with Nim's built-in documentation generator.
 13 | It can parse documentation comments in the source code and also process
 14 | separate reStructuredText_ files.
 15 | This document was made from a reStructuredText file using Nim's
 16 | document generator to try it out and also take advantage of its Nim
 17 | code highlighter.
 18 | 
 19 | .. _reStructuredText: https://en.wikipedia.org/wiki/ReStructuredText
 20 | 
 21 | Code portability in Nim
 22 | =======================
 23 | 
 24 | Here's an example of the result (so far)
 25 | 
 26 | .. code-block:: Nim
 27 | 
 28 |   import cpugpuarray
 29 | 
 30 |   let N = 1000
 31 |   var x = newColorMatrixArray(N)
 32 |   var y = newColorMatrixArray(N)
 33 |   var z = newColorMatrixArray(N)
 34 | 
 35 |   # set them to diagonal matrices on CPU
 36 |   x := 1
 37 |   y := 2
 38 |   z := 3
 39 | 
 40 |   # do something on CPU
 41 |   x += y * z
 42 | 
 43 |   # do something on GPU
 44 |   onGpu:
 45 |     x += y * z
 46 |     z := 4
 47 | 
 48 |   # do something on CPU again
 49 |   x += y * z
 50 | 
 51 |   if x[0][0,0].re == 21.0:
 52 |     echo "yay, it worked!"
 53 |     echo "do you agree, GPU?"
 54 | 
 55 |   onGpu:
 56 |     if getThreadNum()==0:
 57 |       if x[0][0,0].re == 21.0:
 58 |         printf("yes, I agree!\n")
 59 | 
 60 |   # outputs:
 61 |   #   yay, it worked!
 62 |   #   do you agree, GPU?
 63 |   #   yes, I agree!
 64 | 
 65 | 
 66 | The above can be compiled and run with
 67 | 
 68 | ::
 69 | 
 70 |   nim cpp -d:release -r ex1.nim
 71 | 
 72 | 
 73 | This is basically the main interface that the average user would need to
 74 | deal with, the rest is just details for the curious.
 75 | 
 76 | 
 77 | Implementation details
 78 | ======================
 79 | 
 80 | The main container object in the example above is an array that can live
 81 | on the CPU and also the GPU.  This is defined as
 82 | 
 83 | .. code-block:: Nim
 84 | 
 85 |   type
 86 |     ArrayObj*[T] = object
 87 |       p*: ptr array[0,T]
 88 |       n*: int
 89 |       g*: GpuArrayObj[T]
 90 |       lastOnGpu*: bool
 91 | 
 92 |     GpuArrayObj*[T] = object
 93 |       p*: ptr array[0,T]
 94 |       n*: int
 95 | 
 96 | ``ArrayObj[T]`` is a generic array-like object parameterized on the type ``T``.
 97 | This is similar to a templated type declaration in C++ with ``T`` being the template parameter (Nim uses ``[T]`` instead of ``<T>`` for generics).
 98 | The ``*`` (star) after all the type and field names above means that they are exported from this module (file).
 99 | They will be visible to another module that ``import``'s this module (otherwise they would be private to this module).
100 | 
101 | The ``ArrayObj`` contains four fields:
102 | 
103 | - ``p``: which is a pointer (``ptr``) for the data on the host. \
104 | This is implemented as a pointer to an array of length ``0`` \
105 | with elements of type ``T`` for convenience. \
106 | This should really be marked with an ``{.unchecked.}`` pragma to prevent \
107 | bounds checking in debug mode (bounds checks are off by default in release mode).
108 | - ``n``: the number of elements in the array.
109 | - ``g``: a GPU array object, defined next.
110 | - ``lastOnGpu``: a Boolean that tells us which pointer is valid.
111 | 
112 | The ``GpuArrayObj`` is similar to ``ArrayObj``, but just contains a pointer \
113 | (which will hold a GPU pointer) and the number of elements.
114 | This is the object we will pass to the GPU, so it contains a copy of the \
115 | length for convenience.
116 | 
117 | 
118 | Offloading
119 | ==========
120 | 
121 | The offload magic happens in the ``onGpu:`` block.
122 | It is defined like
123 | 
124 | .. code-block:: Nim
125 | 
126 |   # the default total threads (nn=32*256) and threads per block (tpb=256)
127 |   # are just for testing, they really should be an educated
128 |   # guess made from querying the device
129 |   template onGpu*(body: untyped): untyped = onGpu(32*256, 256, body)
130 | 
131 | This launches a CUDA kernel using the default number of threads and threads \
132 | per block.  Right now they are hard-coded, but should really come from \
133 | querying the device (or let the user specify some global default).
134 | 
135 | One can override the defaults for a call by explicitly specifying them
136 | 
137 | .. code-block:: Nim
138 | 
139 |   onGpu(x.n, 128):
140 |     x += y * z
141 |     z := 4
142 | 
143 | This would launch one (virtual) thread per element of the array ``x`` and use
144 | 128 threads per block.
145 | 
146 | The CUDA kernel gets created here
147 | 
148 | .. code-block:: Nim
149 | 
150 |   template onGpu*(nn,tpb: untyped, body: untyped): untyped =
151 |     block:
152 |       var v = packVars(body, getGpuPtr)
153 |       type myt {.bycopy.} = object
154 |         d: type(v)
155 |       proc kern(xx: myt) {.cudaGlobal.} =
156 |         template deref(k: int): untyped = xx.d[k]
157 |         substVars(body, deref)
158 |       let ni = nn.int32
159 |       let threadsPerBlock = tpb.int32
160 |       let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
161 |       cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
162 |       discard cudaDeviceSynchronize()
163 | 
164 | This starts a new block scope (``block:``), similar to ``{...}`` in C.
165 | This is done to isolate the defined kernel (``proc kern ...``) from other \
166 | ``onGpu`` blocks.
167 | 
168 | The first major task is to examine the body of the ``onGpu`` block and \
169 | extract the variables that are used.
170 | This is done by the ``packVars`` macro.
171 | It walks the syntax tree of the code block passed in and keeps track of \
172 | the (unique) variables it references.
173 | It then spits out a data structure (a tuple_) containing those variables.
174 | It wraps each variable in a call to the function name that was passed in \
175 | (in this case ``getGpuPtr``).
176 | For the example above, this line would get expanded to
177 | 
178 | .. _tuple: https://nim-lang.org/docs/manual.html#types-tuples-and-object-types
179 | 
180 | .. code-block:: Nim
181 | 
182 |   var v = (getGpuPtr(x), getGpuPtr(y), getGpuPtr(z))
183 | 
184 | The function ``getGpuPtr`` can then be defined independently for each type \
185 | to return a valid GPU object (it actually doesn't have to be a pointer as we'll see next).
186 | For the ``ArrayObj`` type it is defined as
187 | 
188 | .. code-block:: Nim
189 | 
190 |   template getGpuPtr*(x: var ArrayObj): untyped =
191 |     toGpu(x)
192 |     x.g
193 | 
194 | This copies the data to the GPU (if necessary) and then returns the \
195 | ``GpuArrayObj`` containing the GPU pointer and the length of the array.
196 | This is a (small) object residing in CPU memory, and the CUDA library \
197 | takes care of copying it to the GPU when passed as an argument.
198 | 
199 | Copying the data to the GPU is handled by
200 | 
201 | .. code-block:: Nim
202 | 
203 |   proc toGpu*(x: var ArrayObj) =
204 |     if not x.lastOnGpu:
205 |       x.lastOnGpu = true
206 |       if x.g.n==0: x.g.init(x.n)
207 |       let err = cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice)
208 |       if err: echo err
209 | 
210 | Here we check if this array was last used on the GPU.
211 | If not, we check if it has been initialized yet (``x.g.n==0``) and \
212 | initialize it if not (which will call cudaMalloc).
213 | We then copy the CPU memory to GPU memory.
214 | Here we could also translate the layout if we wanted.
215 | 
216 | Currently I am not distinguishing between read access and write access.
217 | This could lead to further optimization.
218 | It should be possible to modify the existing methods to handle that too.
219 | 
220 | Next we create the CUDA kernel (``kern``).
221 | The kernel is defined here
222 | 
223 | .. code-block:: Nim
224 | 
225 |   proc kern(xx: myt) {.cudaGlobal.} =
226 |     template deref(k: int): untyped = xx.d[k]
227 |     substVars(body, deref)
228 | 
229 | This is a function taking one argument (which contains the packed \
230 | ``GpuArrayObj``'s or any other objects used by the expressions).
231 | I originally wrote the procedure definition as
232 | 
233 | .. code-block:: Nim
234 | 
235 |   proc kern(xx: type(v)) {.cudaGlobal.} =
236 |     template deref(k: int): untyped = xx[k]
237 |     substVars(body, deref)
238 | 
239 | but found that Nim decided in some cases to pass the argument of \
240 | ``kern`` (``xx``) as a pointer, instead of by value.
241 | Nim does this to optimize function calls when it feels it is safe to do so.
242 | To prevent this I wrapped the tuple in another object type (``myt``) that \
243 | is explicitly declared ``{.bycopy.}``, so that Nim will always pass it by \
244 | value (which makes a copy).
245 | 
246 | In retrospect, another approach may have been to mark the procedure as \
247 | ``{.exportC.}``, which will also prevent Nim from changing the calling \
248 | conventions.  I would then need to make the procedure names ``kern`` unique \
249 | on my own since Nim will also not perform name-mangling on ``{.exportC.}`` \
250 | procedures.
251 | 
252 | The main body of the kernel comes from the
253 | 
254 | .. code-block:: Nim
255 | 
256 |   substVars(body, deref)
257 | 
258 | macro.
259 | It works similarly to the ``packVars`` macro above, but this time it will \
260 | identify the variables referenced in the code block and substitute them \
261 | with a call to the provided function (``deref``) with an integer argument \
262 | that specifies which position in the kernel argument tuple that variable \
263 | is in.  For the example above this would generate
264 | 
265 | .. code-block:: Nim
266 | 
267 |   deref(0) += deref(1) * deref(2)
268 |   deref(2) := 4
269 | 
270 | The ``deref`` template then simply expands to the appropriate expression \
271 | that refers to the kernel argument.
272 | 
273 | The rest of the magic needed to transform this procedure into a valid CUDA \
274 | kernel is handled in the macro ``cudaGlobal`` which is applied to the \
275 | procedure as a pragma ``{.cudaGlobal.}``.
276 | It also performs function inlining, so that one can still call host functions \
277 | from the device (and not have to worry about marking them with ``__device__``).
278 | I won't go into the details here.
279 | 
280 | The main step left now is to launch the kernel
281 | 
282 | .. code-block:: Nim
283 | 
284 |   let ni = nn.int32
285 |   let threadsPerBlock = tpb.int32
286 |   let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
287 |   cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
288 | 
289 | This selects the blocksPerGrid and threadsPerBlock to be used in the CUDA \
290 | kernel, then launches the kernel ``kern`` with the argument tuple ``v``.
291 | 
292 | Lastly, we synchronize.
293 | 
294 | .. code-block:: Nim
295 | 
296 |   discard cudaDeviceSynchronize()
297 | 
298 | This returns an error code, which I really should be checking instead \
299 | of discarding.
300 | Nim requires you to explicitly discard a return value to be clear that you \
301 | meant to ignore it and didn't just forget.
302 | We may be able to delay this until we actually use the fields again.
303 | 
304 | 
305 | Back and forth
306 | ==============
307 | 
308 | To get the expression to evaluate correctly on the CPU again we \
309 | also check on every assignment made on the CPU that the fields are \
310 | updated there.  So in the expression
311 | 
312 | .. code-block:: Nim
313 | 
314 |   # do something on CPU again
315 |   x += y * z
316 | 
317 | the ``+=`` will do something like ``packVars``, but this time will generate \
318 | statements containing ``toCpu`` calls on the used variables.
319 | 
320 | To do
321 | =====
322 | 
323 | This is just a toy example.
324 | 
325 | The next step is to get the vectorization working properly on the GPU \
326 | arrays.
327 | The explicit copy allows us to use a different vectorization layout between \
328 | the CPU and GPU.
329 | 
330 | The examples here also need to be integrated with the existing ``threads:`` \
331 | block in QEX_.
332 | One possibility is simply
333 | 
334 | .. _QEX: https://github.com/jcosborn/qex
335 | 
336 | .. code-block:: Nim
337 | 
338 |   threads:
339 |     # do something on CPU
340 |     x += y * z
341 | 
342 |     # do something on GPU
343 |     onGpu:
344 |       x += y * z
345 |       z := 4
346 | 
347 |     # do something on CPU again
348 |     x += y * z
349 | 
350 | Other variants are also possible.
351 | 


--------------------------------------------------------------------------------
/demo1/ex1.nim:
--------------------------------------------------------------------------------
 1 | import cpugpuarray
 2 | 
 3 | let N = 100
 4 | var x = newColorMatrixArray(N)
 5 | var y = newColorMatrixArray(N)
 6 | var z = newColorMatrixArray(N)
 7 | 
 8 | # set them to diagonal matrices on CPU (diagonal entries 1, 2 and 3)
 9 | x := 1
10 | y := 2
11 | z := 3
12 | 
13 | # do something on CPU: the diagonal of x becomes 1 + 2*3 = 7
14 | x += y * z
15 | 
16 | # do something on GPU: the diagonal of x becomes 7 + 2*3 = 13, then z is reset to 4
17 | onGpu:
18 |   x += y * z
19 |   z := 4
20 | 
21 | # do something on CPU again: the diagonal of x becomes 13 + 2*4 = 21
22 | x += y * z
23 | 
24 | if x[0][0,0].re == 21.0:  # real part of entry (0,0) of the first matrix
25 |   echo "yay, it worked!"
26 |   echo "do you agree, GPU?"
27 | 
28 | onGpu:
29 |   if getThreadNum()==0:  # only one GPU thread prints
30 |     if x[0][0,0].re == 21.0:
31 |       printf("yes, I agree!\n")
32 | 
33 | # outputs:
34 | #   yay, it worked!
35 | #   do you agree, GPU?
36 | #   yes, I agree!
37 | 


--------------------------------------------------------------------------------
/demo1/ex2.nim:
--------------------------------------------------------------------------------
 1 | import cpugpuarray
 2 | include system/timers
 3 | include system/ansi_c
 4 | import strUtils
 5 | 
 6 | proc test(N: int) =  # time `x += y * z` on CPU, on GPU, and on CPU again for arrays of length N
 7 |   echo "=== N: ", N
 8 |   #var x = newFloatArray(N)
 9 |   #var y = newFloatArray(N)
10 |   #var z = newFloatArray(N)
11 |   #var x = newComplexArray(N)
12 |   #var y = newComplexArray(N)
13 |   #var z = newComplexArray(N)
14 |   var x = newColorMatrixArray(N)
15 |   var y = newColorMatrixArray(N)
16 |   var z = newColorMatrixArray(N)
17 | 
18 |   var t0,t1: Ticks
19 |   template tic =  # start the timer
20 |     t0 = getTicks()
21 |   template toc =  # stop the timer and print elapsed nanoseconds and the flop rate
22 |     t1 = getTicks()
23 |     #echo "nanos: ", formatFloat((t1-t0).float, precision=0)
24 |     cprintf("nanos:   %9lld\n", t1-t0)  # t1-t0 is 64 bits wide, so %i would be a varargs type mismatch
25 |     #cprintf("GF/s: %9.3f\n", (2*N).float/(t1-t0).float)
26 |     #cprintf("GF/s: %9.3f\n", (8*N).float/(t1-t0).float)
27 |     cprintf("GF/s: %9.3f\n", (3*72*N).float/(t1-t0).float)  # presumably 216 flops per 3x3 complex multiply-add; flops/ns = GF/s
28 | 
29 |   # set them to diagonal matrices on CPU
30 |   x := 1
31 |   y := 2
32 |   z := 3
33 | 
34 |   # do something on CPU
35 |   tic()
36 |   x += y * z
37 |   toc()
38 |   tic()
39 |   x += y * z
40 |   toc()
41 |   #for i in 1..10000:
42 |   #  tic()
43 |   #  x += y * z
44 |   #  toc()
45 | 
46 |   var s = 1.0'f32
47 |   template getGpuPtr(x: float): float = x
48 |   # do something on GPU
49 |   echo "GPU1"
50 |   tic()
51 |   #onGpu:
52 |   onGpu(2*768,64):
53 |     #var t = s
54 |     x += y * z
55 |     #if ff(): discard
56 |       #z := 4
57 |   toc()
58 |   echo "GPU2"
59 |   tic()
60 |   onGpu(2*768,64):
61 |     x += y * z
62 |   #  #z := 4
63 |   toc()
64 | 
65 |   # do something on CPU again
66 |   tic()
67 |   x += y * z
68 |   toc()
69 |   tic()
70 |   x += y * z
71 |   toc()
72 | 
73 |   #if x[0][0,0].re == 21.0:
74 |   #  echo "yay, it worked!"
75 |   #  echo "do you agree, GPU?"
76 | 
77 |   #onGpu:
78 |   #  if getThreadNum()==0:
79 |   #    if x[0][0,0].re == 21.0:
80 |   #      printf("yes, I agree!\n")
81 | 
82 |   # outputs:
83 |   #   yay, it worked!
84 |   #   do you agree, GPU?
85 |   #   yes, I agree!
86 | 
87 | var n = 1000
88 | while n<=1_000_000:  # run the benchmark for N = 10^3, 10^4, 10^5 and 10^6
89 |   test(n)
90 |   n *= 10
91 | 


--------------------------------------------------------------------------------
/demo1/gpuarray.nim:
--------------------------------------------------------------------------------
  1 | when not declared(haveCuda):
  2 |   const haveCuda = true
  3 | 
  4 | when haveCuda:
  5 |   import ../cuda
  6 | 
  7 | import macros
  8 | include system/ansi_c
  9 | import linalg
 10 | 
 11 | type
 12 |   GpuArrayObj*[T] = object  # pointer + length handle for an array of T (filled in by `init`)
 13 |     p*: ptr array[0,T]  # element storage, indexed as p[][i]
 14 |     n*: int  # number of elements
 15 |   GpuArrayRef*[T] = ref GpuArrayObj[T]
 16 |   GpuArray*[T] = GpuArrayRef[T]
 17 |   GpuArrays* = GpuArrayObj | GpuArrayRef  # either the object or the ref form
 18 |   GpuArrays2* = GpuArrayObj | GpuArrayRef  # same as GpuArrays; presumably so a second parameter can bind independently
 19 |   GpuArrays3* = GpuArrayObj | GpuArrayRef
 20 | 
 21 | proc init*[T](r: var GpuArrayObj[T], n: int) =
 22 |   ## Allocate room for `n` elements of `T` and store pointer and length in `r`.
 23 |   var mem: pointer
 24 |   when haveCuda:
 25 |     let status = cudaMalloc(addr mem, n*sizeof(T))
 26 |     if status:
 27 |       echo "alloc err: ", status
 28 |       quit(-1)
 29 |   else:
 30 |     mem = createSharedU(T, n)
 31 |   r.p = cast[type(r.p)](mem); r.n = n
 32 | proc init[T](r: var GpuArrayRef[T], n: int) =
 33 |   r.new
 34 |   r[].init(n)
 35 | 
 36 | proc newGpuArrayObj[T](r: var GpuArrayObj[T], n: int) =
 37 |   r.init(n)
 38 | proc newGpuArrayObj[T](n: int): GpuArrayObj[T] =
 39 |   result.init(n)
 40 | 
 41 | proc newGpuArrayRef[T](r: var GpuArrayRef[T], n: int) =
 42 |   r.init(n)
 43 | proc newGpuArrayRef[T](n: int): GpuArrayRef[T] =
 44 |   result.init(n)
 45 | 
 46 | template getGpuPtr(x: SomeNumber): untyped = x
 47 | #template getGpuPtr(x: GpuArrayObj): untyped = x
 48 | template getGpuPtr(x: GpuArrayRef): untyped = x[]
 49 | #template getGpuPtr(x: GpuArrayRef): untyped = x.p
 50 | #template getGpuPtr(x: GpuArrayRef): untyped = (p:x.p,n:x.n)
 51 | 
 52 | template indexGpuArray(x: GpuArrays, i: SomeInteger): untyped =
 53 |   x.p[][i]
 54 | template `[]=`(x: GpuArrayObj, i: SomeInteger, y: untyped): untyped =
 55 |   x.p[][i] = y
 56 | 
 57 | macro indexGpuArray*(x: GpuArrays{call}, y: SomeInteger): untyped =  # overload chosen when the array argument is itself a call expression
 58 |   #echo "call[", y.repr, "]"
 59 |   #echo x.treerepr
 60 |   #if siteLocalsField.contains($x[0]):
 61 |   result = newCall(ident($x[0]))  # rebuild a call to the same callee, looked up by name
 62 |   for i in 1..<x.len:  # x[1..] are the arguments of the original call
 63 |     let xi = x[i]
 64 |     result.add( quote do:
 65 |       indexGpuArray(`xi`,`y`) )  # push the index down onto each argument
 66 |   #else:
 67 |   #  result = quote do:
 68 |   #    let tt = `x`
 69 |   #    tt.d[`y`]
 70 |   #echo result.treerepr
 71 |   #echo result.repr
 72 | 
 73 | template `[]`*(x: GpuArrayObj, i: SomeInteger): untyped = indexGpuArray(x, i)
 74 | template `[]=`(x: GpuArrayObj, i: SomeInteger, y: untyped): untyped =
 75 |   x.p[][i] = y
 76 | 
 77 | template `[]`(x: GpuArrayRef, i: SomeInteger): untyped =
 78 |   echo "GAR[]"
 79 |   x.p[][i]
 80 | template `[]=`(x: GpuArrayRef, i: SomeInteger, y: untyped): untyped =
 81 |   x.p[][i] = y
 82 | 
 83 | var threadNum = 0
 84 | var numThreads = 1
 85 | template getThreadNum: untyped = threadNum
 86 | template getNumThreads: untyped = numThreads
 87 | template `:=`*(x: GpuArrays, y: GpuArrays2) =
 88 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
 89 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
 90 |   mixin getThreadNum, getNumThreads
 91 |   let tid = getThreadNum()
 92 |   let nid = getNumThreads()
 93 |   var i = tid
 94 |   while i<x.n:
 95 |     x[i] := y[i]
 96 |     i += nid
 97 | 
 98 | template `:=`*(x: GpuArrays, y: SomeNumber) =
 99 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
100 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
101 |   mixin getThreadNum, getNumThreads
102 |   let tid = getThreadNum()
103 |   let nid = getNumThreads()
104 |   var i = tid
105 |   while i<x.n:
106 |     x[i] := y
107 |     #echo i, "/", x.n
108 |     i += nid
109 | 
110 | template `+=`*(x: GpuArrays, y: SomeNumber) =
111 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
112 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
113 |   mixin getThreadNum, getNumThreads
114 |   let tid = getThreadNum()
115 |   let nid = getNumThreads()
116 |   var i = tid
117 |   #cprintf("%i/%i\n", i, x.n)
118 |   while i<x.n:
119 |     x[i] += y
120 |     #cprintf("%i/%i\n", i, x.n)
121 |     i += nid
122 | 
123 | template `+=`*(x: GpuArrays, y: GpuArrays2) =
124 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
125 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
126 |   mixin getThreadNum, getNumThreads
127 |   let tid = getThreadNum()
128 |   let nid = getNumThreads()
129 |   var i = tid
130 |   #cprintf("%i/%i\n", i, x.n)
131 |   while i<x.n:
132 |     x[i] += y[i]
133 |     #cprintf("%i/%i\n", i, x.n)
134 |     i += nid
135 | 
136 | proc `+`*(x: GpuArrays, y: GpuArrays2): auto =
137 |   when x is GpuArrayObj:
138 |     var r: GpuArrayObj[type(x[0]+y[0])]
139 |   else:
140 |     var r: GpuArrayRef[type(x[0]+y[0])]
141 |   cprintf("+\n")
142 |   r
143 | proc `*`*(x: GpuArrays, y: GpuArrays2): auto =
144 |   when x is GpuArrayObj:
145 |     var r: GpuArrayObj[type(x[0]*y[0])]
146 |   else:
147 |     var r: GpuArrayRef[type(x[0]*y[0])]
148 |   cprintf("*\n")
149 |   r
150 | 
151 | when isMainModule:
152 |   var N = 1000
153 | 
154 |   proc testfloat =
155 |     var x = newGpuArrayRef[float32](N)
156 |     var y = newGpuArrayRef[float32](N)
157 |     var z = newGpuArrayRef[float32](N)
158 |     #cprintf("x.n: %i\n", x.n)
159 |     onGpu(1,32):
160 |       x += y * z
161 |   testfloat()
162 | 
163 |   proc testcomplex =
164 |     var x = newGpuArrayRef[Complex[float32]](N)
165 |     var y = newGpuArrayRef[Complex[float32]](N)
166 |     var z = newGpuArrayRef[Complex[float32]](N)
167 |     onGpu(N):
168 |       x += y * z
169 |   testcomplex()
170 | 
171 |   proc testcolmat =
172 |     var x = newGpuArrayRef[Colmat[float32]](N)
173 |     var y = newGpuArrayRef[Colmat[float32]](N)
174 |     var z = newGpuArrayRef[Colmat[float32]](N)
175 |     #y := 1
176 |     #z := 2
177 |     onGpu(N):
178 |       x += y * z
179 |   testcolmat()
180 | 


--------------------------------------------------------------------------------
/demo1/linalg.nim:
--------------------------------------------------------------------------------
 1 | type SomeNumber2* = SomeInteger | SomeReal
 2 | template `:=`*(x: var SomeNumber, y: SomeNumber2) =
 3 |   x = (type(x))(y)
 4 | template `+=`*(x: var SomeNumber, y: SomeNumber2) =
 5 |   bind `+=`    # So the following += doesn't call this template again.
 6 |   x += (type(x))(y)
 7 | 
 8 | type
 9 |   Complex*[T] = object
10 |     re*,im*: T
11 | template `:=`*[T](x: var Complex[T], y: SomeNumber) =
12 |   let z = y
13 |   x.re := z
14 |   x.im := 0
15 | template `:=`*[T](x: var Complex[T], y: Complex[T]) =
16 |   x.re = y.re
17 |   x.im = y.im
18 | template `+=`*[T](x: var Complex[T], y: SomeNumber) =
19 |   let z = y
20 |   x.re += z
21 | template `+=`*[T](x: var Complex[T], y: Complex[T]) =
22 |   let z = y
23 |   x.re += z.re
24 |   x.im += z.im
25 | template `+`*[T](x,y: Complex[T]): untyped =
26 |   var r {.noInit.}: Complex[type(x.re+y.re)]
27 |   r.re = x.re + y.re
28 |   r.im = x.im + y.im
29 |   r
30 | template `*`*[T](x,y: Complex[T]): untyped =  # complex product (a+bi)(c+di) = (ac-bd) + (ad+bc)i
31 |   var r {.noInit.}: Complex[type(x.re*y.re)]
32 |   r.re = x.re*y.re - x.im*y.im
33 |   r.im = x.re*y.im + x.im*y.re
34 |   r
35 | 
36 | type
37 |   Colmat*[T] = object
38 |     d*: array[3,array[3,Complex[T]]]
39 | template `[]`*(x: Colmat, i,j: int): untyped = x.d[i][j]
40 | template `:=`*[T](x: var Colmat[T], y: SomeNumber) =
41 |   let z = y
42 |   for i in 0..<x.d.len:
43 |     for j in 0..<x.d[0].len:
44 |       if i==j:
45 |         x.d[i][j] := z
46 |       else:
47 |         x.d[i][j] := 0
48 | template `:=`*[T](x: var Colmat[T], y: Colmat[T]) =
49 |   let z = y
50 |   for i in 0..<x.d.len:
51 |     for j in 0..<x.d[0].len:
52 |       x.d[i][j] = z.d[i][j]
53 | template `+=`*[T](x: var Colmat[T], y: Colmat[T]) =
54 |   let z = y
55 |   for i in 0..<x.d.len:
56 |     for j in 0..<x.d[0].len:
57 |       x.d[i][j] += z.d[i][j]
58 | template `+`*[T](x,y: Colmat[T]): untyped =
59 |   var r {.noInit.}: Colmat[type(x.d[0][0].re+y.d[0][0].re)]
60 |   for i in 0..<r.d.len:
61 |     for j in 0..<r.d[0].len:
62 |       r.d[i][j] = x.d[i][j] + y.d[i][j]
63 |   r
64 | template `*`*[T](x,y: Colmat[T]): untyped =
65 |   var r {.noInit.}: Colmat[type(x.d[0][0].re*y.d[0][0].re)]
66 |   for i in 0..<r.d.len:
67 |     for j in 0..<r.d[0].len:
68 |       var t = x.d[i][0] * y.d[0][j]
69 |       for k in 1..<y.d.len:
70 |         t += x.d[i][k] * y.d[k][j]
71 |       r.d[i][j] = t
72 |   r
73 | 
74 | when isMainModule:
75 |   var x,y,z: ref Complex[float]
76 |   x.new
77 |   y.new
78 |   z.new
79 |   x[] += y[]*z[]
80 |   echo x[]
81 | 


--------------------------------------------------------------------------------
/demo2/.gitignore:
--------------------------------------------------------------------------------
1 | coalesced
2 | cpugpuarray
3 | ex1
4 | ex2
5 | gpuarray
6 | linalg
7 | 


--------------------------------------------------------------------------------
/demo2/coalesced.nim:
--------------------------------------------------------------------------------
  1 | #[
  2 | 
  3 | Following Kate's idea of coalesced_ptr in C++, we use a wrapper
  4 | object type to hide the actual coalesced memory layout here.
  5 | Original comments from Kate's coalesced_ptr.h follow:
  6 | 
  7 |   A smart pointer that automatically provides coalesced memory
  8 |   transactions for arrays of arbitrary structures.  Given a structure
  9 |   T, of size S bytes, e.g.,
 10 | 
 11 |   struct T {
 12 |     char a[S];
 13 |   }
 14 | 
 15 |   in an array with sites elements
 16 | 
 17 |   T t[sites];
 18 | 
 19 |   using a coalesced_ptr will split the structure for reading and
 20 |   writing to memory as an array of structures of array of structures (AoSoAoS),
 21 |   where:
 22 |     - the inner structure size is given by memory_word_size
 23 |     - the inner array size is given by site_vector
 24 |     - the outer structure size is given by sizeof(T)/memory_word_size
 25 |     - the outer array size is given by sites/site_vector
 26 | 
 27 | ]#
 28 | 
 29 | import macros
 30 | 
 31 | const
 32 |   V = 32                        # Inner array length
 33 |   M = 2                         # Number of RegisterWords in a MemoryWord, which is the granularity of memory transactions.
 34 |   # M = 1                         # Number of RegisterWords in a MemoryWord, which is the granularity of memory transactions.
 35 | 
 36 | type
 37 |   Coalesced*[T] = object
 38 |     p*: ptr T                   # pointer to an array of T
 39 |     n*: int                     # the length of the array being coalesced
 40 |   CoalescedObj[T] = object      # proxy for one element, as returned by `[]` on a Coalesced
 41 |     o*: Coalesced[T]            # the coalesced array
 42 |     i*: int                     # the index of the element being accessed
 43 |   RegisterWord = int32          # Word fits in a register, 4 bytes for current GPU
 44 | when M == 1:
 45 |   type
 46 |     MemoryWord = object
 47 |       a*:RegisterWord
 48 | elif M == 2:
 49 |   type
 50 |     MemoryWord = object
 51 |       a*,b*:RegisterWord
 52 | elif M == 4:
 53 |   type
 54 |     MemoryWord = object
 55 |       a*,b*,c*,d*:RegisterWord
 56 | else:
 57 |   {.fatal:"Unsupported memory size " & $M.}
 58 | 
 59 | # Nim doesn't know the size of any struct for sure without the help of a C/C++ compiler.
 60 | # So we use a C++ compiler to check if the user has provided a correct size.
 61 | # The following C++ code only works with c++11 or later.
 62 | {.emit:"""
 63 | template <typename ToCheck, std::size_t ProvidedSize, std::size_t RealSize = sizeof(ToCheck)>
 64 | void check_size() {static_assert(ProvidedSize == RealSize, "newCoalesced got the wrong size!");}
 65 | """.}
 66 | proc newCoalesced*[T](p:ptr T, n:int):auto {.noinit.} =  # wrap the n-element array at p; quits if n is not a multiple of V
 67 |   when compiles((const size = sizeof(T))):
 68 |     const size = sizeof(T)
 69 |   else:
 70 |     mixin structSize
 71 |     const size = structSize(T)  # caller-supplied size, used when sizeof(T) is not available at compile time
 72 |   {.emit:"check_size<`T`, `size`>();".}  # let the C++ compiler verify `size` against its own sizeof
 73 |   const N = size div (M*sizeof(RegisterWord))  # number of MemoryWords per T
 74 |   when N*(M*sizeof(RegisterWord)) != size: {.fatal:"sizeof(T) must be divisible by memory word size."}
 75 |   if n mod V != 0:
 76 |     echo "Array length for Coalesced must be multiples of ",V
 77 |     quit 1
 78 |   Coalesced[T](p:p, n:n)
 79 | proc `[]`*[T](x:Coalesced[T], i:int):auto = CoalescedObj[T](o:x, i:i)
 80 | proc len*[T](x:Coalesced[T]):auto = x.n
 81 | 
 82 | type
 83 |   RWA {.unchecked.} = array[0,RegisterWord]
 84 |   MWA {.unchecked.} = array[0,MemoryWord]
 85 | 
 86 | proc copy(x:ptr MemoryWord, y:ptr RegisterWord, n:int) = # n is number of MemoryWord in x
 87 |   let
 88 |     x = cast[ptr MWA](x)
 89 |     y = cast[ptr RWA](y)
 90 |   for i in 0..<n:
 91 |     x[i].a = y[M*i]
 92 |     when M > 1:
 93 |       x[i].b = y[M*i+1]
 94 |     when M > 2:
 95 |       x[i].c = y[M*i+2]
 96 |       x[i].d = y[M*i+3]
 97 | proc copy(x:ptr RegisterWord, y:ptr MemoryWord, n:int) = # n is number of MemoryWord in y
 98 |   let
 99 |     x = cast[ptr RWA](x)
100 |     y = cast[ptr MWA](y)
101 |   for i in 0..<n:
102 |     x[M*i] = y[i].a
103 |     when M > 1:
104 |       x[M*i+1] = y[i].b
105 |     when M > 2:
106 |       x[M*i+2] = y[i].c
107 |       x[M*i+3] = y[i].d
108 | 
109 | converter fromCoalesced*[T](x:CoalescedObj[T]):T {.noinit.} =  # gather element x.i out of the coalesced layout into a plain T
110 |   mixin structSize
111 |   const N = structSize(T) div (M*sizeof(RegisterWord))  # number of MemoryWords per T
112 |   let p = cast[ptr MWA](x.o.p)
113 |   var m {.noinit.}: array[N,MemoryWord]
114 |   for i in 0..<N: m[i] = p[((x.i div V)*N + i)*V + x.i mod V]  # group of V elements (x.i div V), word i, slot (x.i mod V)
115 |   copy(cast[ptr RegisterWord](result.addr), m[0].addr, N)  # unpack the MemoryWords into the result's RegisterWords
116 | 
117 | proc `:=`*[T,Y](x:CoalescedObj[T], y:Y) =  # store y into element x.i of the coalesced array
118 |   when Y is T:
119 |     mixin structSize
120 |     const N = structSize(T) div (M*sizeof(RegisterWord))  # number of MemoryWords per T
121 |     var y {.noinit.} = y  # local copy so that its address can be taken
122 |     let p = cast[ptr MWA](x.o.p)
123 |     var m {.noinit.}: array[N,MemoryWord]
124 |     copy(m[0].addr, cast[ptr RegisterWord](y.addr), N)  # pack y's RegisterWords into MemoryWords
125 |     for i in 0..<N: p[((x.i div V)*N + i)*V + x.i mod V] = m[i]  # scatter with the same index mapping as fromCoalesced
126 |   else:  # y is of another type: build a T from it with `:=`, then store that
127 |     mixin `:=`
128 |     var ty {.noinit.} :T
129 |     ty := y
130 |     x := ty
131 | 
132 | proc `*`*[X,Y](x:CoalescedObj[X], y:CoalescedObj[Y]):auto {.noinit.} =
133 |   let
134 |     tx {.noinit.} = fromCoalesced(x)
135 |     ty {.noinit.} = fromCoalesced(y)
136 |   mixin `*`
137 |   tx * ty
138 | 
139 | template `+=`*[T,Y](x:CoalescedObj[T], y:Y) = x := fromCoalesced(x) + y
140 | 
141 | when isMainModule:
142 |   import strutils
143 |   type T = array[6,RegisterWord]
144 |   proc structSize(t:typedesc[T]):int = 24
145 |   var x {.noinit.}: array[64,T]
146 |   let p = newCoalesced(x[0].addr, x.len)
147 |   for i in 0..<p.len:
148 |     var t {.noinit.}: T
149 |     for j in 0..<t.len: t[j] = RegisterWord(100*i + j)
150 |     p[i] := t
151 |   var s:string
152 |   s = "Lexical order: p = {"
153 |   for i in 0..<p.len:
154 |     let t:T = p[i]
155 |     s &= "\n["
156 |     for j in 0..<t.len: s &= " " & align($t[j],4)
157 |     s &= " ]"
158 |   s &= "}"
159 |   echo s
160 |   s = "Memory layout: x = {"
161 |   let y = cast[ptr RWA](x[0].addr)
162 |   for i in 0..<x.len*x[0].len:
163 |     if i mod V == 0: s &= "\n"
164 |     s &= " " & align($y[i],4)
165 |   s &= "}"
166 |   echo s
167 | 


--------------------------------------------------------------------------------
/demo2/compile:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Build one demo with `nim cpp`; defaults to ex1.nim when no file is given.
3 | f="$1"
4 | if [ -z "$f" ]; then
5 |   f="ex1.nim"
6 | fi
7 | 
8 | nim cpp -d:release "$f"
9 | 


--------------------------------------------------------------------------------
/demo2/config.nims:
--------------------------------------------------------------------------------
 1 | switch("cc", "gcc")
 2 | switch("gcc.cpp.exe", "/usr/local/cuda/bin/nvcc")
 3 | switch("gcc.cpp.linkerexe", "/usr/local/cuda/bin/nvcc")
 4 | #switch("gcc.cpp.options.always", "--x cu -std=c++11")
 5 | switch("gcc.cpp.options.always", "--x cu -ccbin=g++-4.8 -std=c++11")
 6 | switch("gcc.cpp.options.speed", "-O3 -Xcompiler -Ofast,-march=native")
 7 | #switch("gcc.cpp.options.speed", "-O3")
 8 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -mcpu=native,-mtune=native,-fPIC")
 9 | # switch("gcc.cpp.options.always", "--x cu -ccbin=g++-4.9")
10 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -march=haswell,-fPIC")
11 | 
12 | #switch("gcc.cpp.options.speed", "-O3 -march=haswell")
13 | 


--------------------------------------------------------------------------------
/demo2/cpugpuarray.nim:
--------------------------------------------------------------------------------
  1 | import gpuarray
  2 | export gpuarray
  3 | import coalesced
  4 | export coalesced
  5 | import macros
  6 | import ../cuda
  7 | export cuda
  8 | import ../expr
  9 | import linalg
 10 | export linalg
 11 | include system/ansi_c
 12 | 
 13 | #template onGpu*(x: untyped): untyped = x
 14 | #template onGpu*(a,b,x: untyped): untyped = x
 15 | 
 16 | type
 17 |   ArrayObj*[T] = object
 18 |     p*: Coalesced[T]  # storage allocated by `init` (managed memory or plain host memory)
 19 |     n*: int  # number of elements
 20 |     g*: GpuArrayObj[T]  # GPU-side handle, filled in by `toGpu`
 21 |     lastOnGpu*: bool  # set by toGpu, cleared by toCpu; only used when not unifiedMem
 22 |     unifiedMem*: bool  # true when p came from cudaMallocManaged
 23 |   ArrayRef*[T] = ref ArrayObj[T]
 24 |   Array*[T] = ArrayRef[T]
 25 |   Arrays* = ArrayObj | ArrayRef  # either the object or the ref form
 26 |   Arrays2* = ArrayObj | ArrayRef
 27 |   Arrays3* = ArrayObj | ArrayRef
 28 | 
 29 | proc init[T](r: var ArrayObj[T], n: int) =  # allocate storage for n elements: managed memory if supported, else host memory
 30 |   var p: ptr T
 31 |   r.unifiedMem = true  # always try managed memory first
 32 |   if r.unifiedMem:
 33 |     let err = cudaMallocManaged(cast[ptr pointer](addr p), n*sizeof(T))
 34 |     # Somehow == and != don't work as expected here, so the raw cint values are compared instead.
 35 |     if err:
 36 |       if cast[cint](err) == cast[cint](cudaErrorNotSupported):
 37 |         echo "WARNING: cudaMallocManaged not supported.  Fall back to non-unified memory."
 38 |         r.unifiedMem = false
 39 |       else:
 40 |         echo "ERROR: cudaMallocManaged ", n*sizeof(T)
 41 |         quit cast[cint](err)  # any other allocation error ends the program
 42 |   if not r.unifiedMem:
 43 |     p = createSharedU(T, n)  # plain host memory; the GPU buffer is created later in toGpu
 44 |   r.n = n
 45 |   r.p = newCoalesced(p, n)
 46 | proc init[T](r: var ArrayRef[T], n: int) =
 47 |   r.new
 48 |   r[].init(n)
 49 | 
 50 | proc free*[T](r: var ArrayObj[T]) =  # release the storage owned by `r`
 51 |   if r.unifiedMem: discard r.p.p.cudaFree  # managed memory: r.g only aliases r.p, so free it once
 52 |   else: r.g.free; freeShared(r.p.p) # Same as `toGpu`, r.g is not passed to init with unifiedMem.
 53 | proc free*[T](r: ArrayRef[T]) =  # ref overload, same logic as above
 54 |   if r.unifiedMem: discard r.p.p.cudaFree
 55 |   else: r.g.free; freeShared(r.p.p)  # also release the host buffer that `init` got from createSharedU
 56 | 
 57 | proc newArrayObj*[T](r: var ArrayObj[T], n: int) =
 58 |   r.init(n)
 59 | proc newArrayObj*[T](n: int): ArrayObj[T] =
 60 |   result.init(n)
 61 | 
 62 | proc newArrayRef*[T](r: var ArrayRef[T], n: int) =
 63 |   r.init(n)
 64 | proc newArrayRef*[T](n: int): ArrayRef[T] =
 65 |   result.init(n)
 66 | 
 67 | proc toGpu*(x: var Arrays) =  # make x.g refer to the current contents of x before GPU use
 68 |   if x.unifiedMem:
 69 |     if x.g.n==0:  # first use: point x.g at the managed allocation instead of copying
 70 |       x.g.n = x.n
 71 |       x.g.p = x.p
 72 |   else:
 73 |     if not x.lastOnGpu:  # no copy needed if the array was last used on the GPU
 74 |       x.lastOnGpu = true
 75 |       if x.g.n==0: x.g.init(x.n)  # GPU buffer not allocated yet
 76 |       let err = cudaMemcpy(x.g.p.p, x.p.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice)
 77 |       if err: echo err  # a failed copy is only printed; execution continues
 78 | 
 79 | proc toCpu*(x: var Arrays) =  # copy the GPU buffer back to the host if it was last used on the GPU
 80 |   if not x.unifiedMem:  # nothing is copied for managed memory
 81 |     if x.lastOnGpu:
 82 |       x.lastOnGpu = false
 83 |       let err = cudaMemcpy(x.p.p, x.g.p.p, x.n*sizeof(x.T), cudaMemcpyDeviceToHost)
 84 |       if err: echo err  # a failed copy is only printed; execution continues
 85 | 
 86 | template getGpuPtr*(x: var Arrays): untyped =
 87 |   toGpu(x)
 88 |   x.g
 89 | 
 90 | template indexArray*(x: Arrays, i: SomeInteger): untyped =
 91 |   x.p[i]
 92 | #template `[]=`(x: ArrayObj, i: SomeInteger, y: untyped): untyped =
 93 | #  x.p[][i] = y
 94 | 
 95 | macro indexArray*(x: Arrays{call}, y: SomeInteger): untyped =
 96 |   # proc cleanUp(n:NimNode):NimNode =
 97 |   #   if n.kind in {nnkStmtListExpr,nnkStmtList} and n.len == 1:
 98 |   #     result = n[0]
 99 |   #   else:
100 |   #     result = n
101 |   # echo ">>>>>> indexArray"
102 |   # echo "call[", y.repr, "]"
103 |   # echo x.treerepr
104 |   #if siteLocalsField.contains($x[0]):
105 |   result = newCall(ident($x[0]))
106 |   for i in 1..<x.len:
107 |     let xi = x[i]
108 |     #result.add cleanUp( quote do:
109 |     result.add ( quote do:
110 |       indexArray(`xi`,`y`) )
111 |   #else:
112 |   #  result = quote do:
113 |   #    let tt = `x`
114 |   #    tt.d[`y`]
115 |   # echo result.treerepr
116 |   # echo "<<<<<< indexArray"
117 |   #echo result.repr
118 | 
119 | template `[]`*(x: ArrayObj, i: SomeInteger): untyped = indexArray(x, i)
120 | #template `[]=`(x: ArrayObj, i: SomeInteger, y: untyped): untyped =
121 | #  x.p[][i] = y
122 | 
123 | template `[]`*(x: ArrayRef, i: SomeInteger): untyped = indexArray(x, i)
124 | #template `[]=`(x: ArrayRef, i: SomeInteger, y: untyped): untyped =
125 | #  x.p[][i] = y
126 | 
127 | var threadNum* = 0
128 | var numThreads* = 1
129 | template getThreadNum*: untyped = threadNum
130 | template getNumThreads*: untyped = numThreads
131 | template `:=`*(x: Arrays, y: Arrays2) =
132 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
133 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
134 |   packVarsStmt((x,y), toCpu)
135 |   let tid = getThreadNum()
136 |   let nid = getNumThreads()
137 |   var i = tid
138 |   while i<x.n:
139 |     x[i] := y[i]
140 |     i += nid
141 | 
142 | template `:=`*(x: Arrays, y: SomeNumber) =
143 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
144 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
145 |   packVarsStmt(x, toCpu)
146 |   let tid = getThreadNum()
147 |   let nid = getNumThreads()
148 |   var i = tid
149 |   while i<x.n:
150 |     x[i] := y
151 |     i += nid
152 | 
153 | template `+=`*(x: Arrays, y: SomeNumber) =
154 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
155 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
156 |   packVarsStmt((x,y), toCpu)
157 |   mixin getThreadNum, getNumThreads
158 |   let tid = getThreadNum()
159 |   let nid = getNumThreads()
160 |   var i = tid
161 |   while i<x.n:
162 |     x[i] += y
163 |     i += nid
164 | 
165 | template `+=`*(x: Arrays, y: Arrays2) =
166 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
167 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
168 |   packVarsStmt((x,y), toCpu)
169 |   mixin getThreadNum, getNumThreads
170 |   let tid = getThreadNum()
171 |   let nid = getNumThreads()
172 |   var i = tid
173 |   while i<x.n:
174 |     x[i] += y[i]
175 |     i += nid
176 | 
177 | proc `+`*(x: Arrays, y: Arrays2): auto =
178 |   when x is ArrayObj:
179 |     var r: ArrayObj[type(x[0]+y[0])]
180 |   else:
181 |     var r: ArrayRef[type(x[0]+y[0])]
182 |   echo "+\n"
183 |   r
184 | proc `*`*(x: Arrays, y: Arrays2): auto =
185 |   when x is ArrayObj:
186 |     var r: ArrayObj[type(x[0]*y[0])]
187 |   else:
188 |     var r: ArrayRef[type(x[0]*y[0])]
189 |   echo "*\n"
190 |   r
191 | 
192 | template newColorMatrixArray*(n: int): untyped =
193 |   newArrayRef[Colmat[3,float32]](n)
194 | template newComplexArray*(n: int): untyped =
195 |   newArrayRef[Complex[float32]](n)
196 | template newFloatArray*(n: int): untyped =
197 |   newArrayRef[float32](n)
198 | 
199 | proc printf*(frmt: cstring): cint {.
200 |   importc: "printf", header: "<stdio.h>", varargs, discardable.}
201 | 
when isMainModule:
  # Smoke tests: evaluate `x += y * z` on the CPU, inside an `onGpu` block,
  # and on the CPU again, for float, complex and color-matrix elements.
  # The thread that owns the last element prints it after each step.
  var N = 100

  proc testfloat =
    var x = newArrayRef[float32](N)
    var y = newArrayRef[float32](N)
    var z = newArrayRef[float32](N)
    x := 1
    y := 2
    z := 3
    x += y * z
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
    onGpu(1,32):
      x += y * z
      if (x.n-1) mod getNumThreads() == getThreadNum():
        cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
        cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
    # Explicitly bring the data back before reading it on the host.
    x.toCpu
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i]: %g\n", x.n-1, x[x.n-1])
    # Release the arrays, as testcolmat does.
    x.free
    y.free
    z.free
  testfloat()

  proc testcomplex =
    var x = newArrayRef[Complex[float32]](N)
    var y = newArrayRef[Complex[float32]](N)
    var z = newArrayRef[Complex[float32]](N)
    x := 1
    y := 2
    z := 3
    x += y * z
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re)

    # Default launch configuration.
    onGpu:
      x += y * z
      x += 1

    x += y * z
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re)
    # Release the arrays, as testcolmat does.
    x.free
    y.free
    z.free
  testcomplex()

  proc testcolmat =
    var x = newArrayRef[Colmat[3,float32]](N)
    var y = newArrayRef[Colmat[3,float32]](N)
    var z = newArrayRef[Colmat[3,float32]](N)
    x := 1
    y := 2
    z := 3
    x += y * z
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re)

    onGpu(N):
      x += y * z

    x += y * z
    if (x.n-1) mod getNumThreads() == getThreadNum():
      cprintf("thread %i/%i\n", getThreadNum(), getNumThreads())
      cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re)
    x.free
    y.free
    z.free
  testcolmat()
272 | 


--------------------------------------------------------------------------------
/demo2/demo1.rst:
--------------------------------------------------------------------------------
  1 | ===========================
  2 | Portable expressions in Nim
  3 | ===========================
  4 | 
  5 | :Author: James C. Osborn
  6 | 
  7 | .. contents::
  8 | 
  9 | Preliminaries
 10 | =============
 11 | 
 12 | This document was created with Nim's built-in documentation generator.
 13 | It can parse documentation comments in the source code and also process
 14 | separate reStructuredText_ files.
 15 | This document was made from a reStructuredText file using Nim's
 16 | document generator to try it out and also take advantage of its Nim
 17 | code highlighter.
 18 | 
 19 | .. _reStructuredText: https://en.wikipedia.org/wiki/ReStructuredText
 20 | 
 21 | Code portability in Nim
 22 | =======================
 23 | 
 24 | Here's an example of the result (so far)
 25 | 
 26 | .. code-block:: Nim
 27 | 
 28 |   import cpugpuarray
 29 | 
 30 |   let N = 1000
 31 |   var x = newColorMatrixArray(N)
 32 |   var y = newColorMatrixArray(N)
 33 |   var z = newColorMatrixArray(N)
 34 | 
 35 |   # set them to diagonal matrices on CPU
 36 |   x := 1
 37 |   y := 2
 38 |   z := 3
 39 | 
 40 |   # do something on CPU
 41 |   x += y * z
 42 | 
 43 |   # do something on GPU
 44 |   onGpu:
 45 |     x += y * z
 46 |     z := 4
 47 | 
 48 |   # do something on CPU again
 49 |   x += y * z
 50 | 
 51 |   if x[0][0,0].re == 21.0:
 52 |     echo "yay, it worked!"
 53 |     echo "do you agree, GPU?"
 54 | 
 55 |   onGpu:
 56 |     if getThreadNum()==0:
 57 |       if x[0][0,0].re == 21.0:
 58 |         printf("yes, I agree!\n")
 59 | 
 60 |   # outputs:
 61 |   #   yay, it worked!
 62 |   #   do you agree, GPU?
 63 |   #   yes, I agree!
 64 | 
 65 | 
 66 | The above can be compiled and run with
 67 | 
 68 | ::
 69 | 
 70 |   nim cpp -d:release -r ex1.nim
 71 | 
 72 | 
 73 | This is basically the main interface that the average user would need to
 74 | deal with, the rest is just details for the curious.
 75 | 
 76 | 
 77 | Implementation details
 78 | ======================
 79 | 
 80 | The main container object in the example above is an array that can live
 81 | on the CPU and also the GPU.  This is defined as
 82 | 
 83 | .. code-block:: Nim
 84 | 
 85 |   type
 86 |     ArrayObj*[T] = object
 87 |       p*: ptr array[0,T]
 88 |       n*: int
 89 |       g*: GpuArrayObj[T]
 90 |       lastOnGpu*: bool
 91 | 
 92 |     GpuArrayObj*[T] = object
 93 |       p*: ptr array[0,T]
 94 |       n*: int
 95 | 
 96 | ``ArrayObj[T]`` is a generic array-like object parameterized on the type ``T``.
 97 | This is similar to a templated type declaration in C++ with ``T`` being the template parameter (Nim uses ``[T]`` instead of ``<T>`` for generics).
 98 | The ``*`` (star) after all the type and field names above means that they are exported from this module (file).
 99 | They will be visible to another module that ``import``'s this module (otherwise they would be private to this module).
100 | 
101 | The ``ArrayObj`` contains four fields:
102 | 
103 | - ``p``: which is a pointer (``ptr``) for the data on the host. \
104 | This is implemented as a pointer to an array of length ``0`` \
105 | with elements of type ``T`` for convenience. \
106 | This should really be marked with an ``{.unchecked.}`` pragma to prevent \
107 | bounds checking in debug mode (bounds checks are off by default in release mode).
108 | - ``n``: the number of elements in the array.
109 | - ``g``: a GPU array object, defined next.
110 | - ``lastOnGpu``: a Boolean that tells us which pointer is valid.
111 | 
112 | The ``GpuArrayObj`` is similar to ``ArrayObj``, but just contains a pointer \
113 | (which will hold a GPU pointer) and the number of elements.
114 | This is the object we will pass to the GPU, so it contains a copy of the \
115 | length for convenience.
116 | 
117 | 
118 | Offloading
119 | ==========
120 | 
121 | The offload magic happens in the ``onGpu:`` block.
122 | It is defined like
123 | 
124 | .. code-block:: Nim
125 | 
126 |   # the default total threads (nn=32*256) and threads per block (tpb=256)
127 |   # are just for testing, they really should be an educated
128 |   # guess made from querying the device
129 |   template onGpu*(body: untyped): untyped = onGpu(32*256, 256, body)
130 | 
131 | This launches a CUDA kernel using the default number of threads and threads \
132 | per block.  Right now they are hard-coded, but should really come from \
133 | querying the device (or let the user specify some global default).
134 | 
135 | One can override the defaults for a call by explicitly specifying them
136 | 
137 | .. code-block:: Nim
138 | 
139 |   onGpu(x.n, 128):
140 |     x += y * z
141 |     z := 4
142 | 
143 | This would launch one (virtual) thread per element of the array ``x`` and use
144 | 128 threads per block.
145 | 
146 | The CUDA kernel gets created here
147 | 
148 | .. code-block:: Nim
149 | 
150 |   template onGpu*(nn,tpb: untyped, body: untyped): untyped =
151 |     block:
152 |       var v = packVars(body, getGpuPtr)
153 |       type myt {.bycopy.} = object
154 |         d: type(v)
155 |       proc kern(xx: myt) {.cudaGlobal.} =
156 |         template deref(k: int): untyped = xx.d[k]
157 |         substVars(body, deref)
158 |       let ni = nn.int32
159 |       let threadsPerBlock = tpb.int32
160 |       let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
161 |       cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
162 |       discard cudaDeviceSynchronize()
163 | 
164 | This starts a new block scope (``block:``), similar to ``{...}`` in C.
165 | This is done to isolate the defined kernel (``proc kern ...``) from other \
166 | ``onGpu`` blocks.
167 | 
168 | The first major task is to examine the body of the ``onGpu`` block and \
169 | extract the variables that are used.
170 | This is done by the ``packVars`` macro.
171 | It walks the syntax tree of the code block passed in and keeps track of \
172 | the (unique) variables it references.
173 | It then spits out a data structure (a tuple_) containing those variables.
174 | It wraps each variable in a call to the function name that was passed in \
175 | (in this case ``getGpuPtr``).
176 | For the example above, this line would get expanded to
177 | 
178 | .. _tuple: https://nim-lang.org/docs/manual.html#types-tuples-and-object-types
179 | 
180 | .. code-block:: Nim
181 | 
182 |   var v = (getGpuPtr(x), getGpuPtr(y), getGpuPtr(z))
183 | 
184 | The function ``getGpuPtr`` can then be defined independently for each type \
185 | to return a valid GPU object (it actually doesn't have to be a pointer as we'll see next).
186 | For the ``ArrayObj`` type it is defined as
187 | 
188 | .. code-block:: Nim
189 | 
190 |   template getGpuPtr*(x: var ArrayObj): untyped =
191 |     toGpu(x)
192 |     x.g
193 | 
194 | This copies the data to the GPU (if necessary) and then returns the \
195 | ``GpuArrayObj`` containing the GPU pointer and the length of the array.
196 | This is a (small) object residing in CPU memory, and the CUDA library \
197 | takes care of copying it to the GPU when passed as an argument.
198 | 
199 | Copying the data to the GPU is handled by
200 | 
201 | .. code-block:: Nim
202 | 
203 |   proc toGpu*(x: var ArrayObj) =
204 |     if not x.lastOnGpu:
205 |       x.lastOnGpu = true
206 |       if x.g.n==0: x.g.init(x.n)
207 |       let err = cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice)
208 |       if err: echo err
209 | 
210 | Here we check if this array was last used on the GPU.
211 | If not we check if it has been initialized yet (``x.g.n==0``) and \
212 | initialize it if not (which will call cudaMalloc).
213 | We then copy the CPU memory to GPU memory.
214 | Here we could also translate the layout if we wanted.
215 | 
216 | Currently I am not distinguishing between read access and write access.
217 | This could lead to further optimization.
218 | It should be possible to modify the existing methods to handle that too.
219 | 
220 | Next we create the CUDA kernel (``kern``).
221 | The kernel is defined here
222 | 
223 | .. code-block:: Nim
224 | 
225 |   proc kern(xx: myt) {.cudaGlobal.} =
226 |     template deref(k: int): untyped = xx.d[k]
227 |     substVars(body, deref)
228 | 
229 | This is a function taking one argument (which contains the packed \
230 | ``GpuArrayObj``'s or any other objects used by the expressions).
231 | I originally wrote the procedure definition as
232 | 
233 | .. code-block:: Nim
234 | 
235 |   proc kern(xx: type(v)) {.cudaGlobal.} =
236 |     template deref(k: int): untyped = xx[k]
237 |     substVars(body, deref)
238 | 
239 | but found that Nim decided in some cases to pass the argument of \
240 | ``kern`` (``xx``) as a pointer, instead of by value.
241 | Nim does this to optimize function calls when it feels it is safe to do so.
242 | To prevent this I wrapped the tuple in another object type (``myt``) that \
243 | is explicitly declared ``{.bycopy.}``, so that Nim will always pass it by \
244 | value (which makes a copy).
245 | 
246 | In retrospect, another approach may have been to mark the procedure as \
247 | ``{.exportC.}``, which will also prevent Nim from changing the calling \
248 | conventions.  I would then need to make the procedure names ``kern`` unique \
249 | on my own since Nim will also not perform name-mangling on ``{.exportC.}`` \
250 | procedures.
251 | 
252 | The main body of the kernel comes from the
253 | 
254 | .. code-block:: Nim
255 | 
256 |   substVars(body, deref)
257 | 
258 | macro.
259 | It works similarly to the ``packVars`` macro above, but this time it will \
260 | identify the variables referenced in the code block and substitute them \
261 | with a call to the provided function (``deref``) with an integer argument \
262 | that specifies which position in the kernel argument tuple that variable \
263 | is in.  For the example above this would generate
264 | 
265 | .. code-block:: Nim
266 | 
267 |   deref(0) += deref(1) * deref(2)
268 |   deref(2) := 4
269 | 
270 | The ``deref`` template then simply expands to the appropriate expression \
271 | that refers to the kernel argument.
272 | 
273 | The rest of the magic needed to transform this procedure into a valid CUDA \
274 | kernel is handled in the macro ``cudaGlobal`` which is applied to the \
275 | procedure as a pragma ``{.cudaGlobal.}``.
276 | It also performs function inlining, so that one can still call host functions \
277 | from the device (and not have to worry about marking them with ``__device__``).
278 | I won't go into the details here.
279 | 
280 | The main step left now is to launch the kernel
281 | 
282 | .. code-block:: Nim
283 | 
284 |   let ni = nn.int32
285 |   let threadsPerBlock = tpb.int32
286 |   let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
287 |   cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
288 | 
289 | This selects the blocksPerGrid and threadsPerBlock to be used in the CUDA \
290 | kernel, then launches the kernel ``kern`` with the argument tuple ``v``.
291 | 
292 | Lastly, we synchronize.
293 | 
294 | .. code-block:: Nim
295 | 
296 |   discard cudaDeviceSynchronize()
297 | 
298 | This returns an error code, which I really should be checking instead \
299 | of discarding.
300 | Nim requires you to explicitly discard a return value to be clear that you \
301 | meant to ignore it and didn't just forget.
302 | We may be able to delay the synchronization until we actually use the fields again.
303 | 
304 | 
305 | Back and forth
306 | ==============
307 | 
308 | To get the expression to evaluate correctly on the CPU again we \
309 | also check on every assignment made on the CPU that the fields are \
310 | updated there.  So in the expression
311 | 
312 | .. code-block:: Nim
313 | 
314 |   # do something on CPU again
315 |   x += y * z
316 | 
317 | the ``+=`` will do something like ``packVars``, but this time will generate \
318 | statements containing ``toCpu`` calls on the used variables.
319 | 
320 | To do
321 | =====
322 | 
323 | This is just a toy example.
324 | 
325 | The next step is to get the vectorization working properly on the GPU \
326 | arrays.
327 | The explicit copy allows us to use a different vectorization layout between \
328 | the CPU and GPU.
329 | 
330 | The examples here also need to be integrated with the existing ``threads:`` \
331 | block in QEX_.
332 | One possibility is simply
333 | 
334 | .. _QEX: https://github.com/jcosborn/qex
335 | 
336 | .. code-block:: Nim
337 | 
338 |   threads:
339 |     # do something on CPU
340 |     x += y * z
341 | 
342 |     # do something on GPU
343 |     onGpu:
344 |       x += y * z
345 |       z := 4
346 | 
347 |     # do something on CPU again
348 |     x += y * z
349 | 
350 | Other variants are also possible.
351 | 


--------------------------------------------------------------------------------
/demo2/ex1.nim:
--------------------------------------------------------------------------------
 1 | import cpugpuarray
 2 | 
# Example: the same expression evaluated on the CPU, on the GPU, and on the
# CPU again, with a check that both sides agree on the result.
let N = 64
var x = newColorMatrixArray(N)
var y = newColorMatrixArray(N)
var z = newColorMatrixArray(N)

# set them to diagonal matrices on CPU
x := 1
y := 2
z := 3

# do something on CPU
# (diagonal entries of x: 1 + 2*3 = 7)
x += y * z

# do something on GPU
# (diagonal entries of x: 7 + 2*3 = 13; z is then reset to 4)
onGpu:
  x += y * z
  z := 4

# do something on CPU again
# (diagonal entries of x: 13 + 2*4 = 21)
x += y * z

# Host-side check of the expected value; print the actual value on mismatch.
if x[0][0,0].re == 21.0:
  echo "yay, it worked!"
  echo "do you agree, GPU?"
else:
  echo x[0][0,0].re

# Device-side check: only thread 0 tests the value and prints.
onGpu:
  if getThreadNum()==0:
    if x[0][0,0].re == 21.0:
      printf("yes, I agree!\n")
34 | 
35 | # outputs:
36 | #   yay, it worked!
37 | #   do you agree, GPU?
38 | #   yes, I agree!
39 | 


--------------------------------------------------------------------------------
/demo2/ex2.nim:
--------------------------------------------------------------------------------
 1 | import cpugpuarray
 2 | include system/timers
 3 | include system/ansi_c
 4 | import strUtils
 5 | 
 6 | proc test(N: int) =
 7 |   var x = newColorMatrixArray(N)
 8 |   var y = newColorMatrixArray(N)
 9 |   var z = newColorMatrixArray(N)
10 | 
11 |   template timeit(s:string, b:untyped) =
12 |     let R = 64
13 |     let t0 = getTicks()
14 |     for i in 0..<R: b
15 |     let t1 = getTicks()
16 |     let n = x.T.N
17 |     cprintf("%8lld\t%-7s\tmsec: %7.3f\tGF/s: %6.3f\n", N, s, (t1-t0).float*1e-6/R.float, (8*n*n*n*N*R).float/(t1-t0).float)
18 | 
19 |   # set them to diagonal matrices on CPU
20 |   x := 1
21 |   y := 2
22 |   z := 3
23 | 
24 |   # do something on CPU
25 |   timeit "CPU":
26 |     x += y * z
27 | 
28 |   var s = 1.0'f32
29 |   template getGpuPtr(x: float): float = x
30 |   # do something on GPU
31 |   timeit "GPU1":
32 |     onGpu(1 shl 10,1 shl 4):
33 |       #var t = s
34 |       x += y * z
35 |       #if ff(): discard
36 |         #z := 4
37 |   timeit "GPU2":
38 |     onGpu(1 shl 10,1 shl 5):
39 |       x += y * z
40 |   timeit "GPU3":
41 |     onGpu(1 shl 10,1 shl 6):
42 |       x += y * z
43 |   timeit "GPU4":
44 |     onGpu(1 shl 10,1 shl 7):
45 |       x += y * z
46 | 
47 |   # do something on CPU again
48 |   timeit "CPU":
49 |     x += y * z
50 | 
51 |   x.free
52 |   y.free
53 |   z.free
54 | 
# Run the benchmark for array lengths 2^10 through 2^24.
# The 2^25 case is left disabled; the original note puts it at about 7 GB.
#for n in 10..25: test(1 shl n)    # 7 GB ~ float su3 × 3 × 2^25
for n in 10..24: test(1 shl n)
57 | 


--------------------------------------------------------------------------------
/demo2/gpuarray.nim:
--------------------------------------------------------------------------------
  1 | import coalesced
  2 | 
  3 | when not declared(haveCuda):
  4 |   const haveCuda = true
  5 | 
  6 | when haveCuda:
  7 |   import ../cuda
  8 | 
  9 | import macros
 10 | include system/ansi_c
 11 | import linalg
 12 | 
type
  # Array handle used inside kernels: `p` wraps the buffer allocated by
  # `init` in a `Coalesced` accessor, `n` is the element count.
  GpuArrayObj*[T] = object
    p*: Coalesced[T]
    n*: int
  # Reference flavour of the same object, and its short alias.
  GpuArrayRef*[T] = ref GpuArrayObj[T]
  GpuArray*[T] = GpuArrayRef[T]
  # Three identical type classes.
  # NOTE(review): presumably separate names are needed so that two operands
  # of one routine can bind to different concrete array types -- confirm.
  GpuArrays* = GpuArrayObj | GpuArrayRef
  GpuArrays2* = GpuArrayObj | GpuArrayRef
  GpuArrays3* = GpuArrayObj | GpuArrayRef
 22 | 
 23 | proc init*[T](r: var GpuArrayObj[T], n: int) =
 24 |   var p: ptr T
 25 |   when haveCuda:
 26 |     let err = cudaMalloc(cast[ptr pointer](addr p), n*sizeof(T))
 27 |     if err:
 28 |       echo "alloc err: ", err
 29 |       quit(-1)
 30 |   else:
 31 |     p = createSharedU(T, n)
 32 |   r.n = n
 33 |   r.p = newCoalesced(p, n)
 34 | proc init[T](r: var GpuArrayRef[T], n: int) =
 35 |   r.new
 36 |   r[].init(n)
 37 | proc free*[T](r: var GpuArrayObj[T]) =
 38 |   when haveCuda: discard r.p.p.cudaFree
 39 | proc free*[T](r: GpuArrayRef[T]) =
 40 |   when haveCuda: discard r.p.p.cudaFree
 41 | 
 42 | proc newGpuArrayObj*[T](r: var GpuArrayObj[T], n: int) =
 43 |   r.init(n)
 44 | proc newGpuArrayObj*[T](n: int): GpuArrayObj[T] =
 45 |   result.init(n)
 46 | 
 47 | proc newGpuArrayRef*[T](r: var GpuArrayRef[T], n: int) =
 48 |   r.init(n)
 49 | proc newGpuArrayRef*[T](n: int): GpuArrayRef[T] =
 50 |   result.init(n)
 51 | 
# `getGpuPtr` maps a host-side value to what is handed to a kernel.
# Numbers are passed through unchanged.
template getGpuPtr*(x: SomeNumber): untyped = x
#template getGpuPtr(x: GpuArrayObj): untyped = x
# A referenced array is dereferenced, so the kernel receives the object
# (accessor plus length) by value.
template getGpuPtr*(x: GpuArrayRef): untyped = x[]
# Earlier alternatives, kept for reference:
#template getGpuPtr(x: GpuArrayRef): untyped = x.p
#template getGpuPtr(x: GpuArrayRef): untyped = (p:x.p,n:x.n)
 57 | 
# Base case of array indexing: element `i` is obtained through the
# `Coalesced` accessor stored in `x.p`.
template indexGpuArray*(x: GpuArrays, i: SomeInteger): untyped =
  x.p[i]
 60 | 
# Overload selected when the array argument is itself a call expression
# (the `{call}` constraint), e.g. `y * z`.  The index is pushed down into
# the operands: `indexGpuArray(f(a, b), i)` becomes
# `f(indexGpuArray(a, i), indexGpuArray(b, i))`.
macro indexGpuArray*(x: GpuArrays{call}, y: SomeInteger): untyped =
  #echo "call[", y.repr, "]"
  #echo x.treerepr
  #if siteLocalsField.contains($x[0]):
  # Re-create the call by name so the callee is resolved again for the
  # element types.
  result = newCall(ident($x[0]))
  # x[0] is the callee; the arguments start at index 1.
  for i in 1..<x.len:
    let xi = x[i]
    result.add( quote do:
      indexGpuArray(`xi`,`y`) )
  #else:
  #  result = quote do:
  #    let tt = `x`
  #    tt.d[`y`]
  #echo result.treerepr
  #echo result.repr
 76 | 
# Element read on the object form; delegates to `indexGpuArray`.
template `[]`*(x: GpuArrayObj, i: SomeInteger): untyped = indexGpuArray(x, i)
# Element write on the object form.
# NOTE(review): `x.p[][i]` dereferences `p` as if it were a pointer, while
# `p` is declared as `Coalesced[T]` and the read path uses `x.p[i]` --
# this may predate the Coalesced layout; confirm it still compiles when used.
template `[]=`*(x: GpuArrayObj, i: SomeInteger, y: untyped): untyped =
  x.p[][i] = y
 80 | 
 81 | template `[]`*(x: GpuArrayRef, i: SomeInteger): untyped =
 82 |   echo "GAR[]"
 83 |   x.p[][i]
# Element write on the reference form.
# NOTE(review): same `x.p[][i]` pattern as the object form -- confirm it is
# still valid with `p` declared as `Coalesced[T]`.
template `[]=`*(x: GpuArrayRef, i: SomeInteger, y: untyped): untyped =
  x.p[][i] = y
 86 | 
# Module-private defaults describing a single thread (thread 0 of 1).
# The array operators below `mixin` these names, so definitions visible at
# the instantiation site can take part in overload resolution.
var threadNum = 0
var numThreads = 1
template getThreadNum: untyped = threadNum
template getNumThreads: untyped = numThreads
 91 | template `:=`*(x: GpuArrays, y: GpuArrays2) =
 92 |   #cprintf("t %i/%i  b %i/%i\n", getThreadIdx(), getThreadDim(), getBlockIdx(), getBlockDim())
 93 |   #let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x
 94 |   mixin getThreadNum, getNumThreads
 95 |   let tid = getThreadNum()
 96 |   let nid = getNumThreads()
 97 |   var i = tid
 98 |   while i<x.n:
 99 |     x[i] := y[i]
100 |     i += nid
101 | 
# Assign the number `y` to every element of `x`.  Each thread starts at its
# own thread number and advances by the thread count until it passes `x.n`.
template `:=`*(x: GpuArrays, y: SomeNumber) =
  mixin getThreadNum, getNumThreads
  var idx = getThreadNum()
  let stride = getNumThreads()
  while idx < x.n:
    x[idx] := y
    idx += stride
113 | 
# Add the number `y` to every element of `x`.  Each thread starts at its own
# thread number and advances by the thread count until it passes `x.n`.
template `+=`*(x: GpuArrays, y: SomeNumber) =
  mixin getThreadNum, getNumThreads
  var idx = getThreadNum()
  let stride = getNumThreads()
  while idx < x.n:
    x[idx] += y
    idx += stride
126 | 
# Element-wise accumulation `x[i] += y[i]`.  Each thread starts at its own
# thread number and advances by the thread count until it passes `x.n`.
template `+=`*(x: GpuArrays, y: GpuArrays2) =
  mixin getThreadNum, getNumThreads
  var idx = getThreadNum()
  let stride = getNumThreads()
  while idx < x.n:
    x[idx] += y[idx]
    idx += stride
139 | 
# Array sum placeholder: computes only the *type* of the result (an array
# whose element type is that of `x[0]+y[0]`), prints "+" and returns a
# default-valued array -- no element arithmetic happens here.
# NOTE(review): presumably this exists so that `x += y + z` type-checks,
# with the `{call}` overload of `indexGpuArray` doing the per-element work;
# confirm it is never meant to run.
proc `+`*(x: GpuArrays, y: GpuArrays2): auto =
  when x is GpuArrayObj:
    var r: GpuArrayObj[type(x[0]+y[0])]
  else:
    var r: GpuArrayRef[type(x[0]+y[0])]
  cprintf("+\n")
  r
# Array product placeholder: computes only the *type* of the result (an
# array whose element type is that of `x[0]*y[0]`), prints "*" and returns a
# default-valued array -- no element arithmetic happens here.
# NOTE(review): presumably this exists so that `x += y * z` type-checks,
# with the `{call}` overload of `indexGpuArray` doing the per-element work;
# confirm it is never meant to run.
proc `*`*(x: GpuArrays, y: GpuArrays2): auto =
  when x is GpuArrayObj:
    var r: GpuArrayObj[type(x[0]*y[0])]
  else:
    var r: GpuArrayRef[type(x[0]*y[0])]
  cprintf("*\n")
  r
154 | 
when isMainModule:
  # Smoke tests: launch `x += y * z` on freshly allocated arrays for float,
  # complex and color-matrix elements.  The arrays are not initialised, so
  # only compilation and the launch are exercised, not the values.
  var N = 1000

  proc testfloat =
    var x = newGpuArrayRef[float32](N)
    var y = newGpuArrayRef[float32](N)
    var z = newGpuArrayRef[float32](N)
    onGpu(1,32):
      x += y * z
    # Release the buffers allocated above.
    x.free
    y.free
    z.free
  testfloat()

  proc testcomplex =
    var x = newGpuArrayRef[Complex[float32]](N)
    var y = newGpuArrayRef[Complex[float32]](N)
    var z = newGpuArrayRef[Complex[float32]](N)
    onGpu(N):
      x += y * z
    # Release the buffers allocated above.
    x.free
    y.free
    z.free
  testcomplex()

  proc testcolmat =
    var x = newGpuArrayRef[Colmat[3,float32]](N)
    var y = newGpuArrayRef[Colmat[3,float32]](N)
    var z = newGpuArrayRef[Colmat[3,float32]](N)
    onGpu(N):
      x += y * z
    # Release the buffers allocated above.
    x.free
    y.free
    z.free
  testcolmat()
184 | 


--------------------------------------------------------------------------------
/demo2/linalg.nim:
--------------------------------------------------------------------------------
# Second name for "any integer or real", used for the right-hand operand.
# NOTE(review): presumably a separate type class so the two operands may
# have different concrete number types -- confirm.
type SomeNumber2* = SomeInteger | SomeReal
# Assignment with conversion: `y` is converted to the type of `x`.
template `:=`*(x: var SomeNumber, y: SomeNumber2) =
  x = (type(x))(y)
# In-place addition with conversion of `y` to the type of `x`.
template `+=`*(x: var SomeNumber, y: SomeNumber2) =
  bind `+=`    # So the following += doesn't call this template again.
  x += (type(x))(y)
 7 | 
 8 | type
 9 |   Complex*[T] = object
10 |     re*,im*: T
11 | template `:=`*[T](x: var Complex[T], y: SomeNumber) =
12 |   let z = y
13 |   x.re := z
14 |   x.im := 0
15 | template `:=`*[T](x: var Complex[T], y: Complex[T]) =
16 |   let z = y
17 |   x.re = z.re
18 |   x.im = z.im
19 | template `+=`*[T](x: var Complex[T], y: SomeNumber) =
20 |   let z = y
21 |   x.re += z
22 | template `+=`*[T](x: var Complex[T], y: Complex[T]) =
23 |   let z = y
24 |   x.re += z.re
25 |   x.im += z.im
26 | template `+`*[T](x,y: Complex[T]): untyped =
27 |   var r {.noInit.}: Complex[type(x.re+y.re)]
28 |   r.re = x.re + y.re
29 |   r.im = x.im + y.im
30 |   r
31 | template `*`*[T](x,y: Complex[T]): untyped =
32 |   var r {.noInit.}: Complex[type(x.re*y.re)]
33 |   r.re = x.re*y.re - x.im*y.im
34 |   r.im = x.re*y.im + x.im*y.re
35 |   r
36 | 
type
  # N x N matrix of complex numbers; `d[i][j]` is row i, column j.
  Colmat*[N:static[int],T] = object
    d*: array[N,array[N,Complex[T]]]
# Size formula for a Colmat: N*N complex entries, two `T` values each.
proc structSize*[N:static[int],T](t:typedesc[Colmat[N,T]]):int =
  2*N*N*sizeof(T)
# Two-index element access `x[i,j]`.
template `[]`*(x: Colmat, i,j: int): untyped =
  x.d[i][j]
# Set `x` to `y` times the identity: `y` on the diagonal, 0 elsewhere.
template `:=`*[N:static[int],T](x: var Colmat[N,T], y: SomeNumber) =
  let diag = y
  for row in 0..<N:
    for col in 0..<N:
      if row != col:
        x.d[row][col] := 0
      else:
        x.d[row][col] := diag
# Copy every entry of `y` into `x`.
template `:=`*[N:static[int],T](x: var Colmat[N,T], y: Colmat[N,T]) =
  let src = y
  for row in 0..<N:
    for col in 0..<N:
      x.d[row][col] = src.d[row][col]
# Add `y` to `x` entry by entry.
template `+=`*[N:static[int],T](x: var Colmat[N,T], y: Colmat[N,T]) =
  let src = y
  for row in 0..<N:
    for col in 0..<N:
      x.d[row][col] += src.d[row][col]
# Matrix sum; both operands are evaluated once into locals.
template `+`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
  let lhs = x
  let rhs = y
  var res {.noInit.}: Colmat[N,type(lhs.d[0][0].re+rhs.d[0][0].re)]
  for row in 0..<N:
    for col in 0..<N:
      res.d[row][col] = lhs.d[row][col] + rhs.d[row][col]
  res
# Matrix product; both operands are evaluated once into locals.  Each
# result row is seeded with the k = 0 term, then the k = 1..N-1 terms are
# accumulated.
template `*`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
  let lhs = x
  let rhs = y
  var res {.noInit.}: Colmat[N,type(lhs.d[0][0].re*rhs.d[0][0].re)]
  for row in 0..<N:
    for col in 0..<N:
      res.d[row][col] = lhs.d[row][0] * rhs.d[0][col]
    for k in 1..<N:
      for col in 0..<N:
        res.d[row][col] += lhs.d[row][k] * rhs.d[k][col]
  res
79 | 
when isMainModule:
  # Minimal check that the Complex operators compile and run on
  # default-initialised values.
  var x, y, z: ref Complex[float]
  new(x)
  new(y)
  new(z)
  x[] += y[] * z[]
  echo x[]
87 | 


--------------------------------------------------------------------------------
/demo3/.gitignore:
--------------------------------------------------------------------------------
 1 | coalesced
 2 | cpugpuarray
 3 | ex1
 4 | ex2
 5 | gpuarray
 6 | linalg
 7 | ccwrapper.err
 8 | ccwrapper.out
 9 | qexLite/omp
10 | vectorized
11 | 


--------------------------------------------------------------------------------
/demo3/bench:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | n="$(hostname)"
 3 | c="$HOME/tmp/samples/1_Utilities"
 4 | #c="../../cuda_samples-8.0.61/1_Utilities"
 5 | [[ -d out ]] || mkdir out
 6 | if [[ ! -s out/$n.info ]];then
 7 |   ./ccwrapper --version > "out/$n.info"
 8 |   cat /proc/cpuinfo >> "out/$n.info"
 9 |   "$c/deviceQuery/deviceQuery" >> "out/$n.info"
10 |   "$c/bandwidthTest/bandwidthTest" >> "out/$n.info"
11 | fi
12 | ./compile ex2
13 | ./ex2 > out/$n.ex2
14 | 


--------------------------------------------------------------------------------
/demo3/bench_cpu:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | n="$(hostname)"
 3 | [[ -d out ]] || mkdir out
 4 | if [[ ! -s out/$n.info ]];then
 5 |   gcc --version > "out/$n.info"
 6 |   numactl -H >> "out/$n.info"
 7 |   cat /proc/cpuinfo >> "out/$n.info"
 8 | fi
 9 | ./compile_cpu ex2
10 | numactl -m 1 ./ex2 > out/$n.ex2
11 | 


--------------------------------------------------------------------------------
/demo3/ccwrapper:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | nvcc="/usr/local/cuda/bin/nvcc"
 3 | #nvcc=nvcc
 4 | cc=g++-4.8
 5 | #args=(-arch sm_60 -ccbin $cc)
 6 | args=(-arch sm_21 -ccbin $cc)
 7 | if (($# == 1)) && [[ $1 == --version ]];then
 8 |   $nvcc "$@"
 9 |   $cc "$@"
10 |   exit
11 | fi
12 | ccargs=""
13 | #verbosity=1
14 | forcc(){
15 |   case "$1" in
16 |   "-pthread") return 0 ;;
17 |   "-fopenmp") return 0 ;;
18 |   *) return 1 ;;
19 |   esac
20 | }
21 | ps(){ if ((verbosity>0));then printf "#";printf " '%s'" "$@";printf "\n";fi;"$@";ret=$?; }
22 | ex(){ # OUTFILE ERRFILE CMD [...]
23 |   local O="$1" E="$2"; shift 2
24 |   { { ps "$@" | tee -a "$O"; } 2>&1 1>&3 | tee -a "$E"; } 3>&1 1>&2
25 | }
26 | while (($#>0));do
27 |   if forcc "$1";then
28 |     ccargs+=",'$1'"
29 |   else
30 |     args+=("$1")
31 |   fi
32 |   shift
33 | done
34 | ret=0
35 | ex ccwrapper.{out,err} "$nvcc" "${args[@]}" -Xcompiler "${ccargs#,}"
36 | exit $ret
37 | 


--------------------------------------------------------------------------------
/demo3/coalesced.nim:
--------------------------------------------------------------------------------
  1 | #[
  2 | 
  3 | Following Nvidia's idea of coalesced_ptr in C++, we use a wrapper
  4 | object type to hide the actual coalesced memory layout here.
  5 | Original comments from Nvidia's coalesced_ptr.h follow:
  6 | 
  7 |   A smart pointer that automatically provides coalesced memory
  8 |   transactions for arrays of arbitrary structures.  Given a structure
  9 |   T, of size S bytes, e.g.,
 10 | 
 11 |   struct T {
 12 |     char a[S];
 13 |   }
 14 | 
 15 |   in an array with sites elements
 16 | 
 17 |   T t[sites];
 18 | 
 19 |   using a coalesced_ptr will split the structure for reading and
 20 |   writing to memory as an array of structures of array of structures (AoSoAoS),
 21 |   where:
 22 |     - the inner structure size is given by memory_word_size
 23 |     - the inner array size is given by site_vector
 24 |     - the outer structure size is given by sizeof(T)/memory_word_size
 25 |     - the outer array size is given by sites/site_vector
 26 | 
 27 | ]#
 28 | 
 29 | import macros, qexLite/metaUtils
 30 | 
type
  Coalesced*[V,M:static[int],T] = object
    ## Handle to an array of `T` stored in the coalesced layout.
    ## `V`: Inner array length.
    ## `M`: Number of RegisterWords in a MemoryWord, the granularity of memory transactions.
    p*: ptr T                   ## pointer to an array of T
    n*: int                     ## the length of the array being coalesced
  CoalescedObj[V,M:static[int],T] = object
    ## One element of a `Coalesced` array, identified by handle plus index.
    o*: Coalesced[V,M,T]
    i*: int                     # the index of the element being accessed
 40 | 
# Path of llbits.h next to this source file: the slice drops the trailing
# 13 characters of the current source path (the length of "coalesced.nim").
const llbits = currentSourcePath()[0..^14] & "llbits.h"
type
  RegisterWord* {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU
  # MemoryWordN: a memory word made of N RegisterWords, imported from llbits.h.
  MemoryWord1 {.importc, header:llbits.} = object # 1 RegisterWord
    a*: array[1,RegisterWord]
  MemoryWord2 {.importc, header:llbits.} = object # 2 RegisterWords
    a*: array[2,RegisterWord]
  MemoryWord4 {.importc, header:llbits.} = object # 4 RegisterWords
    a*: array[4,RegisterWord]
  MemoryWord8 {.importc, header:llbits.} = object # 8 RegisterWords
    a*: array[8,RegisterWord]
  MemoryWord16 {.importc, header:llbits.} = object # 16 RegisterWords
    a*: array[16,RegisterWord]
  MemoryWord32 {.importc, header:llbits.} = object # 32 RegisterWords
    a*: array[32,RegisterWord]
# Select the MemoryWord type holding `M` RegisterWords.
# Only M = 1, 2, 4, 8, 16 and 32 are handled; there is no `else` branch, so
# any other value selects no type at all.
template MemoryWord(M:static[int]):untyped =
  when 1 == M: MemoryWord1
  elif 2 == M: MemoryWord2
  elif 4 == M: MemoryWord4
  elif 8 == M: MemoryWord8
  elif 16 == M: MemoryWord16
  elif 32 == M: MemoryWord32
 63 | 
# RegisterWord is an imported C type, so its size is stated here: 4 bytes.
template sizeOf*(t:typedesc[RegisterWord]):int = 4
 65 | 
# Nim doesn't know the size of any struct for sure without the help of a C/C++ compiler.
# So we use a C++ compiler to check if the user has provided a correct size.
# The following C++ code only works with c++11 or later.
# Before C++11 the fallback macro declares a char-array typedef whose length
# is 1 when the sizes match and -1 (a compile error) when they do not.
{.emit:"""
#if __cplusplus >= 201103L
  template <typename ToCheck, std::size_t ProvidedSize, std::size_t RealSize = sizeof(ToCheck)>
  void coalesced_check_size() {static_assert(ProvidedSize == RealSize, "newCoalesced got the wrong size!");}
#else
  #define coalesced_check_size(type,size) typedef char ProvidedWrongSizeForType##type[2*!!(sizeof(type)==(size))-1]
#endif
""".}
# Compile-time size of `T` in bytes.
# Uses `sizeof(T)` when Nim can evaluate it as a constant; otherwise falls
# back to a user-supplied `structSize(T)` (looked up at the call site through
# `mixin`).  Either way, code is emitted that makes the C++ compiler verify
# the value against its own `sizeof`.
template getSize*(T:typedesc):untyped =
  when compiles((const size = sizeof(T))):
    const size = sizeof(T)
  else:
    mixin structSize
    const size = structSize(T)
  {.emit:"""
    #if __cplusplus >= 201103L
      coalesced_check_size<`T`,`size`>();
    #else
      coalesced_check_size(`T`,`size`);
    #endif
  """.}
  size
 91 | 
 92 | # Nim bug as of 8/7/2017, cannot overload init/newCoalesced.
 93 | # Overloaded type matching would SIGSEGV.
 94 | proc initCoalesced*[V,M:static[int],T](x:var Coalesced[V,M,T], p:ptr T, n:int) =
 95 |   const
 96 |     wordBytes = M*sizeof(RegisterWord) # bytes in one memory word
 97 |     elemBytes = getSize(T) # bytes in one element, verified by the C++ compiler
 98 |   when elemBytes mod wordBytes != 0: {.fatal:"sizeof(T) must be divisible by memory word size."}
 99 |   if n mod V != 0: # the length has to be a whole number of V-element groups
100 |     echo "Array length for Coalesced must be multiples of V = ",V
101 |     quit 1
102 |   x.p = p
103 |   x.n = n
104 | proc newCoalesced*[T](V,M:static[int], p:ptr T, n:int):auto {.noinit.} =
105 |   var c {.noinit.}:Coalesced[V,M,T] # left uninitialised: initCoalesced assigns both fields
106 |   initCoalesced(c, p, n)
107 |   c
108 | 
109 | template `[]`*(x:Coalesced, ix:int):untyped = CoalescedObj[x.V,x.M,x.T](o:x, i:ix) # only builds an (array, index) handle; no element is read here
110 | template len*(x:Coalesced):untyped = x.n # length of the array being coalesced
111 | 
112 | template fromCoalesced*(x:CoalescedObj):untyped = # gathers element x.i out of the coalesced storage into a plain x.T value
113 |   const N = getSize(x.T) div (x.M*sizeof(RegisterWord)) # number of memory words per element
114 |   type A {.unchecked.}= ptr array[0,MemoryWord(x.M)]
115 |   var r {.noinit.}: x.T
116 |   let offset = (x.i div x.V)*N*x.V + x.i mod x.V # first word of the V-element group, plus the position inside the group
117 |   staticfor j, 0, N-1: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V] # word j of the element lies j*V words further on
118 |   #for j in 0..<N: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V]
119 |   r
120 | macro `[]`*(x:CoalescedObj, ys:varargs[untyped]):untyped =
121 |   let loaded = newCall(bindsym"fromCoalesced", x) # x[] alone just loads the element
122 |   if ys.len > 0:
123 |     result = newCall("[]", loaded) # x[a, b, ...] loads the element and then indexes it
124 |     for idx in ys: result.add idx
125 |   else:
126 |     result = loaded
127 | 
128 | proc `:=`*[V,M:static[int],X,Y](x:CoalescedObj[V,M,X], y:var Y) {.inline.} = # stores y into the element that x refers to
129 |   when Y is x.T: # same type: scatter the memory words of y straight into the coalesced storage
130 |     const N = getSize(x.T) div (x.M*sizeof(RegisterWord))
131 |     type A {.unchecked.}= ptr array[0,MemoryWord(x.M)]
132 |     let offset = (x.i div x.V)*N*x.V + x.i mod x.V
133 |     staticfor j, 0, N-1: cast[A](x.o.p)[offset + j*x.V] = cast[A](y.addr)[j]
134 |     #for j in 0..<N: cast[A](x.o.p)[offset + j*x.V] = cast[A](y.addr)[j]
135 |   else: # any other type: first assign y to a temporary x.T with `:=`, then store the temporary
136 |     mixin `:=`
137 |     var ty {.noinit.}:x.T
138 |     ty := y
139 |     x := ty
140 | template `:=`*[V,M:static[int],X,Y](x:CoalescedObj[V,M,X], y:Y) = # variant for a right-hand side that is not passed as var
141 |   mixin `:=`
142 |   var ty {.noinit.}:x.T
143 |   ty := y # assign y to a temporary of the element type
144 |   x := ty # then store the temporary through the var overload
145 | 
146 | proc `*`*[VX,MX,VY,MY:static[int],X,Y](x:CoalescedObj[VX,MX,X], y:CoalescedObj[VY,MY,Y]):auto {.noinit,inline.} =
147 |   mixin `*`
148 |   let
149 |     lhs {.noinit.} = fromCoalesced(x) # load both operands into plain values first
150 |     rhs {.noinit.} = fromCoalesced(y)
151 |   lhs * rhs
152 | 
153 | template `+=`*[Y](x:CoalescedObj, y:Y) = x := fromCoalesced(x) + y # load the element, add y, store the sum back
154 | 
155 | when isMainModule: # self-test: fill a coalesced array, then print it in element order and in raw memory order
156 |   import strutils
157 |   type T = array[6,int32]
158 |   proc structSize(t:typedesc[T]):int = 24 # 6 * 4 bytes
159 |   var x {.noinit.}: array[16,T]
160 |   let p = newCoalesced(8, 2, x[0].addr, x.len) # V = 8, M = 2
161 |   # var p:Coalesced[8,3,T]
162 |   # p.initCoalesced(x[0].addr, x.len)
163 |   for i in 0..<p.len:
164 |     var t {.noinit.}: T
165 |     for j in 0..<t.len: t[j] = int32(100*i + j) # component j of element i is 100*i + j
166 |     p[i] := t # store t through the coalesced layout
167 |   var s:string
168 |   s = "Lexical order: p = {"
169 |   for i in 0..<p.len:
170 |     let t = p[i][] # load element i back as a plain T
171 |     s &= "\n["
172 |     for j in 0..<t.len: s &= " " & align($t[j],4)
173 |     s &= " ]"
174 |   s &= "}"
175 |   echo s
176 |   s = "Memory layout: x = {"
177 |   var c = 0
178 |   for i in 0..<x.len:
179 |     for j in 0..<x[0].len:
180 |       if 0 == c mod (p.V*p.M): s &= "\n" # start a new output row every V*M values
181 |       inc c
182 |       s &= " " & align($x[i][j],4)
183 |   s &= "}"
184 |   echo s
185 | 


--------------------------------------------------------------------------------
/demo3/compile:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Build the given Nim sources with the C++ backend; without arguments build ex1.nim (POSIX sh, no bash arrays).
3 | if [ "$#" -eq 0 ]; then set -- "ex1.nim"; fi
4 | nim cpp '--warning[SmallLshouldNotBeUsed]:off' -d:SSE -d:AVX -d:CPUVLEN=256 -d:release "$@"
5 | 


--------------------------------------------------------------------------------
/demo3/compile_cpu:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # CPU-only build (C backend, USEGPU=0); without arguments build ex1.nim (POSIX sh, no bash arrays).
3 | if [ "$#" -eq 0 ]; then set -- "ex1.nim"; fi
4 | nim c '--warning[SmallLshouldNotBeUsed]:off' -d:SSE -d:AVX -d:CPUVLEN=256 -d:USEGPU=0 -d:release "$@"
5 | #nim c '--warning[SmallLshouldNotBeUsed]:off' -d:SSE -d:AVX -d:AVX512 -d:CPUVLEN=512 -d:USEGPU=0 -d:release "$@"
6 | 


--------------------------------------------------------------------------------
/demo3/config.nims:
--------------------------------------------------------------------------------
 1 | --define:release
 2 | --threads:on
 3 | --tlsEmulation:off
 4 | --cc:gcc
 5 | const USEGPU {.intdefine.} = 1 # 1 unless overridden on the command line, e.g. with -d:USEGPU=0
 6 | when USEGPU == 0: # CPU-only build: plain gcc as compiler and linker
 7 |   switch("gcc.exe", "gcc")
 8 |   switch("gcc.linkerexe", "gcc")
 9 |   switch("gcc.options.always", "-std=c11 -march=native")
10 |   switch("gcc.options.speed", "-Ofast")
11 |   #switch("gcc.options.speed", "-Ofast -march=native -fno-strict-aliasing")
12 | else: # otherwise C++ compilation and linking go through ./ccwrapper, with the sources passed as "-x cu"
13 |   switch("gcc.cpp.exe", "./ccwrapper")
14 |   switch("gcc.cpp.linkerexe", "./ccwrapper")
15 |   switch("gcc.cpp.options.always", "-x cu -std=c++11 -Xcompiler -march=native")
16 |   switch("gcc.cpp.options.speed", "-O3 -Xcompiler -Ofast")
17 |   #switch("gcc.cpp.options.speed", "-O3 -Xcompiler -Ofast,-march=native,-fno-strict-aliasing")
18 | 


--------------------------------------------------------------------------------
/demo3/doc/PP-Nim-metaprogramming-DOE-COE-PP-2017.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcosborn/cudanim/338be782104af887521f7d6a6c09ea19ed0b86c3/demo3/doc/PP-Nim-metaprogramming-DOE-COE-PP-2017.pdf


--------------------------------------------------------------------------------
/demo3/doc/bandwidth_knl.gp:
--------------------------------------------------------------------------------
 1 | p(v,m,t) = sprintf("< awk '$2==%d && $3==%d && $4~/%s/{print}' ../out/kingly.ex2",v,m,t) # log rows whose field 2 is v, field 3 is m and field 4 matches t (presumably vecLen, memLen and label of ex2's output)
 2 | set log x # logarithmic x axis
 3 | set key outside width 4
 4 | set xlabel 'Memory footprint (KB)'
 5 | set ylabel 'Effective bandwidth (GB/s)'
 6 | set xrange [50:15000000]
 7 | plot \
 8 |   p(16,1, 'CPU') u 8:14 w l ls  2 t 'V=16, M=1, CPU', \
 9 |   p(16,2, 'CPU') u 8:14 w l ls 22 t 'V=16, M=2, CPU', \
10 |   p(32,1, 'CPU') u 8:14 w l ls  3 t 'V=32, M=1, CPU', \
11 |   p(32,2, 'CPU') u 8:14 w l ls 23 t 'V=32, M=2, CPU', \
12 |   p(64,1, 'CPU') u 8:14 w l ls  4 t 'V=64, M=1, CPU', \
13 |   p(64,2, 'CPU') u 8:14 w l ls 24 t 'V=64, M=2, CPU'
14 | 


--------------------------------------------------------------------------------
/demo3/doc/bandwidth_knl.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcosborn/cudanim/338be782104af887521f7d6a6c09ea19ed0b86c3/demo3/doc/bandwidth_knl.pdf


--------------------------------------------------------------------------------
/demo3/doc/bandwidth_p100.gp:
--------------------------------------------------------------------------------
 1 | p(v,m,t) = sprintf("< awk '$2==%d && $3==%d && $4~/%s/{print}' ../out/neddy.ftm.alcf.anl.gov.ex2",v,m,t) # log rows whose field 2 is v, field 3 is m and field 4 matches t (presumably vecLen, memLen and label of ex2's output)
 2 | set key outside width 4
 3 | set log x # logarithmic x axis
 4 | set xlabel 'Memory footprint (KB)'
 5 | set ylabel 'Effective bandwidth (GB/s)'
 6 | set xrange [50:15000000]
 7 | plot \
 8 |   p( 8,1, 'CPU') u 8:14 w l ls  1 t  'V=8, M=1, CPU', \
 9 |   p( 8,2, 'CPU') u 8:14 w l ls 21 t  'V=8, M=2, CPU', \
10 |   p(16,1, 'CPU') u 8:14 w l ls  2 t 'V=16, M=1, CPU', \
11 |   p(16,2, 'CPU') u 8:14 w l ls 22 t 'V=16, M=2, CPU', \
12 |   p(32,1, 'CPU') u 8:14 w l ls  3 t 'V=32, M=1, CPU', \
13 |   p(32,2, 'CPU') u 8:14 w l ls 23 t 'V=32, M=2, CPU', \
14 |   p(64,1, 'CPU') u 8:14 w l ls  4 t 'V=64, M=1, CPU', \
15 |   p(64,2, 'CPU') u 8:14 w l ls 24 t 'V=64, M=2, CPU', \
16 |   p( 8,1,'GPU5') u 8:14 w l ls  5 t  'V=8, M=1, GPU T/B=32', \
17 |   p( 8,2,'GPU5') u 8:14 w l ls 25 t  'V=8, M=2, GPU T/B=32', \
18 |   p(16,1,'GPU5') u 8:14 w l ls  6 t 'V=16, M=1, GPU T/B=32', \
19 |   p(16,2,'GPU5') u 8:14 w l ls 26 t 'V=16, M=2, GPU T/B=32', \
20 |   p(32,1,'GPU5') u 8:14 w l ls  7 t 'V=32, M=1, GPU T/B=32', \
21 |   p(32,2,'GPU5') u 8:14 w l ls 27 t 'V=32, M=2, GPU T/B=32', \
22 |   p(64,1,'GPU5') u 8:14 w l ls  8 t 'V=64, M=1, GPU T/B=32', \
23 |   p(64,2,'GPU5') u 8:14 w l ls 28 t 'V=64, M=2, GPU T/B=32'
24 | 


--------------------------------------------------------------------------------
/demo3/doc/bandwidth_p100.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jcosborn/cudanim/338be782104af887521f7d6a6c09ea19ed0b86c3/demo3/doc/bandwidth_p100.pdf


--------------------------------------------------------------------------------
/demo3/doc/readme.org:
--------------------------------------------------------------------------------
  1 | #+TITLE: Portable expressions in Nim
  2 | 
  3 | #+AUTHOR: Xiao-Yong Jin and James C. Osborn
  4 | 
  5 | #+OPTIONS: toc:2
  6 | #+HTML_HEAD_EXTRA: <style type="text/css">
  7 | #+HTML_HEAD_EXTRA: <!--
  8 | #+HTML_HEAD_EXTRA: body {font-family: 'Lucida Bright OT','Source Serif Pro',Serif;
  9 | #+HTML_HEAD_EXTRA:       font-size: 18pt;
 10 | #+HTML_HEAD_EXTRA:       line-height: 1.5;}
 11 | #+HTML_HEAD_EXTRA: pre {font-family: 'Lucida Console DK','Source Code Pro',monospace;
 12 | #+HTML_HEAD_EXTRA:      line-height: 1.2;}
 13 | #+HTML_HEAD_EXTRA: -->
 14 | #+HTML_HEAD_EXTRA: </style>
 15 | 
 16 | * Code portability in Nim
 17 | 
 18 | Here is a benchmark example.
 19 | 
 20 | #+BEGIN_SRC nim -n
 21 | import timing, cpugpuarray, qexLite/metaUtils, math
 22 | 
 23 | proc test(vecLen, memLen: static[int]; N: int) =
 24 |   var
 25 |     x = newColorMatrixArray(vecLen,memLen,N) # array of N 3x3 single prec complex matrices
 26 |     y = newColorMatrixArray(vecLen,memLen,N)
 27 |     z = newColorMatrixArray(vecLen,memLen,N)
 28 |     rep = 0                     # accumulates the number of runs
 29 | 
 30 |   let
 31 |     mr = float(3 * 8 * x.T.N * x.T.N * N) / float(1024 * 1024 * 1024) # Resident memory in 2^30 bytes
 32 |     mt = 4 * mr / 3             # Memory transaction
 33 |     fp = float(8 * x.T.N * x.T.N * x.T.N * N) * 1e-9 # Floating point op / 10^9
 34 |   template timeit(label:string, s:untyped) =
 35 |     var
 36 |       R {.global.}:int
 37 |       T {.global.}:float
 38 |     threadSingle:
 39 |       R = 128                   # Base repeat
 40 |       T = 1.0                   # Time limit
 41 |     var t = timex(rep, R, s)    # Always warm up cache
 42 |     while true:
 43 |       threadSingle:
 44 |         R = min(64*R,max(R,int(R.float*0.8/t))) # set up to run for at least 0.8 sec or 64*R
 45 |       t = timex(rep, R, s)
 46 |       threadSingle: T -= t
 47 |       if T < 0: break
 48 |     threadSingle:               # Use the last R & t for performance measure
 49 |       printf("%8d %3d %d %-8s rep: %7d KB: %8.0f ms: %8.4f GF/s: %7.2f GB/s: %7.2f\n",
 50 |              N, vecLen, memLen, label, R, 1024*1024*mr, 1e3*t/R.float, fp*R.float/t, mt*R.float/t)
 51 | 
 52 |   threads:                      # CPU threads
 53 |     x := 0                      # set them to diagonal matrices on CPU
 54 |     y := 1
 55 |     z := 2
 56 |     timeit "CPU": x += y * z
 57 | 
 58 |   timeit "GPU5":                # includes kernel launching and synchronization
 59 |     onGpu(N, 32):               # Number of threads, threads per block
 60 |       x += y * z
 61 |   timeit "GPU6": onGpu(N, 64): x += y * z
 62 |   timeit "GPU7": onGpu(N, 128): x += y * z
 63 | 
 64 |   threads: timeit "CPU": x += y * z # back to CPU threads again
 65 | 
 66 |   let scale = 0.5 / (sqrt(3.0) * rep.float)
 67 |   threads:
 68 |     x *= scale
 69 |     var n = x.norm2
 70 |     threadSingle: echo "# Final scaled x.norm2: ",n,"  rep: ",rep
 71 |   x.free
 72 |   y.free
 73 |   z.free
 74 | 
 75 | for n in 8..26:
 76 |   staticFor v, 2, 7:
 77 |     when (1 shl v) >= (structsize(vectorizedElementType(float32)) div sizeof(float32)):
 78 |       staticFor ml, 1, 2:
 79 |         test(1 shl v, ml, 1 shl n)
 80 | #+END_SRC
 81 | 
 82 | The above can be compiled with the following command, which produces the ~ex2~ executable to run:
 83 | 
 84 | #+BEGIN_SRC sh
 85 | nim cpp -d:SSE -d:AVX -d:CPUVLEN=256 -d:release ex2
 86 | #+END_SRC
 87 | 
 88 | * Implementation details
 89 | 
 90 | The main container object in the example above is an array that can live
 91 | on the CPU and also the GPU.  This is defined as
 92 | 
 93 | #+BEGIN_SRC nim -n
 94 | when useGPU:
 95 |   type
 96 |     ArrayObj*[V,M:static[int],T] = object
 97 |       p*: Coalesced[V,M,T]
 98 |       n*: int
 99 |       g*: GpuArrayObj[V,M,T]
100 |       lastOnGpu*: bool
101 |       unifiedMem*: bool
102 |       mem:pointer ## Pointer to the allocated memory.
103 | else:
104 |   type
105 |     ArrayObj*[V,M:static[int],T] = object
106 |       p*: Coalesced[V,M,T]
107 |       n*: int
108 |       mem:pointer ## Pointer to the allocated memory.
109 | 
110 | type
111 |   GpuArrayObj*[V,M:static[int],T] = object
112 |     p*: Coalesced[V,M,T]
113 |     n*: int
114 | 
115 | type
116 |   Coalesced*[V,M:static[int],T] = object
117 |     ## `V`: Inner array length.
118 |     ## `M`: Number of RegisterWords in a MemoryWord, the granularity of memory transactions.
119 |     p*: ptr T                   ## pointer to an array of T
120 |     n*: int                     ## the length of the array being coalesced
121 |   CoalescedObj[V,M:static[int],T] = object
122 |     o*: Coalesced[V,M,T]
123 |     i*: int                     # the index being asked for
124 | 
125 | template `[]`*(x:Coalesced, ix:int):untyped = CoalescedObj[x.V,x.M,x.T](o:x, i:ix)
126 | template len*(x:Coalesced):untyped = x.n
127 | 
128 | template fromCoalesced*(x:CoalescedObj):untyped =
129 |   const N = getSize(x.T) div (x.M*sizeof(RegisterWord))
130 |   type A {.unchecked.}= ptr array[0,MemoryWord(x.M)]
131 |   var r {.noinit.}: x.T
132 |   let offset = (x.i div x.V)*N*x.V + x.i mod x.V
133 |   staticfor j, 0, N-1: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V]
134 |   r
135 | 
136 | type
137 |   ShortVector*[V:static[int],E] = object
138 |     a*:array[V,E]
139 |   ShortVectorIndex* = distinct int
140 |   VectorizedObj*[V,M:static[int],T] = object
141 |     o*:Coalesced[V,M,T]
142 |     i*:ShortVectorIndex
143 | 
144 | template `[]`*(x:Coalesced, ix:ShortVectorIndex):untyped = VectorizedObj[x.V,x.M,x.T](o:x,i:ix)
145 | template veclen*(x:Coalesced):untyped = x.n div x.V
146 | #+END_SRC
147 | 
148 | * CPU threads
149 | 
150 | #+BEGIN_SRC nim -n
151 | import omp
152 | 
153 | when defined(noOpenmp):
154 |   template omp_set_num_threads*(x: cint) = discard
155 |   template omp_get_num_threads*(): cint = 1
156 |   template omp_get_max_threads*(): cint = 1
157 |   template omp_get_thread_num*(): cint = 0
158 |   template ompPragma(p:string):untyped = discard
159 |   template setupGc = discard
160 | else:
161 |   const OMPFlag {.strDefine.} = "-fopenmp"
162 |   {. passC: OMPFlag .}
163 |   {. passL: OMPFlag .}
164 |   {. pragma: omp, header:"omp.h" .}
165 |   proc omp_set_num_threads*(x: cint) {.omp.}
166 |   proc omp_get_num_threads*(): cint {.omp.}
167 |   proc omp_get_max_threads*(): cint {.omp.}
168 |   proc omp_get_thread_num*(): cint {.omp.}
169 |   template ompPragma(p:string):untyped =
170 |     {. emit:"\n#pragma omp " & p .}
171 |   template setupGc =
172 |     if(omp_get_thread_num()!=0): setupForeignThreadGc()
173 | 
174 | template ompBarrier* = ompPragma("barrier")
175 | template ompBlock(p:string; body:untyped):untyped =
176 |   ompPragma(p)
177 |   block:
178 |     body
179 | 
180 | template ompParallel*(body:untyped):untyped =
181 |   ompBlock("parallel"):
182 |     setupGc()
183 |     body
184 | template ompMaster*(body:untyped):untyped = ompBlock("master", body)
185 | template ompSingle*(body:untyped):untyped = ompBlock("single", body)
186 | template ompCritical*(body:untyped):untyped = ompBlock("critical", body)
187 | #+END_SRC
188 | 
189 | #+BEGIN_SRC nim -n
190 | template threads*(body:untyped):untyped =
191 |   checkInit()
192 |   let tidOld = threadNum
193 |   let nidOld = numThreads
194 |   let tlOld = threadLocals
195 |   proc tproc{.genSym.} =
196 |     var ts:seq[ThreadShare]
197 |     ompParallel:
198 |       threadNum = ompGetThreadNum()
199 |       numThreads = ompGetNumThreads()
200 |       if threadNum==0: ts.newSeq(numThreads)
201 |       threadBarrierO()
202 |       initThreadLocals(ts)
203 |       body
204 |       threadBarrierO()
205 |   tproc()
206 |   threadNum = tidOld
207 |   numThreads = nidOld
208 |   threadLocals = tlOld
209 | #+END_SRC
210 | 
211 | * Offloading
212 | 
213 | #+BEGIN_SRC nim -n
214 | template cudaDefs(body: untyped): untyped {.dirty.} =
215 |   var gridDim{.global,importC,noDecl.}: CudaDim3
216 |   var blockIdx{.global,importC,noDecl.}: CudaDim3
217 |   var blockDim{.global,importC,noDecl.}: CudaDim3
218 |   var threadIdx{.global,importC,noDecl.}: CudaDim3
219 |   template getGridDim: untyped {.used.} = gridDim
220 |   template getBlockIdx: untyped {.used.} = blockIdx
221 |   template getBlockDim: untyped {.used.} = blockDim
222 |   template getThreadIdx: untyped {.used.} = threadIdx
223 |   template getThreadNum: untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x
224 |   template getNumThreads: untyped {.used.} = gridDim.x * blockDim.x
225 |   bind inlineProcs
226 |   inlineProcs:
227 |     body
228 | 
229 | template cudaLaunch*(p: proc; blocksPerGrid,threadsPerBlock: SomeInteger;
230 |                      arg: varargs[pointer,dataAddr]) =
231 |   var pp: proc = p
232 |   var gridDim, blockDim: CudaDim3
233 |   gridDim.x = blocksPerGrid
234 |   gridDim.y = 1
235 |   gridDim.z = 1
236 |   blockDim.x = threadsPerBlock
237 |   blockDim.y = 1
238 |   blockDim.z = 1
239 |   var args: array[arg.len, pointer]
240 |   for i in 0..<arg.len: args[i] = arg[i]
241 |   #echo "really launching kernel"
242 |   let err = cudaLaunchKernel(pp, gridDim, blockDim, addr args[0])
243 |   if err:
244 |     echo err
245 |     quit cast[cint](err)
246 | 
247 | macro cuda*(s,p: untyped): auto =
248 |   let ss = s.strVal
249 |   p.expectKind nnkProcDef
250 |   result = p
251 |   result.addPragma parseExpr("{.codegenDecl:\""&ss&" $# $#$#\".}")[0]
252 |   result.body = getAst(cudaDefs(result.body))
253 |   var sl = newStmtList()
254 |   sl.add( quote do:
255 |     {.push checks: off.}
256 |     {.push stacktrace: off.} )
257 |   sl.add result
258 |   result = sl
259 | template cudaGlobal*(p: untyped): auto = cuda("__global__",p)
260 | #+END_SRC
261 | 
262 | #+BEGIN_SRC nim -n
263 | template onGpu*(nn,tpb: untyped, body: untyped): untyped =
264 |   block:
265 |     var v = packVars(body, getGpuPtr)
266 |     type ByCopy {.bycopy.} [T] = object
267 |       d: T
268 |     proc kern(xx: ByCopy[type(v)]) {.cudaGlobal.} =
269 |       template deref(k: int): untyped = xx.d[k]
270 |       substVars(body, deref)
271 |     let ni = nn.int32
272 |     let threadsPerBlock = tpb.int32
273 |     let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock
274 |     cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v)
275 |     discard cudaDeviceSynchronize()
276 | template onGpu*(nn: untyped, body: untyped): untyped = onGpu(nn, 64, body)
277 | template onGpu*(body: untyped): untyped = onGpu(512*64, 64, body)
278 | #+END_SRC
279 | 
280 | ** The ~kern~ procedure in ~onGpu~
281 | 
282 | #+BEGIN_SRC nim -n
283 | proc kern(xx670162: ByCopy670160[type(v670158)])
284 |      {.codegenDecl: "__global__ $# $#$#".} =
285 |   var gridDim {.global, importC, noDecl.}: CudaDim3
286 |   var blockIdx {.global, importC, noDecl.}: CudaDim3
287 |   var blockDim {.global, importC, noDecl.}: CudaDim3
288 |   var threadIdx {.global, importC, noDecl.}: CudaDim3
289 |   template getGridDim(): untyped {.used.} = gridDim
290 |   template getBlockIdx(): untyped {.used.} = blockIdx
291 |   template getBlockDim(): untyped {.used.} = blockDim
292 |   template getThreadIdx(): untyped {.used.} = threadIdx
293 |   template getThreadNum(): untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x
294 |   template getNumThreads(): untyped {.used.} = gridDim.x * blockDim.x
295 |   inlineProcs:
296 |     template deref(k670164: int): untyped =
297 |       xx670162.d[k670164]
298 |     substVars((x += y * z), deref)
299 | #+END_SRC
300 | 
301 | ** Expression handling
302 | 
303 | #+BEGIN_SRC nim -n
304 | proc getVars*(v: var seq[NimNode], x,a: NimNode): NimNode =
305 |   proc recurse(it: NimNode, vars: var seq[NimNode], a: NimNode): NimNode =
306 |     var r0 = 0
307 |     var r1 = it.len - 1
308 |     case it.kind
309 |     of {nnkSym, nnkIdent}:
310 |       let i = vars.addIfNewSym(it)
311 |       if i>=0:
312 |         let ii = newLit(i)
313 |         return newCall(a,ii)
314 |     of nnkCallKinds: r0 = 1
315 |     of nnkDotExpr: r1 = 0
316 |     of {nnkVarSection,nnkLetSection}:
317 |       result = it.cpNimNode
318 |       for c in it:
319 |         result.add c.cpNimNode
320 |         for i in 0..(c.len-3):
321 |           ignore.add c[i]
322 |           result[^1].add c[i].cpNimNode
323 |         result[^1].add c[^2].cpNimNode
324 |         result[^1].add recurse(c[^1], vars, a)
325 |       return
326 |     else: discard
327 |     result = it.cpNimNode
328 |     for i in 0..<r0:
329 |       result.add it[i].cpNimNode
330 |     for i in r0..r1:
331 |       result.add recurse(it[i], vars, a)
332 |     for i in (r1+1)..<it.len:
333 |       result.add it[i].cpNimNode
334 |   ignore.newSeq(0)
335 |   result = recurse(x, v, a)
336 | 
337 | macro packVarsStmt*(x: untyped, f: untyped): auto =
338 |   var v = newSeq[NimNode](0)
339 |   let a = ident("foo")
340 |   let e = getVars(v, x, a)
341 |   var p = newStmtList()
342 |   for vs in v:
343 |     p.add newCall(f,vs)
344 |   result = p
345 | 
346 | macro packVars*(x: untyped, f: untyped): auto =
347 |   var v = newSeq[NimNode](0)
348 |   let a = ident("foo")
349 |   let e = getVars(v, x, a)
350 |   var p = newPar()
351 |   if v.len==0:
352 |     p.add newNimNode(nnkExprColonExpr).add(ident("Field0"),newLit(1))
353 |   elif v.len==1:
354 |     let vi = ident($v[0])
355 |     p.add newNimNode(nnkExprColonExpr).add(ident("Field0"),newCall(f,vi))
356 |   else:
357 |     for vs in v:
358 |       p.add newCall(f,vs)
359 |   result = p
360 | 
361 | macro substVars*(x: untyped, a: untyped): auto =
362 |   var v = newSeq[NimNode](0)
363 |   let e = getVars(v, x, a)
364 |   result = e
365 | #+END_SRC
366 | 
367 | * AST based overloading for array operations
368 | 
369 | #+BEGIN_SRC nim -n
370 | type ArrayIndex* = SomeInteger or ShortVectorIndex
371 | 
372 | template indexArray*(x: ArrayObj, i: ArrayIndex): untyped =
373 |   x.p[i]
374 | 
375 | macro indexArray*(x: ArrayObj{call}, y: ArrayIndex): untyped =
376 |   result = newCall(ident($x[0]))
377 |   for i in 1..<x.len:
378 |     let xi = x[i]
379 |     result.add( quote do:
380 |       indexArray(`xi`,`y`) )
381 | 
382 | template `[]`*(x: ArrayObj, i: ArrayIndex): untyped = indexArray(x, i)
383 | #+END_SRC
384 | 


--------------------------------------------------------------------------------
/demo3/ex1.nim:
--------------------------------------------------------------------------------
 1 | import cpugpuarray
 2 | 
 3 | let N = 64
 4 | var x = newColorMatrixArray(32,2,N)
 5 | var y = newColorMatrixArray(32,2,N)
 6 | var z = newColorMatrixArray(32,2,N)
 7 | 
 8 | threads:
 9 |   # set them to diagonal matrices on CPU
10 |   x := 1
11 |   y := 2
12 |   z := 3
13 | 
14 |   # do something on CPU (x[0][0,0].re should now be 1 + 2*3 = 7)
15 |   x += y * z
16 | 
17 | # do something on GPU (x should become 7 + 2*3 = 13, then z is reset to 4)
18 | onGpu:
19 |   x += y * z
20 |   z := 4
21 | 
22 | threads:
23 |   # do something on CPU again (x should become 13 + 2*4 = 21, the value checked below)
24 |   x += y * z
25 | 
26 | if x[0][0,0].re == 21.0:
27 |   echo "yay, it worked!"
28 |   echo "do you agree, GPU?"
29 | else:
30 |   echo x[0][0,0].re
31 | 
32 | onGpu:
33 |   if getThreadNum()==0:
34 |     if x[0][0,0].re == 21.0:
35 |       printf("yes, I agree!\n")
36 | 
37 | # outputs:
38 | #   yay, it worked!
39 | #   do you agree, GPU?
40 | #   yes, I agree!
41 | 


--------------------------------------------------------------------------------
/demo3/ex2.nim:
--------------------------------------------------------------------------------
 1 | import timing, cpugpuarray, qexLite/metaUtils, math
 2 | 
 3 | proc test(vecLen, memLen: static[int]; N: int) = # times x += y * z on arrays of N matrices for one (vecLen, memLen) layout
 4 |   var
 5 |     x = newColorMatrixArray(vecLen,memLen,N) # array of N 3x3 single prec complex matrices
 6 |     y = newColorMatrixArray(vecLen,memLen,N)
 7 |     z = newColorMatrixArray(vecLen,memLen,N)
 8 |     rep = 0                     # accumulates the number of runs
 9 | 
10 |   let
11 |     mr = float(3 * 8 * x.T.N * x.T.N * N) / float(1024 * 1024 * 1024) # Resident memory in 2^30 bytes
12 |     mt = 4 * mr / 3             # Memory transaction (presumably x, y, z read plus x written: 4 arrays against 3 resident)
13 |     fp = float(8 * x.T.N * x.T.N * x.T.N * N) * 1e-9 # Floating point op / 10^9
14 |   template timeit(label:string, s:untyped) = # times statement s repeatedly and prints one result line tagged with label
15 |     var
16 |       R {.global.}:int
17 |       T {.global.}:float
18 |     threadSingle:
19 |       R = 128                   # Base repeat
20 |       T = 1.0                   # Time limit
21 |     var t = timex(rep, R, s)    # Always warm up cache
22 |     while true:
23 |       threadSingle:
24 |         R = min(64*R,max(R,int(R.float*0.8/t))) # set up to run for at least 0.8 sec or 64*R
25 |       t = timex(rep, R, s)
26 |       threadSingle: T -= t
27 |       if T < 0: break # stop once the measured time has used up the limit
28 |     threadSingle:               # Use the last R & t for performance measure
29 |       printf("%8d %3d %d %-8s rep: %7d KB: %8.0f ms: %8.4f GF/s: %7.2f GB/s: %7.2f\n",
30 |              N, vecLen, memLen, label, R, 1024*1024*mr, 1e3*t/R.float, fp*R.float/t, mt*R.float/t)
31 | 
32 |   threads:                      # CPU threads
33 |     x := 0                      # set them to diagonal matrices on CPU
34 |     y := 1
35 |     z := 2
36 |     timeit "CPU": x += y * z
37 | 
38 |   timeit "GPU5":                # includes kernel launching and synchronization
39 |     onGpu(N, 32):               # Number of threads, threads per block
40 |       x += y * z
41 |   timeit "GPU6": onGpu(N, 64): x += y * z
42 |   timeit "GPU7": onGpu(N, 128): x += y * z
43 | 
44 |   threads: timeit "CPU": x += y * z # back to CPU threads again
45 | 
46 |   let scale = 0.5 / (sqrt(3.0) * rep.float)
47 |   threads:
48 |     x *= scale
49 |     var n = x.norm2
50 |     threadSingle: echo "# Final scaled x.norm2: ",n,"  rep: ",rep
51 |   x.free
52 |   y.free
53 |   z.free
54 | 
55 | for n in 8..26: # array lengths 2^8 up to 2^26
56 |   staticFor v, 2, 7: # candidate vector lengths 1 shl v
57 |     when (1 shl v) >= (structsize(vectorizedElementType(float32)) div sizeof(float32)): # skip vecLen below the float32 count of the vectorized element type
58 |       staticFor ml, 1, 2: # memLen values
59 |         test(1 shl v, ml, 1 shl n)
60 | 


--------------------------------------------------------------------------------
/demo3/gpuarray.nim:
--------------------------------------------------------------------------------
  1 | import coalesced
  2 | 
  3 | when not declared(haveCuda):
  4 |   const haveCuda = true
  5 | 
  6 | when haveCuda:
  7 |   import ../cuda
  8 | 
  9 | import macros
 10 | include system/ansi_c
 11 | import linalg
 12 | 
 13 | type
 14 |   GpuArrayObj*[V,M:static[int],T] = object # array handle; initGpuArrayObj allocates its storage
 15 |     p*: Coalesced[V,M,T] # coalesced view of the allocated buffer
 16 |     n*: int # number of elements
 17 |   # GpuArrayRef*[V,M:static[int],T] = ref GpuArrayObj[V,M,T]
 18 |   # GpuArray*[V,M:static[int],T] = GpuArrayRef[V,M,T]
 19 |   # GpuArrays* = GpuArrayObj | GpuArrayRef
 20 |   # GpuArrays2* = GpuArrayObj | GpuArrayRef
 21 |   # GpuArrays3* = GpuArrayObj | GpuArrayRef
 22 | 
 23 | # Nim Bug, cannot overload this function with generic static parameters.
 24 | # proc init*(r: var GpuArrayObj, n: int) =
 25 | #   type T = r.T
 26 | #   var p: ptr T
 27 | #   when haveCuda:
 28 | #     let err = cudaMalloc(cast[ptr pointer](addr p), n*sizeof(T))
 29 | #     if err:
 30 | #       echo "alloc err: ", err
 31 | #       quit(-1)
 32 | #   else:
 33 | #     p = createSharedU(T, n)
 34 | #   r.n = n
 35 | #   r.p.newCoalesced(r.V, r.M, p, n)
 36 | # proc init[V,M:static[int],T](r: var GpuArrayRef[V,M,T], n: int) =
 37 | #   r.new
 38 | #   r[].init(n)
 39 | 
 40 | proc free*(r: var GpuArrayObj) =
 41 |   ## Releases the buffer obtained in initGpuArrayObj: cudaFree with CUDA, freeShared for the createSharedU fallback.
 42 |   when haveCuda: discard r.p.p.cudaFree
 43 |   else: freeShared(r.p.p) # without CUDA the buffer used to leak because nothing released it
 44 | 
 45 | proc initGpuArrayObj*(r: var GpuArrayObj, n: int) = # allocates room for n elements and initialises r with it
 46 |   type T = r.T
 47 |   var p: ptr T
 48 |   when haveCuda:
 49 |     let err = cudaMalloc(cast[ptr pointer](addr p), n*sizeof(T))
 50 |     if err: # a failed allocation is reported and ends the program
 51 |       echo "alloc err: ", err
 52 |       quit(-1)
 53 |   else:
 54 |     p = createSharedU(T, n) # without CUDA: uninitialised shared-heap memory instead
 55 |   r.n = n
 56 |   r.p.initCoalesced(p, n)
 57 |   # echo "GpuArray init done."
 58 | proc newGpuArrayObj*(V,M:static[int], n:int, T:typedesc): auto {.noinit.} =
 59 |   var arr {.noinit.}: GpuArrayObj[V,M,T] # left uninitialised: initGpuArrayObj assigns the fields
 60 |   initGpuArrayObj(arr, n)
 61 |   arr
 62 | 
 63 | # proc newGpuArrayRef*[V,M:static[int],T](r: var GpuArrayRef[V,M,T], n: int) =
 64 | #   r.init(n)
 65 | # proc newGpuArrayRef*[T](V,M:static[int], n: int): auto {.noinit.} =
 66 | #   var z {.noinit.}: GpuArrayRef[V,M,T]
 67 | #   z.init(n)
 68 | #   z
 69 | 
 70 | template getGpuPtr*(x: SomeNumber): untyped = x # numbers are passed on unchanged
 71 | template getGpuPtr*(x: GpuArrayObj): untyped = x # the array object itself is passed on unchanged
 72 | # template getGpuPtr*(x: GpuArrayRef): untyped = x[]
 73 | #template getGpuPtr(x: GpuArrayRef): untyped = x.p
 74 | #template getGpuPtr(x: GpuArrayRef): untyped = (p:x.p,n:x.n)
 75 | 
 76 | template indexGpuArray*(x: GpuArrayObj, i: SomeInteger): untyped = # plain case: index the coalesced storage of x
 77 |   x.p[i]
 78 | 
 79 | macro indexGpuArray*(x: GpuArrayObj{call}, y: SomeInteger): untyped = # x is itself a call f(a, b, ...): index its arguments instead
 80 |   #echo "call[", y.repr, "]"
 81 |   #echo x.treerepr
 82 |   #if siteLocalsField.contains($x[0]):
 83 |   result = newCall(ident($x[0])) # call the same routine again ...
 84 |   for i in 1..<x.len:
 85 |     let xi = x[i]
 86 |     result.add( quote do:
 87 |       indexGpuArray(`xi`,`y`) ) # ... with every argument replaced by indexGpuArray(argument, y)
 88 |   #else:
 89 |   #  result = quote do:
 90 |   #    let tt = `x`
 91 |   #    tt.d[`y`]
 92 |   #echo result.treerepr
 93 |   #echo result.repr
 94 | 
 95 | template `[]`*(x: GpuArrayObj, i: SomeInteger): untyped = indexGpuArray(x, i) # forwards to the indexGpuArray template or macro
 96 | # template `[]=`*(x: GpuArrayObj, i: SomeInteger, y: untyped): untyped =
 97 | #   x.p[][i] = y
 98 | 
 99 | # template `[]`*(x: GpuArrayRef, i: SomeInteger): untyped =
100 | #   echo "GAR[]"
101 | #   x.p[][i]
102 | # template `[]=`*(x: GpuArrayRef, i: SomeInteger, y: untyped): untyped =
103 | #   x.p[][i] = y
104 | 
105 | # var threadNum = 0
106 | # var numThreads = 1
107 | # template getThreadNum: untyped = threadNum
108 | # template getNumThreads: untyped = numThreads
109 | template `:=`*(x: GpuArrayObj, y: GpuArrayObj) =
110 |   # Element-wise assignment of y to x. Each thread starts at its own thread
111 |   # number and advances by the total thread count until the end of x.
112 |   mixin getThreadNum, getNumThreads
113 |   let first = getThreadNum()
114 |   let stride = getNumThreads()
115 |   var k = first
116 |   while k < x.n:
117 |     x[k] := y[k]
118 |     k += stride
119 | 
120 | template `:=`*(x: GpuArrayObj, y: SomeNumber) =
121 |   # Assigns the number y to every element of x. Each thread starts at its own
122 |   # thread number and advances by the total thread count until the end of x.
123 |   mixin getThreadNum, getNumThreads
124 |   let first = getThreadNum()
125 |   let stride = getNumThreads()
126 |   var k = first
127 |   while k < x.n:
128 |     x[k] := y
129 |     # move on to the next element handled by this thread
130 |     k += stride
131 | 
132 | template `+=`*(x: GpuArrayObj, y: SomeNumber) =
133 |   # Adds the number y to every element of x. Each thread starts at its own
134 |   # thread number and advances by the total thread count until the end of x.
135 |   mixin getThreadNum, getNumThreads
136 |   let first = getThreadNum()
137 |   let stride = getNumThreads()
138 |   var k = first
139 |   # elements first, first+stride, ... belong to this thread
140 |   while k < x.n:
141 |     x[k] += y
142 |     # move on to the next element handled by this thread
143 |     k += stride
144 | 
template `+=`*(x: GpuArrayObj, y: GpuArrayObj) =
  ## Element-wise in-place addition `x[i] += y[i]`.
  ##
  ## The caller handles the indices `getThreadNum()`,
  ## `getThreadNum() + getNumThreads()`, ... that are below `x.n`.
  ## `getThreadNum` and `getNumThreads` are resolved where the template is
  ## instantiated (`mixin`).
  mixin getThreadNum, getNumThreads
  let
    first = getThreadNum()
    stride = getNumThreads()
  var idx = first
  while idx < x.n:
    x[idx] += y[idx]
    idx += stride
157 | 
proc `+`*[VX,VY,MX,MY:static[int],TX,TY](x: GpuArrayObj[VX,MX,TX], y: GpuArrayObj[VY,MY,TY]): auto =
  ## Whole-array `x + y` for two `GpuArrayObj` values.
  ##
  ## NOTE(review): no element is computed here. The proc declares `r` with the
  ## `V`/`M` parameters of `x` and the element type of `x[0]+y[0]`, prints "+"
  ## through `cprintf`, and returns `r` exactly as declared. This looks like a
  ## placeholder or trace stub; confirm before relying on the result.
  var r: GpuArrayObj[x.V,x.M,type(x[0]+y[0])]
  # when x is GpuArrayObj:
  #   var r: GpuArrayObj[x.V,x.M,type(x[0]+y[0])]
  # else:
  #   var r: GpuArrayRef[x.V,x.M,type(x[0]+y[0])]
  cprintf("+\n")
  r
proc `*`*[VX,VY,MX,MY:static[int],TX,TY](x: GpuArrayObj[VX,MX,TX], y: GpuArrayObj[VY,MY,TY]): auto =
  ## Whole-array `x * y` for two `GpuArrayObj` values.
  ##
  ## NOTE(review): no element is computed here. The proc declares `r` with the
  ## `V`/`M` parameters of `x` and the element type of `x[0]*y[0]`, prints "*"
  ## through `cprintf`, and returns `r` exactly as declared. This looks like a
  ## placeholder or trace stub; confirm before relying on the result.
  var r: GpuArrayObj[x.V,x.M,type(x[0]*y[0])]
  # when x is GpuArrayObj:
  #   var r: GpuArrayObj[x.V,x.M,type(x[0]*y[0])]
  # else:
  #   var r: GpuArrayRef[x.V,x.M,type(x[0]*y[0])]
  cprintf("*\n")
  r
174 | 
# Self-test, compiled only when this module is the main module.
when isMainModule:
  var N = 1000  # element count passed to every newGpuArrayObj call below

  proc testfloat =
    ## Builds three float32 arrays of `N` elements and runs `x += y * z`
    ## inside an `onGpu(1,32)` block.
    # var x,y,z:GpuArrayObj[4,1,float32]
    # x.initGpuArrayObj(N)
    # y.initGpuArrayObj(N)
    # z.initGpuArrayObj(N)
    var x = newGpuArrayObj(4,1,N,float32)
    var y = newGpuArrayObj(4,1,N,float32)
    var z = newGpuArrayObj(4,1,N,float32)
    #cprintf("x.n: %i\n", x.n)
    # NOTE(review): the two arguments of onGpu are presumably launch
    # dimensions — confirm against its definition.
    onGpu(1,32):
      x += y * z
  testfloat()
190 | 
# Disabled tests: `when false` keeps this block from being compiled. They use
# `newGpuArrayRef` and a one-argument `onGpu(N)`, and read `N`, which is only
# declared inside the `isMainModule` block above.
when false:
  proc testcomplex =
    ## `x += y * z` on arrays of `Complex[float32]`.
    var x = newGpuArrayRef[Complex[float32]](N)
    var y = newGpuArrayRef[Complex[float32]](N)
    var z = newGpuArrayRef[Complex[float32]](N)
    onGpu(N):
      x += y * z
  testcomplex()

  proc testcolmat =
    ## `x += y * z` on arrays of `Colmat[3,float32]`.
    var x = newGpuArrayRef[Colmat[3,float32]](N)
    var y = newGpuArrayRef[Colmat[3,float32]](N)
    var z = newGpuArrayRef[Colmat[3,float32]](N)
    #y := 1
    #z := 2
    onGpu(N):
      x += y * z
  testcolmat()
209 | 


--------------------------------------------------------------------------------
/demo3/linalg.nim:
--------------------------------------------------------------------------------
  1 | import qexLite/metaUtils
  2 | import ../inline
  3 | 
  4 | type SomeNumber2* = SomeInteger | SomeReal
  5 | template `:=`*(x: var SomeNumber, y: SomeNumber2) =
  6 |   type tx = type(x)
  7 |   x = (tx)(y)
  8 | template `+=`*(x: var SomeNumber, y: SomeNumber2) =
  9 |   bind `+=`    # So the following += doesn't call this template again.
 10 |   type tx = type(x)
 11 |   x += (tx)(y)
 12 | 
 13 | type
 14 |   Complex*[T] = object
 15 |     re*,im*: T
 16 | proc structSize*[T](t:typedesc[Complex[T]]):int = 2*sizeof(T)
 17 | template `:=`*[T](x: var Complex[T], y: T) =
 18 |   let z = y
 19 |   x.re := z
 20 |   x.im := 0
 21 | template `:=`*[T](x: var Complex[T], y: SomeNumber) =
 22 |   let z = y
 23 |   x.re := z
 24 |   x.im := 0
 25 | template `:=`*[T](x: var Complex[T], y: Complex[T]) =
 26 |   let z = y
 27 |   x.re := z.re
 28 |   x.im := z.im
 29 | template `+=`*[T](x: var Complex[T], y: T) =
 30 |   let z = y
 31 |   x.re += z
 32 | template `+=`*[T](x: var Complex[T], y: SomeNumber) =
 33 |   let z = y
 34 |   x.re += z
 35 | template `+=`*[T](x: var Complex[T], y: Complex[T]) =
 36 |   let z = y
 37 |   x.re += z.re
 38 |   x.im += z.im
 39 | template `+`*[T](x: Complex[T], y:T): untyped =
 40 |   type tx = type(x)
 41 |   var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re+y)]
 42 |   r.re := x.re + y
 43 |   r.im := x.im
 44 |   r
 45 | template `+`*[T](x: Complex[T], y:SomeNumber): untyped =
 46 |   type tx = type(x)
 47 |   var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re+y)]
 48 |   r.re := x.re + y
 49 |   r.im := x.im
 50 |   r
 51 | template `+`*[T](x,y: Complex[T]): untyped =
 52 |   type tx = type(x)
 53 |   var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re+y.re)]
 54 |   r.re := x.re + y.re
 55 |   r.im := x.im + y.im
 56 |   r
 57 | template `*`*[T](x,y: Complex[T]): untyped =
 58 |   type tx = type(x)
 59 |   var r {.noInit.}: tx #Complex[x.T] #Complex[type(x.re*y.re)]
 60 |   r.re := x.re*y.re - x.im*y.im
 61 |   r.im := x.re*y.im + x.im*y.re
 62 |   r
 63 | template `*=`*[T](x: var Complex[T], y: SomeNumber) =
 64 |   let z = y
 65 |   x.re *= z
 66 |   x.im *= z
 67 | template norm2*(xx:Complex):untyped =
 68 |   let x = xx
 69 |   mixin norm2
 70 |   x.re.norm2 + x.im.norm2
 71 | 
 72 | type
 73 |   Colmat*[N:static[int],T] = object
 74 |     d*: array[N,array[N,Complex[T]]]
 75 | proc structSize*[N:static[int],T](t:typedesc[Colmat[N,T]]):int = 2*N*N*sizeof(T)
 76 | template `[]`*(x: Colmat, i,j: int): untyped = x.d[i][j]
 77 | template `:=`*[N:static[int],T](x: var Colmat[N,T], y: SomeNumber) =
 78 |   let z = y
 79 |   staticfor i, 0, N-1:
 80 |     staticfor j, 0, N-1:
 81 |       when i==j:
 82 |         x.d[i][j] := z
 83 |       else:
 84 |         x.d[i][j] := 0
 85 | template `:=`*[N:static[int],T](x: var Colmat[N,T], y: Colmat[N,T]) =
 86 |   let z = y
 87 |   staticfor i, 0, N-1:
 88 |     staticfor j, 0, N-1:
 89 |       x.d[i][j] := z.d[i][j]
 90 | template `+=`*[N:static[int],T](x: var Colmat[N,T], y: Colmat[N,T]) =
 91 |   let z = y
 92 |   staticfor i, 0, N-1:
 93 |     staticfor j, 0, N-1:
 94 |       x.d[i][j] += z.d[i][j]
 95 | template `+`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
 96 |   let xx = x
 97 |   let yy = y
 98 |   var r {.noInit.}: Colmat[N,type(x.d[0][0].re)]
 99 |   staticfor i, 0, N-1:
100 |     staticfor j, 0, N-1:
101 |       r.d[i][j] := xx.d[i][j] + yy.d[i][j]
102 |   r
template `*`*[N:static[int],T](x,y: Colmat[N,T]): untyped =
  ## Matrix product `r[i][j] = sum over k of x[i][k] * y[k][j]`, returned as a
  ## new `Colmat`. Both operands are evaluated once.
  let lhs = x
  let rhs = y
  var r {.noInit.}: Colmat[N,type(x.d[0][0].re)]
  staticfor i, 0, N-1:
    # The k = 0 term initialises row i of the (uninitialised) result ...
    staticfor j, 0, N-1:
      r.d[i][j] := lhs.d[i][0] * rhs.d[0][j]
    # ... and the terms k = 1 .. N-1 are accumulated on top of it.
    staticfor k, 1, N-1:
      staticfor j, 0, N-1:
        r.d[i][j] += lhs.d[i][k] * rhs.d[k][j]
  r
template `*=`*[N:static[int],T](x: var Colmat[N,T], y: SomeNumber) =
  ## Scales every entry of `x` by the number `y`. `y` is evaluated once.
  let factor = y
  staticfor i, 0, N-1:
    staticfor j, 0, N-1:
      x.d[i][j] *= factor
template norm2*(xx:Colmat):untyped =
  ## Sum of `norm2` over all `N*N` entries of the matrix.
  ##
  ## The entry-level `norm2` is resolved at the instantiation site (`mixin`).
  ## `xx` is evaluated once.
  mixin norm2
  let x = xx
  var r {.noinit.}: type(x.d[0][0].re.norm2)
  const n = x.N-1
  # Entry (0,0) seeds the accumulator, so only that one entry is skipped in
  # the loops. (An `and` here would drop the whole first row and column.)
  r = x.d[0][0].norm2
  staticfor i, 0, n:
    staticfor j, 0, n:
      when (i != 0) or (j != 0):
        r += x.d[i][j].norm2
  r
130 | 
when isMainModule:
  # Smoke test: allocate three complex values on the heap, run one
  # multiply-accumulate through the templates above and print the result.
  var
    x: ref Complex[float]
    y: ref Complex[float]
    z: ref Complex[float]
  new(x)
  new(y)
  new(z)
  x[] += y[]*z[]
  echo x[]
138 | 


--------------------------------------------------------------------------------
/demo3/llbits.h:
--------------------------------------------------------------------------------
#ifndef _CUDANIM_LLBITS_H_
#define _CUDANIM_LLBITS_H_
#include<stdint.h>
// Prepare to break the strict aliasing rule.
// RegisterWord: a 32-bit unsigned word, 4-byte aligned and marked may_alias so
// that accesses through it may alias objects of other types.
typedef uint32_t __attribute__((__may_alias__,__aligned__(4))) RegisterWord;
// MemoryWordN: N consecutive RegisterWords (4*N bytes), also may_alias, with
// the struct aligned to its own size. Defined for N = 1, 2, 4, 8, 16, 32.
typedef struct MemoryWord1 {RegisterWord a[1];} __attribute__((__may_alias__,__aligned__(4))) MemoryWord1;
typedef struct MemoryWord2 {RegisterWord a[2];} __attribute__((__may_alias__,__aligned__(8))) MemoryWord2;
typedef struct MemoryWord4 {RegisterWord a[4];} __attribute__((__may_alias__,__aligned__(16))) MemoryWord4;
typedef struct MemoryWord8 {RegisterWord a[8];} __attribute__((__may_alias__,__aligned__(32))) MemoryWord8;
typedef struct MemoryWord16 {RegisterWord a[16];} __attribute__((__may_alias__,__aligned__(64))) MemoryWord16;
typedef struct MemoryWord32 {RegisterWord a[32];} __attribute__((__may_alias__,__aligned__(128))) MemoryWord32;
#endif//_CUDANIM_LLBITS_H_
13 | 


--------------------------------------------------------------------------------
/demo3/out/bludhaven.info:
--------------------------------------------------------------------------------
  1 | nvcc: NVIDIA (R) Cuda compiler driver
  2 | Copyright (c) 2005-2015 NVIDIA Corporation
  3 | Built on Tue_Aug_11_14:27:32_CDT_2015
  4 | Cuda compilation tools, release 7.5, V7.5.17
  5 | g++-4.8 (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4
  6 | Copyright (C) 2013 Free Software Foundation, Inc.
  7 | This is free software; see the source for copying conditions.  There is NO
  8 | warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  9 | 
 10 | processor	: 0
 11 | vendor_id	: GenuineIntel
 12 | cpu family	: 6
 13 | model		: 58
 14 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
 15 | stepping	: 9
 16 | microcode	: 0x12
 17 | cpu MHz		: 1600.000
 18 | cache size	: 8192 KB
 19 | physical id	: 0
 20 | siblings	: 8
 21 | core id		: 0
 22 | cpu cores	: 4
 23 | apicid		: 0
 24 | initial apicid	: 0
 25 | fpu		: yes
 26 | fpu_exception	: yes
 27 | cpuid level	: 13
 28 | wp		: yes
 29 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
 30 | bogomips	: 6784.24
 31 | clflush size	: 64
 32 | cache_alignment	: 64
 33 | address sizes	: 36 bits physical, 48 bits virtual
 34 | power management:
 35 | 
 36 | processor	: 1
 37 | vendor_id	: GenuineIntel
 38 | cpu family	: 6
 39 | model		: 58
 40 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
 41 | stepping	: 9
 42 | microcode	: 0x12
 43 | cpu MHz		: 1600.000
 44 | cache size	: 8192 KB
 45 | physical id	: 0
 46 | siblings	: 8
 47 | core id		: 1
 48 | cpu cores	: 4
 49 | apicid		: 2
 50 | initial apicid	: 2
 51 | fpu		: yes
 52 | fpu_exception	: yes
 53 | cpuid level	: 13
 54 | wp		: yes
 55 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
 56 | bogomips	: 6784.24
 57 | clflush size	: 64
 58 | cache_alignment	: 64
 59 | address sizes	: 36 bits physical, 48 bits virtual
 60 | power management:
 61 | 
 62 | processor	: 2
 63 | vendor_id	: GenuineIntel
 64 | cpu family	: 6
 65 | model		: 58
 66 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
 67 | stepping	: 9
 68 | microcode	: 0x12
 69 | cpu MHz		: 3000.000
 70 | cache size	: 8192 KB
 71 | physical id	: 0
 72 | siblings	: 8
 73 | core id		: 2
 74 | cpu cores	: 4
 75 | apicid		: 4
 76 | initial apicid	: 4
 77 | fpu		: yes
 78 | fpu_exception	: yes
 79 | cpuid level	: 13
 80 | wp		: yes
 81 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
 82 | bogomips	: 6784.24
 83 | clflush size	: 64
 84 | cache_alignment	: 64
 85 | address sizes	: 36 bits physical, 48 bits virtual
 86 | power management:
 87 | 
 88 | processor	: 3
 89 | vendor_id	: GenuineIntel
 90 | cpu family	: 6
 91 | model		: 58
 92 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
 93 | stepping	: 9
 94 | microcode	: 0x12
 95 | cpu MHz		: 1600.000
 96 | cache size	: 8192 KB
 97 | physical id	: 0
 98 | siblings	: 8
 99 | core id		: 3
100 | cpu cores	: 4
101 | apicid		: 6
102 | initial apicid	: 6
103 | fpu		: yes
104 | fpu_exception	: yes
105 | cpuid level	: 13
106 | wp		: yes
107 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
108 | bogomips	: 6784.24
109 | clflush size	: 64
110 | cache_alignment	: 64
111 | address sizes	: 36 bits physical, 48 bits virtual
112 | power management:
113 | 
114 | processor	: 4
115 | vendor_id	: GenuineIntel
116 | cpu family	: 6
117 | model		: 58
118 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
119 | stepping	: 9
120 | microcode	: 0x12
121 | cpu MHz		: 1600.000
122 | cache size	: 8192 KB
123 | physical id	: 0
124 | siblings	: 8
125 | core id		: 0
126 | cpu cores	: 4
127 | apicid		: 1
128 | initial apicid	: 1
129 | fpu		: yes
130 | fpu_exception	: yes
131 | cpuid level	: 13
132 | wp		: yes
133 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
134 | bogomips	: 6784.24
135 | clflush size	: 64
136 | cache_alignment	: 64
137 | address sizes	: 36 bits physical, 48 bits virtual
138 | power management:
139 | 
140 | processor	: 5
141 | vendor_id	: GenuineIntel
142 | cpu family	: 6
143 | model		: 58
144 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
145 | stepping	: 9
146 | microcode	: 0x12
147 | cpu MHz		: 1600.000
148 | cache size	: 8192 KB
149 | physical id	: 0
150 | siblings	: 8
151 | core id		: 1
152 | cpu cores	: 4
153 | apicid		: 3
154 | initial apicid	: 3
155 | fpu		: yes
156 | fpu_exception	: yes
157 | cpuid level	: 13
158 | wp		: yes
159 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
160 | bogomips	: 6784.24
161 | clflush size	: 64
162 | cache_alignment	: 64
163 | address sizes	: 36 bits physical, 48 bits virtual
164 | power management:
165 | 
166 | processor	: 6
167 | vendor_id	: GenuineIntel
168 | cpu family	: 6
169 | model		: 58
170 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
171 | stepping	: 9
172 | microcode	: 0x12
173 | cpu MHz		: 1600.000
174 | cache size	: 8192 KB
175 | physical id	: 0
176 | siblings	: 8
177 | core id		: 2
178 | cpu cores	: 4
179 | apicid		: 5
180 | initial apicid	: 5
181 | fpu		: yes
182 | fpu_exception	: yes
183 | cpuid level	: 13
184 | wp		: yes
185 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
186 | bogomips	: 6784.24
187 | clflush size	: 64
188 | cache_alignment	: 64
189 | address sizes	: 36 bits physical, 48 bits virtual
190 | power management:
191 | 
192 | processor	: 7
193 | vendor_id	: GenuineIntel
194 | cpu family	: 6
195 | model		: 58
196 | model name	: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
197 | stepping	: 9
198 | microcode	: 0x12
199 | cpu MHz		: 1600.000
200 | cache size	: 8192 KB
201 | physical id	: 0
202 | siblings	: 8
203 | core id		: 3
204 | cpu cores	: 4
205 | apicid		: 7
206 | initial apicid	: 7
207 | fpu		: yes
208 | fpu_exception	: yes
209 | cpuid level	: 13
210 | wp		: yes
211 | flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms
212 | bogomips	: 6784.24
213 | clflush size	: 64
214 | cache_alignment	: 64
215 | address sizes	: 36 bits physical, 48 bits virtual
216 | power management:
217 | 
218 | /homes/xjin/tmp/samples/1_Utilities/deviceQuery/deviceQuery Starting...
219 | 
220 |  CUDA Device Query (Runtime API) version (CUDART static linking)
221 | 
222 | Detected 1 CUDA Capable device(s)
223 | 
224 | Device 0: "Quadro 600"
225 |   CUDA Driver Version / Runtime Version          8.0 / 7.5
226 |   CUDA Capability Major/Minor version number:    2.1
227 |   Total amount of global memory:                 964 MBytes (1010761728 bytes)
228 |   ( 2) Multiprocessors, ( 48) CUDA Cores/MP:     96 CUDA Cores
229 |   GPU Max Clock rate:                            1280 MHz (1.28 GHz)
230 |   Memory Clock rate:                             800 Mhz
231 |   Memory Bus Width:                              128-bit
232 |   L2 Cache Size:                                 131072 bytes
233 |   Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048)
234 |   Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
235 |   Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
236 |   Total amount of constant memory:               65536 bytes
237 |   Total amount of shared memory per block:       49152 bytes
238 |   Total number of registers available per block: 32768
239 |   Warp size:                                     32
240 |   Maximum number of threads per multiprocessor:  1536
241 |   Maximum number of threads per block:           1024
242 |   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
243 |   Max dimension size of a grid size    (x,y,z): (65535, 65535, 65535)
244 |   Maximum memory pitch:                          2147483647 bytes
245 |   Texture alignment:                             512 bytes
246 |   Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
247 |   Run time limit on kernels:                     Yes
248 |   Integrated GPU sharing Host Memory:            No
249 |   Support host page-locked memory mapping:       Yes
250 |   Alignment requirement for Surfaces:            Yes
251 |   Device has ECC support:                        Disabled
252 |   Device supports Unified Addressing (UVA):      Yes
253 |   Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
254 |   Compute Mode:
255 |      < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
256 | 
257 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 8.0, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = Quadro 600
258 | Result = PASS
259 | [CUDA Bandwidth Test] - Starting...
260 | Running on...
261 | 
262 |  Device 0: Quadro 600
263 |  Quick Mode
264 | 
265 |  Host to Device Bandwidth, 1 Device(s)
266 |  PINNED Memory Transfers
267 |    Transfer Size (Bytes)	Bandwidth(MB/s)
268 |    33554432			6488.5
269 | 
270 |  Device to Host Bandwidth, 1 Device(s)
271 |  PINNED Memory Transfers
272 |    Transfer Size (Bytes)	Bandwidth(MB/s)
273 |    33554432			6483.5
274 | 
275 |  Device to Device Bandwidth, 1 Device(s)
276 |  PINNED Memory Transfers
277 |    Transfer Size (Bytes)	Bandwidth(MB/s)
278 |    33554432			20514.2
279 | 
280 | Result = PASS
281 | 
282 | NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
283 | 


--------------------------------------------------------------------------------
/demo3/qexLite/alignedMem.nim:
--------------------------------------------------------------------------------
 1 | import strUtils
 2 | import stdUtils
 3 | 
type
  alignedMem*[T] = object
    ## Buffer of `len` elements of `T` whose first element sits on an
    ## `align`-byte boundary. Filled in by `new` / `newU`.
    len*: int      # number of elements requested
    align*: int    # requested alignment in bytes
    stride*: int   # sizeof(T)
    bytes*: int    # size of the raw allocation: len*stride + align
    mem*: ref cArray[char]   # the raw allocation that owns the storage
    data*: ptr cArray[T]     # start of `mem`, rounded up to a multiple of `align`
12 | 
proc unsafeNewU*[T](a: var ref T, size: Natural) =
  ## Like `unsafeNew(a, size)`, but the C code generated for that call is
  ## compiled with the macro `newObj` defined as `newObjNoInit`.
  ##
  ## NOTE(review): presumably this skips zero-initialisation of the new block.
  ## It relies on the runtime providing `newObjNoInit` and on `unsafeNew`
  ## expanding to a `newObj` call — confirm for the compiler and GC in use.
  # Prototype for the replacement allocator.
  {.emit: "N_NIMCALL(void*, newObjNoInit)(TNimType* typ0, NI size0);".}
  # Redirect newObj for the statement that follows ...
  {.emit: "#define newObj newObjNoInit".}
  unsafeNew(a, size)
  # ... and restore it afterwards.
  {.emit: "#undef newObj".}
18 | 
proc ptrAlign[T](p:ptr T; a:int):ptr T =
  ## Returns `p` moved forward to the next address that is a multiple of `a`;
  ## an address that already is a multiple of `a` is returned unchanged.
  let
    address = cast[ByteAddress](p)
    top = address + (a - 1)
  # Dropping the remainder of `top` lands on the multiple of `a` at or above
  # the original address.
  cast[ptr T](top - (top mod a))
25 | 
proc new*[T](t:var alignedMem[T], n:int, align:int=64) =
  ## Allocates storage for `n` elements of `T` through `unsafeNew` and points
  ## `t.data` at the first `align`-byte boundary inside it.
  ##
  ## `align` extra bytes are allocated so that rounding the start address up
  ## still leaves room for all `n` elements.
  t.len = n
  t.align = align
  t.stride = sizeof(T)
  t.bytes = n * t.stride + align
  unsafeNew(t.mem, t.bytes)
  let base = cast[ptr cArray[T]](t.mem[0].addr)
  t.data = ptrAlign(base, align)
proc newU*[T](t:var alignedMem[T], n:int, align:int=64) =
  ## Same as `new`, except that the storage is obtained through `unsafeNewU`.
  ##
  ## `align` extra bytes are allocated so that rounding the start address up
  ## still leaves room for all `n` elements.
  t.len = n
  t.align = align
  t.stride = sizeof(T)
  t.bytes = n * t.stride + align
  unsafeNewU(t.mem, t.bytes)
  let base = cast[ptr cArray[T]](t.mem[0].addr)
  t.data = ptrAlign(base, align)
proc newAlignedMem*[T](t:var alignedMem[T], n:int, align:int=64) =
  ## In-place form: sets up `t` for `n` elements via `new`.
  new(t, n, align)

proc newAlignedMem*[T](n:int, align:int=64): alignedMem[T] =
  ## Value-returning form of the in-place `newAlignedMem`.
  newAlignedMem(result, n, align)

proc newAlignedMemU*[T](t:var alignedMem[T], n:int, align:int=64) =
  ## In-place form: sets up `t` for `n` elements via `newU`.
  newU(t, n, align)

proc newAlignedMemU*[T](n:int, align:int=64): alignedMem[T] =
  ## Value-returning form of the in-place `newAlignedMemU`.
  newAlignedMemU(result, n, align)
48 | 
template low*(s:alignedMem):untyped =
  ## First valid index, always 0.
  0
template high*(s:alignedMem):untyped =
  ## Last valid index, `s.len-1`.
  s.len-1
proc `[]`*[T](s:alignedMem[T], i:SomeInteger):var T =
  ## Element `i` of the aligned data, returned as a mutable location.
  result = s.data[i]
#template `[]`*[T](s:alignedMem[T], i:SomeInteger):untyped = s.data[i]
template `[]`*[T](s:var alignedMem[T], i:SomeInteger):untyped =
  ## Element `i` of the aligned data for a mutable buffer.
  s.data[i]
template `[]=`*[T](s:var alignedMem[T], i:SomeInteger, v:untyped) =
  ## Stores `v` into element `i` of the aligned data.
  s.data[i] = v
57 | 
when isMainModule:
  # Allocate ten floats, print the raw and the aligned start address (decimal
  # and hex), then check a write/read round trip through the index operators.
  var x: alignedMem[float]
  newAlignedMem(x, 10)
  let rawAddr = cast[ByteAddress](x.mem[0].addr)
  echo rawAddr, " ", toHex(rawAddr,8)
  let dataAddr = cast[ByteAddress](x[0].addr)
  echo dataAddr, " ", toHex(dataAddr,8)

  for i in 0..<x.len:
    x[i] = float(i)
  for i in x.low..x.high:
    assert(x[i] == float(i))
70 | 


--------------------------------------------------------------------------------
/demo3/qexLite/comms/comms.nim:
--------------------------------------------------------------------------------
1 | import commsQmp
2 | export commsQmp
3 | 


--------------------------------------------------------------------------------
/demo3/qexLite/comms/commsQmp.nim:
--------------------------------------------------------------------------------
  1 | import base/threading
  2 | import times
  3 | import os
  4 | import macros
  5 | import strUtils
  6 | import qmp
  7 | 
# Node number of this process and total number of nodes. The defaults below
# hold until commsInit overwrites them with the values reported by QMP.
var myRank* = 0
var nRanks* = 1
 10 | 
proc commsInit* =
  ## Initialises QMP message passing and records this process' node number and
  ## the node count in `myRank` / `nRanks`.
  # The C symbols cmdCount / cmdLine are imported and passed by address as the
  # program's argc / argv.
  var argc {.importc:"cmdCount", global.}:cint
  var argv {.importc:"cmdLine", global.}:ptr cstring
  # Requested thread level; the same variable also receives the level that
  # QMP reports back (passed once by value, once by address).
  var prv = QMP_THREAD_FUNNELED
  #var prv = QMP_THREAD_SERIALIZED
  # NOTE(review): the returned status `err` is never inspected.
  let err = QMP_init_msg_passing(argc.addr, argv.addr, prv, prv.addr)
  myRank = int(QMP_get_node_number())
  nRanks = int(QMP_get_number_of_nodes())
 19 | proc commsFinalize* =
 20 |   QMP_finalize_msg_passing()
 21 | proc commsAbort*(status = -1) =
 22 |   QMP_abort(status.cint)
 23 | 
 24 | proc evalArgs*(call:var NimNode; args:NimNode):NimNode =
 25 |   result = newStmtList()
 26 |   for i in 0..<args.len:
 27 |     let t = genSym()
 28 |     let a = args[i]
 29 |     result.add(quote do:
 30 |       let `t` = `a`
 31 |       )
 32 |     call.add(t)
 33 | proc cprintf*(fmt:cstring){.importc:"printf",varargs,header:"<stdio.h>".}
 34 | #proc printfOrdered(
 35 | macro printf*(fmt:string; args:varargs[untyped]):auto =
 36 |   var call = newCall(ident("cprintf"), fmt)
 37 |   result = evalArgs(call, args)
 38 |   result.add(quote do:
 39 |     if myRank==0 and threadNum==0:
 40 |       `call`
 41 |     )
 42 | proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".}
 43 | macro echoAll*(args:varargs[untyped]):auto =
 44 |   var call = newCall(bindSym"echoRaw")
 45 |   result = evalArgs(call, args)
 46 |   result.add(quote do:
 47 |     `call`
 48 |     )
 49 | macro echoRank*(args:varargs[untyped]):auto =
 50 |   var call = newCall(bindSym"echoRaw")
 51 |   call.add ident"myRank"
 52 |   call.add newLit"/"
 53 |   call.add ident"nRanks"
 54 |   call.add newLit": "
 55 |   result = evalArgs(call, args)
 56 |   template f(x:untyped):untyped =
 57 |     if threadNum==0: x
 58 |   result.add getAst(f(call))
 59 | macro echo0*(args: varargs[untyped]): auto =
 60 |   var call = newCall(bindSym"echoRaw")
 61 |   result = evalArgs(call, args)
 62 |   result.add(quote do:
 63 |     if myRank==0 and threadNum==0:
 64 |       `call`
 65 |     )
macro makeEchos(n:static[int]):auto =
  ## Generates exported `echo` templates taking 1, 2, ..., `n` untyped
  ## arguments. Each one expands to an `echoRaw(...)` call under `when nimvm`
  ## and to an `echo0(...)` call otherwise.
  # Skeleton of one generated template; its parameter list starts out empty
  # and is filled in below.
  template ech(x,y: untyped): untyped =
    template echo*(): untyped =
      when nimvm:
        x
      else:
        y
  result = newStmtList()
  var er = newCall(bindSym"echoRaw")   # compile-time branch, grows by one argument per iteration
  var e0 = newCall(bindSym"echo0")     # run-time branch, grows by one argument per iteration
  var ea = newSeq[NimNode](0)          # parameter definitions `a1: untyped`, `a2: untyped`, ...
  for i in 1..n:
    let ai = ident("a" & $i)
    er.add ai
    e0.add ai
    ea.add newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode())
    # NOTE(review): this relies on getAst capturing `er`/`e0` as they are now
    # (with i arguments), not as they are after later iterations — confirm.
    var t = getAst(ech(er,e0))
    #echo t.treerepr
    # t[0][3] is presumably the formal-parameter node of the generated
    # template definition; append the i parameters to it.
    for j in 0..<i:
      t[0][3].add ea[j]
    result.add t
  #echoAll result.repr
# Instantiate the overloads for up to ten arguments.
makeEchos(10)
 89 | 
 90 | proc unwrap(x:NimNode):seq[NimNode] =
 91 |   result = @[]
 92 |   let t = x.getType
 93 |   #echo x.treeRepr
 94 |   #echo t.treeRepr
 95 |   #echo t.typekind
 96 |   if t.typekind==ntyTuple:
 97 |     let n = t.len - 1
 98 |     for i in 0..<n:
 99 |       let id = newLit(i)
100 |       result.add(quote do:
101 |         `x`[`id`]
102 |         )
103 |   else:
104 |     result.add(quote do:
105 |       `x`[]
106 |       )
107 |   #echo result.repr
108 | 
macro rankSumN*(a:varargs[typed]):auto =
  ## Rank-sum of several typed arguments.
  ##
  ## * One argument: `qmpSum(arg)` on thread 0.
  ## * All arguments of the same float type (`float32` or `float`): they are
  ##   copied into one array `t`, summed with a single `qmpSum(t)` on thread 0
  ##   and copied back.
  ## * Otherwise: the first argument that does not fit is split with `unwrap`
  ##   (tuple fields, or a dereference) and the flattened list is handed back
  ##   to `rankSum`.
  ##
  ## Mixing different float types is rejected at compile time.
  #echo "rankSum: ", a.repr
  #echo a.treeRepr
  var i0 = 0   # index of the argument to unwrap; -1 means "all plain floats"
  let t0 = a[0].getType
  if a.len==1:
    let a0 = a[0]
    result = quote do:
      if threadNum==0:
        qmpSum(`a0`)
    return
  #echo t0.repr
  #echo t0.typekind
  if t0.typekind==ntyFloat32 or t0.typekind==ntyFloat:
    #echo "got float"
    i0 = -1
    for i in 1..<a.len:
      let ti = a[i].getType
      #echo ti.repr
      if ti.repr != t0.repr:
        # Test the kind of the argument's type; an `is` test on the NimNode
        # itself can never match a float type.
        if ti.typekind in {ntyFloat, ntyFloat32, ntyFloat64}:
          error("can't mix float types in rankSum")
        i0 = i
        break
  if i0<0:
    var s = newNimNode(nnkStmtList)
    let t = ident("t")
    for i in 0..<a.len:
      # a[i] = t[i], built directly so that it does not depend on how `quote`
      # wraps a single statement or converts an int.
      let elem = newNimNode(nnkBracketExpr).add(t, newLit(i))
      s.add newAssignment(a[i], elem)
    result = quote do:
      if threadNum==0:
        var `t` = `a`
        qmpSum(`t`)
        `s`
  else:
    result = newCall(ident("rankSum"))
    for i in 0..<a.len:
      if i==i0:
        for part in unwrap(a[i]):
          result.add(part)
      else:
        result.add(a[i])
  #echo result.repr
macro rankSum*(a:varargs[untyped]):auto =
  ## Entry point for rank sums: a single argument becomes `qmpSum(arg)` on
  ## thread 0; several arguments are forwarded unchanged to `rankSumN`.
  if a.len != 1:
    result = newCall(ident("rankSumN"))
    for arg in a:
      result.add arg
  else:
    let only = a[0]
    result = quote do:
      if threadNum==0:
        qmpSum(`only`)
164 | 
165 | #var count = 0
template threadRankSum1*(a:untyped):untyped =
  ## Sums `a` over all threads and, through `rankSum`, over all ranks; every
  ## thread ends up with the total in `a`.
  ##
  ## Thread 0 adds the other threads' values (read through the pointers they
  ## published in `threadLocals.share`), calls `rankSum`, and stores the total
  ## in the shared global `ta`; the other threads then copy `ta` back.
  ## NOTE(review): correctness depends on the pairing of `t0wait()` /
  ## `twait0()` calls on both sides — confirm their semantics in the threading
  ## module before reordering anything here.
  mixin rankSum
  #[
  #if threadNum==0: inc count
  #threadBarrier()
  threadLocals.share[threadNum].p = a.addr
  #echoAll count, " ", myrank, " ", threadNum, " v: ", cast[ByteAddress](a.addr)
  #echoAll count, " ", myrank, " ", threadNum, " s: ", ptrInt(threadLocals.share)
  if threadNum==0:
    #threadBarrier()
    t0wait()
    for i in 1..<numThreads:
      #echo "test1"
      #echo count, " ", i, " ", cast[ByteAddress](threadLocals.share[i].p)
      a += cast[ptr type(a)](threadLocals.share[i].p)[]
      #echo "test2"
    rankSum(a)
    #threadBarrier()
    twait0()
    #threadBarrier()
    t0wait()
  else:
    #threadBarrier()
    t0wait()
    #threadBarrier()
    twait0()
    a = cast[ptr type(a)](threadLocals.share[0].p)[]
    #threadBarrier()
    t0wait()
  ]#
  # One shared slot per instantiation that carries the total from thread 0 to
  # the other threads.
  var ta{.global.}:type(a)
  #var ta2{.global.}:array[512,type(a)]
  if threadNum==0:
    t0wait()
    # Accumulate the values the other threads published by address.
    for i in 1..<numThreads:
      a += cast[ptr type(a)](threadLocals.share[i].p)[]
      #a += ta2[threadNum]
    rankSum(a)
    ta = a
    twait0()
  else:
    # Publish the address of this thread's value for thread 0 to read.
    threadLocals.share[threadNum].p = a.addr
    #ta2[threadNum] = a
    t0wait()
    twait0()
    a = ta
212 | 
proc threadRankSumN*(a:NimNode):auto =
  ## Builds the code for a thread-level sum of every child of `a`.
  ##
  ## For each argument a global array with 512 slots is declared and the
  ## calling thread stores its value at index `threadNum`. Between two
  ## `threadBarrier()` calls every thread then replaces its argument by the
  ## sum of slots `0 ..< numThreads`.
  ##
  ## The arrays get compiler-generated unique names, so the generated code can
  ## appear more than once in the same scope.
  ##
  ## NOTE(review): despite the name, no `rankSum` call is generated here —
  ## confirm whether a cross-rank reduction is intended.
  #echo a.treeRepr
  result = newNimNode(nnkStmtList)
  var sum = newNimNode(nnkStmtList)
  let tid = ident("threadNum")
  let nid = ident("numThreads")
  let p = newLit(1)   # spacing between consecutive threads' slots
  for i in 0..<a.len:
    let gi = genSym(nskVar, "g" & $i)
    let ai = a[i]
    result.add(quote do:
      var `gi`{.global.}:array[`p`*512,type(`ai`)]
      `gi`[`p`*`tid`] = `ai`
      )
    let s = quote do:
      `ai` = `gi`[0]
      for i in 1..<`nid`:
        `ai` += `gi`[`p`*i]
    sum.add(s)
  let m = quote do:
    threadBarrier()
    `sum`
    threadBarrier()
  result.add(m)
  #echo result.treeRepr
237 |   #echo result.treeRepr
macro threadRankSum*(a:varargs[untyped]):auto =
  ## Dispatches on the argument count: one argument expands to
  ## `threadRankSum1(arg)`, several arguments go through `threadRankSumN`.
  template single(x:untyped):untyped = threadRankSum1(x)
  if a.len != 1:
    result = threadRankSumN(a)
  else:
    result = getAst(single(a[0]))
244 | 
245 | 
# Self-test and micro-benchmark, compiled only when this module is the main
# module.
when isMainModule:
  commsInit()
  echo "rank ", myRank, "/", nRanks
  printf("rank %i/%i\n", myRank, nRanks)
  threads:
    echo threadNum, "/", numThreads
    # Every (rank, thread) pair contributes a distinct value in 0 ..< n, so
    # the first reduction should give s = 0 + 1 + ... + (n-1).
    let n = nRanks * numThreads
    let s = (n*(n-1)) div 2
    var x = myRank*numThreads + threadNum
    threadRankSum(x)
    echo threadNum, ": ", x, "  ", s
    # Now all n participants hold s, so the second reduction should give n*s.
    threadRankSum(x)
    echo threadNum, ": ", x, "  ", n*s

    let nrep = 1000   # repetitions per timing loop

    # Average time of a bare threadBarrier.
    threadBarrier()
    var t0 = epochTime()
    for i in 1..nrep:
      threadBarrier()
    var t1 = epochTime()
    echo "threadBarrier time: ", int(1e9*(t1-t0)/nrep.float), " ns"

    # Average time of threadSum on a float.
    var f = 0.1
    threadBarrier()
    t0 = epochTime()
    for i in 1..nrep:
      threadSum(f)
    t1 = epochTime()
    echo "threadSum(float) time: ", int(1e9*(t1-t0)/nrep.float), " ns"

    # Average time of threadRankSum on a float.
    f = 0.1
    threadBarrier()
    t0 = epochTime()
    for i in 1..nrep:
      threadRankSum(f)
    t1 = epochTime()
    echo "threadRankSum(float) time: ", int(1e9*(t1-t0)/nrep.float), " ns"

    # Average time of rankSum on a float, measured on thread 0 only.
    f = 0.1
    threadBarrier()
    if threadNum==0:
      t0 = epochTime()
      for i in 1..nrep:
        rankSum(f)
      t1 = epochTime()
      echo "rankSum(float) time: ", int(1e9*(t1-t0)/nrep.float), " ns"
    threadBarrier()

  commsFinalize()
296 | 


--------------------------------------------------------------------------------
/demo3/qexLite/comms/qmp.nim:
--------------------------------------------------------------------------------
 1 | import os
 2 | import macros
 3 | 
# Locate the QMP installation at compile time: use the QMPDIR environment
# variable when it is set, otherwise "lqcd/install/qmp" appended to the home
# directory.
when existsEnv("QMPDIR"):
  const qmpDir = getEnv("QMPDIR")
else:
  const homeDir = getHomeDir()
  const qmpDir = homeDir & "lqcd/install/qmp"
# Pass the QMP include directory to the C compiler and link against libqmp.
{. passC: "-I" & qmpDir & "/include" .}
{. passL: "-L" & qmpDir & "/lib -lqmp" .}
# Shorthand pragma: import a symbol from C as declared in qmp.h.
{. pragma: qmp, importc, header:"qmp.h" .}
12 | 
# Status type returned by QMP_init_msg_passing (imported from qmp.h).
# NOTE(review): `test` looks like a placeholder member; the real enumerators
# of the C type are not listed here -- confirm before relying on its values.
type QMP_status_t{.qmp.} = enum
  test
# Thread support levels passed to and returned by QMP_init_msg_passing.
type QMP_thread_level_t*{.qmp.} = enum
  QMP_THREAD_SINGLE,
  QMP_THREAD_FUNNELED,
  QMP_THREAD_SERIALIZED,
  QMP_THREAD_MULTIPLE
# QMP handle types from qmp.h, represented on the Nim side as untyped pointers.
type
  QMP_msgmem_t*{.qmp.} = pointer
  QMP_msghandle_t*{.qmp.} = pointer
23 | 
# Bindings to functions declared in qmp.h.  Each proc is imported under the
# same name through the `qmp` pragma; no Nim-side logic is added here.

# Start-up: takes pointers to argc/argv, the required thread level, and a
# pointer through which the provided thread level is returned.
proc QMP_init_msg_passing*(argc:ptr cint; argv:ptr ptr cstring;
                           required:QMP_thread_level_t;
                           provided:ptr QMP_thread_level_t):QMP_status_t{.qmp.}
proc QMP_finalize_msg_passing*() {.qmp.}
proc QMP_abort*(error_code:cint) {.qmp.}
# Number of nodes and this node's number.
proc QMP_get_number_of_nodes*():cint {.qmp.}
proc QMP_get_node_number*():cint {.qmp.}
proc QMP_barrier*() {.qmp.}
# Sum reductions that operate through the given pointer; the array variants
# additionally take the number of elements.
proc QMP_sum_float*(value:ptr cfloat) {.qmp.}
proc QMP_sum_double*(value:ptr cdouble) {.qmp.}
proc QMP_sum_float_array*(value:ptr cfloat, length:cint) {.qmp.}
proc QMP_sum_double_array*(value:ptr cdouble, length:cint) {.qmp.}
36 | 
proc qmpSum*(v:var int) =
  ## Sums the integer `v` in place.  The value makes a round trip through a
  ## float because the reduction used is `QMP_sum_double`.
  var asDouble = float(v)
  QmpSumDouble(addr asDouble)
  v = int(asDouble)
41 | 
# Scalar overloads: reduce a single float32/float64 in place through its
# address.
template qmpSum*(v:float32):untyped = QmpSumFloat(v.addr)
template qmpSum*(v:float64):untyped = QmpSumDouble(v.addr)
# Pointer overloads: reduce `n` consecutive values starting at `v`.
template qmpSum*(v:ptr float32, n:int):untyped = QmpSumFloatArray(v,n.cint)
template qmpSum*(v:ptr float64, n:int):untyped = QmpSumDoubleArray(v,n.cint)
# Pointer to array: forwards the address of the first element with the count
# multiplied by the array length.
template qmpSum*(v:ptr array, n:int):untyped =
  qmpSum(v[][0].addr, n*v[].len)
template qmpSum*(v:ptr tuple, n:int):untyped =
  ## Sums `n` consecutive tuples starting at `v` by forwarding the address of
  ## the first field together with the total number of fields.
  ## The per-tuple field count is computed from the size of the tuple itself
  ## (`v[]`) and of its first field (`v[][0]`); measuring the pointer `v`
  ## would give the pointer size instead.  This assumes every field has the
  ## same size as the first one.
  qmpSum(v[][0].addr, n*(sizeOf(v[]) div sizeOf(v[][0])))
# Pointer to object: dereferences the pointer, applies `[]` to the object and
# forwards the address of the result with the same count.
# NOTE(review): this relies on the object type providing a `[]` operator that
# yields an addressable value -- confirm against the types used by callers.
template qmpSum*(v:ptr object, n:int):untyped =
  qmpSum(v[][].addr, n)
52 | #template qmpSum*(v:ptr typed, n:int):untyped =
53 | #  qmpSum(v[][].addr, n)
54 | #template QmpSum(v:array[int,int]):untyped =
55 | #  var tQmpSumDoubleArray(v)
template qmpSum*[I,T](v:array[I,T]):untyped =
  ## Array overload: forwards the address of element 0 and the array length
  ## to the pointer overload that matches the element type.
  qmpSum(addr(v[0]), len(v))
58 | #template qmpSum*(v:openArray[float64]):untyped =
59 | #  QmpSumDoubleArray(v[0].addr,v.len.cint)
template qmpSum*[T](v:seq[T]):untyped =
  ## Seq overload: forwards the address of element 0 and the seq length to
  ## the pointer overload that matches the element type.
  qmpSum(addr(v[0]), len(v))
62 | #template qmpSum*[I,T](v:seq[array[I,T]]):untyped =
63 | #  qmpSum(v[0][0].addr, v.len.cint*sizeOf(v[0]))
64 | #template qmpSum*(v:openArray[array]):untyped =
65 | #  qmpSum(v[0][0].addr, v.len.cint*sizeOf(v[0]))
template qmpSum*(v:tuple):untyped =
  ## Tuple overload: forwards the address of the first field and a count
  ## equal to the tuple size divided by the size of the first field.
  qmpSum(addr(v[0]), sizeOf(v) div sizeOf(v[0]))
68 | #template qmpSum*[T](v:T):untyped =
69 | #template qmpSum*(v:typed):untyped =
70 | #  qmpSum(v[])
71 | #template qmpSum*[T](v:T):untyped =
72 | #  qmpSum(v[])
# Catch-all overload.  When `numberType(v)` is float64 the value is
# reinterpreted as a run of float64 covering sizeof(v) bytes and summed as
# such; otherwise the call is retried on `v[]`.
# NOTE(review): `numberType` is not defined in this file -- presumably it
# reports the underlying scalar type of `v`; confirm.
template qmpSum*(v: typed): untyped =
  when numberType(v) is float64:
    qmpSum(cast[ptr float64](addr v), sizeof(v) div sizeof(float64))
  else:
    qmpSum(v[])
78 | 
# Smoke test: initialise QMP, print this node's number and the node count,
# then shut QMP down.
when isMainModule:
  # Command-line count and vector, imported from the C symbols
  # cmdCount / cmdLine.
  var argc {.importc:"cmdCount", global.}:cint
  var argv {.importc:"cmdLine", global.}:ptr cstring
  # `prv` is both the requested level and the variable receiving the provided
  # level.
  var prv = QMP_THREAD_SERIALIZED
  # NOTE(review): the returned status `err` is never inspected.
  let err = QMP_init_msg_passing(argc.addr, argv.addr, prv, prv.addr)
  let rank = QMP_get_node_number()
  let size = QMP_get_number_of_nodes()
  echo "rank " & $rank & "/" & $size
  QMP_finalize_msg_passing()
88 | 


--------------------------------------------------------------------------------
/demo3/qexLite/config.nims:
--------------------------------------------------------------------------------
1 | ../config.nims


--------------------------------------------------------------------------------
/demo3/qexLite/layout.nim:
--------------------------------------------------------------------------------
1 | import layout/layoutX
2 | export layoutX
3 | import layout/shifts
4 | export shifts
5 | 


--------------------------------------------------------------------------------
/demo3/qexLite/layout/qlayout.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdio.h>
  3 | #include "qlayout.h"
  4 | 
// Allocation routine used in this file (plain malloc, results unchecked).
#define myalloc malloc
// Print the label s, then the n elements of v, each preceded by a space and
// formatted with the printf conversion f, then a newline.
#define PRINTV(s,f,v,n) do { printf(s);                 \
    for(int _i=0; _i<n; _i++) printf(" "f, (v)[_i]);    \
    printf("\n"); } while(0)
  9 | 
 10 | void
 11 | layoutSubset(Subset *s, Layout *l, char *sub)
 12 | {
 13 |   s->begin = 0;
 14 |   s->end = l->nSites;
 15 |   s->beginOuter = 0;
 16 |   s->endOuter = l->nSitesOuter;
 17 |   if(sub[0]=='e') {
 18 |     s->end = l->nEven;
 19 |     s->endOuter = l->nEvenOuter;
 20 |   } else if(sub[0]=='o') {
 21 |     s->begin = l->nOdd;
 22 |     s->beginOuter = l->nOddOuter;
 23 |   }
 24 | }
 25 | 
 26 | void
 27 | layoutSetup(Layout *l)
 28 | {
 29 |   int nd = l->nDim;
 30 |   l->outerGeom = myalloc(nd*sizeof(int));
 31 |   l->localGeom = myalloc(nd*sizeof(int));
 32 |   int pvol=1, lvol=1, ovol=1, icb=0, icbd=-1;
 33 |   for(int i=0; i<nd; i++) {
 34 |     l->localGeom[i] = l->physGeom[i]/l->rankGeom[i];
 35 |     l->outerGeom[i] = l->localGeom[i]/l->innerGeom[i];
 36 |     pvol *= l->physGeom[i];
 37 |     lvol *= l->localGeom[i];
 38 |     ovol *= l->outerGeom[i];
 39 |     if(l->innerGeom[i]>1 && (l->outerGeom[i]&1)==1) icb++;
 40 |     if(l->innerGeom[i]==1 && (l->outerGeom[i]&1)==0) icbd = i;
 41 |   }
 42 |   if(icb==0) {
 43 |     icbd = 0;
 44 |   } else {
 45 |     if(icbd<0) {
 46 |       if(l->myrank==0) {
 47 | 	printf("not enough 2's in localGeom\n");
 48 | 	PRINTV("physGeom:", "%i", l->physGeom, nd);
 49 | 	PRINTV("rankGeom:", "%i", l->rankGeom, nd);
 50 | 	PRINTV("localGeom:", "%i", l->localGeom, nd);
 51 | 	PRINTV("outerGeom:", "%i", l->outerGeom, nd);
 52 | 	PRINTV("innerGeom:", "%i", l->innerGeom, nd);
 53 |       }
 54 |       exit(-1);
 55 |     }
 56 |     icb = l->outerGeom[icbd]/2;
 57 |     if((icb&1)==0) {
 58 |       if(l->myrank==0) {
 59 | 	printf("error in cb choice\n");
 60 | 	PRINTV("physGeom:", "%i", l->physGeom, nd);
 61 | 	PRINTV("rankGeom:", "%i", l->rankGeom, nd);
 62 | 	PRINTV("localGeom:", "%i", l->localGeom, nd);
 63 | 	PRINTV("outerGeom:", "%i", l->outerGeom, nd);
 64 | 	PRINTV("innerGeom:", "%i", l->innerGeom, nd);
 65 | 	printf("innerCb: %i\n", icb);
 66 | 	printf("innerCbDir: %i\n", icbd);
 67 |       }
 68 |       exit(-1);
 69 |     }
 70 |   }
 71 |   l->physVol = pvol;
 72 |   l->nSites = lvol;
 73 |   l->nOdd = lvol/2;
 74 |   l->nEven = lvol - l->nOdd;
 75 |   l->nSitesOuter = ovol;
 76 |   l->nOddOuter = ovol/2;
 77 |   l->nEvenOuter = ovol - l->nOddOuter;
 78 |   l->nSitesInner = l->nSites/l->nSitesOuter;
 79 |   l->innerCb = icb;
 80 |   l->innerCbDir = icbd;
 81 |   if(l->myrank==0) {
 82 |     printf("#innerCb: %i\n", icb);
 83 |     printf("#innerCbDir: %i\n", icbd);
 84 |   }
 85 | }
 86 | 
 87 | static void
 88 | lex_x(int *x, int l, int *s, int ndim)
 89 | {
 90 |   for(int i=0; i<ndim; i++) {
 91 |   //for(int i=ndim-1; i>=0; --i) {
 92 |     x[i] = l % s[i];
 93 |     l = l / s[i];
 94 |   }
 95 | }
 96 | 
 97 | // x[0] is fastest
 98 | static int
 99 | lex_i(int *x, int *s, int *d, int ndim)
100 | {
101 |   int l = 0;
102 |   for(int i=ndim-1; i>=0; --i) {
103 |     int xx = x[i];
104 |     if(d) xx /= d[i];
105 |     l = l*s[i] + (xx%s[i]);
106 |   }
107 |   return l;
108 | }
109 | 
// Disabled (compiled out by #if 0): variant of lex_i that walks the
// dimensions in increasing order, making x[0] the slowest coordinate.
#if 0
// x[0] is slowest
static int
lexr_i(int *x, int *s, int *d, int ndim)
{
  int l = 0;
  for(int i=0; i<ndim; i++) {
    int xx = x[i];
    if(d) xx /= d[i];
    l = l*s[i] + (xx%s[i]);
  }
  return l;
}
#endif
124 | 
125 | void
126 | layoutIndex(Layout *l, LayoutIndex *li, int coords[])
127 | {
128 |   int nd = l->nDim;
129 |   int ri = lex_i(coords, l->rankGeom, l->localGeom, nd);
130 |   int ii = lex_i(coords, l->innerGeom, l->outerGeom, nd);
131 |   int ib = 0;
132 |   for(int i=0; i<nd; i++) {
133 |     int xi = coords[i]/l->outerGeom[i];
134 |     int li = xi % l->innerGeom[i];
135 |     ib += li * l->outerGeom[i];
136 |   }
137 |   ib &= 1;
138 |   coords[l->innerCbDir] += l->innerCb * ib;
139 |   int oi = lex_i(coords, l->outerGeom, NULL, nd);
140 |   coords[l->innerCbDir] -= l->innerCb * ib;
141 |   int p = 0;
142 |   for(int i=0; i<nd; i++) p += coords[i];
143 |   int oi2 = oi/2;
144 |   if(p&1) oi2 = (oi+l->nSitesOuter)/2;
145 |   li->rank = ri;
146 |   li->index = oi2*l->nSitesInner + ii;
147 | }
148 | 
149 | void
150 | layoutCoord(Layout *l, int *coords, LayoutIndex *li)
151 | {
152 |   int nd = l->nDim;
153 |   int cr[nd];
154 |   lex_x(cr, li->rank, l->rankGeom, nd);
155 |   int p = 0;
156 |   int ll = li->index % l->nSitesInner;
157 |   int ib = 0;
158 |   for(int i=0; i<nd; i++) {
159 |     int w = l->innerGeom[i];
160 |     int wl = l->outerGeom[i];
161 |     int k = ll % w;
162 |     int c = l->localGeom[i]*cr[i] + k*wl;
163 |     cr[i] = c;
164 |     //printf("cr[%i]: %i\n", i, c);
165 |     p += c;
166 |     ll = ll / w;
167 |     ib += k*wl;
168 |   }
169 |   ib &= 1;
170 |   int ii = li->index / l->nSitesInner;
171 |   if(ii>=l->nEvenOuter) {
172 |     ii -= l->nEvenOuter;
173 |     p++;
174 |   }
175 |   ii *= 2;
176 |   for(int i=0; i<nd; i++) {
177 |     int wl = l->outerGeom[i];
178 |     int k = ii % wl;
179 |     if(i==l->innerCbDir) k = (k + l->innerCb * ib)%wl;
180 |     coords[i] = k;
181 |     //printf("coords[%i]: %i\n", i, k);
182 |     p += k;
183 |     ii = ii / wl;
184 |   }
185 |   if(p&1) {
186 |     for(int i=0; i<nd; i++) {
187 |       int wl = l->outerGeom[i];
188 |       if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl;
189 |       coords[i]++;
190 |       if(coords[i]>=wl) {
191 | 	coords[i] = 0;
192 | 	if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl;
193 |       } else {
194 | 	if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl;
195 | 	break;
196 |       }
197 |     }
198 |   }
199 |   for(int i=0; i<nd; i++) coords[i] += cr[i];
200 |   {
201 |     LayoutIndex li2;
202 |     layoutIndex(l, &li2, coords);
203 |     if(li->rank!=li2.rank ||li->index!=li2.index) {
204 |       printf("error: bad coord:\n");
205 |       printf(" %i,%i -> %i %i %i %i -> %i,%i\n", li->rank, li->index,
206 | 	     coords[0],coords[1],coords[2],coords[3], li2.rank, li2.index);
207 |       exit(-1);
208 |     }
209 |   }
210 | }
211 | 
212 | void
213 | layoutShift(Layout *l, LayoutIndex *li, LayoutIndex *li2, int *disp)
214 | {
215 |   int nd = l->nDim;
216 |   int x[nd];
217 |   layoutCoord(l, x, li2);
218 |   for(int i=0; i<nd; i++) {
219 |     x[i] = (x[i] + disp[i] + l->physGeom[i])%l->physGeom[i];
220 |   }
221 |   layoutIndex(l, li, x);
222 | }
223 | 


--------------------------------------------------------------------------------
/demo3/qexLite/layout/qlayout.h:
--------------------------------------------------------------------------------
  1 | #include "qmp.h"
  2 | 
// Singly linked list node carrying an untyped payload pointer.
typedef struct llist {
  void *value;
  struct llist *next;
} llist;
  7 | 
  8 | // k = sum_i sum_j ((x[i]/d[i][j])%m[i][j])*f[i][j]
  9 | // x[i] = sum_j ((k/f[i][j])%m[i][j])*d[i][j]
 10 | // parity?
 11 | 
// Description of how a lattice is divided.  nDim, physGeom, rankGeom,
// innerGeom and myrank are read by layoutSetup, which fills in the derived
// fields.
typedef struct {
  int nDim;        // number of dimensions
  int *physGeom;   // extent of the whole lattice per dimension
  int *rankGeom;   // divisor of physGeom per dimension (localGeom = physGeom/rankGeom)
  int *innerGeom; //wrap  (divisor of localGeom per dimension)
  int *outerGeom; //wls   (localGeom/innerGeom, allocated by layoutSetup)
  int *localGeom;  // physGeom/rankGeom, allocated by layoutSetup
  int physVol;     // product of physGeom
  int nEven;       // nSites - nOdd
  int nOdd;        // nSites/2
  int nSites;      // product of localGeom
  int nEvenOuter;  // nSitesOuter - nOddOuter
  int nOddOuter;   // nSitesOuter/2
  int nSitesOuter; // product of outerGeom
  int nSitesInner; // nSites/nSitesOuter
  int innerCb;     // checkerboard shift applied along innerCbDir (0 if unused)
  int innerCbDir;  // direction along which innerCb is applied
  llist *shifts;   // NOTE(review): not touched by qlayout.c -- purpose to confirm
  int nranks;      // NOTE(review): not touched by qlayout.c -- presumably total rank count
  int myrank;      // this rank; rank 0 prints the diagnostics
} Layout;
 33 | 
// Location of one site as produced by layoutIndex: the rank and the site
// index on that rank.
typedef struct {
  int rank;
  int index;
} LayoutIndex;
 38 | 
// Site ranges filled in by layoutSubset: begin/end are bounded by the site
// counts (nSites, nEven), beginOuter/endOuter by the outer-site counts.
typedef struct {
  int begin;
  int end;
  int beginOuter;
  int endOuter;
} Subset;
 45 | 
// Fill in the derived fields of *l (geometries, volumes, checkerboard shift).
void layoutSetup(Layout *l);
// coords[] -> (rank, index), written to *li.
void layoutIndex(Layout *l, LayoutIndex *li, int coords[]);
// (rank, index) in *li -> coords[].
void layoutCoord(Layout *l, int coords[], LayoutIndex *li);
// *li receives the index of the site displaced by disp[] from *li2.
void layoutShift(Layout *l, LayoutIndex *li, LayoutIndex *li2, int disp[]);
// Fill *s with the site ranges for the subset selected by sub[0] ('e', 'o', other = all).
void layoutSubset(Subset *s, Layout *l, char *sub);
 51 | 
// Description of a gather.  NOTE(review): the code using this struct is not
// in this file; judging by the names it lists, per destination index, the
// source rank/index, plus the sites this rank must send -- confirm.
typedef struct {
  int myRank;
  int nIndices;
  int *srcRanks;
  int *srcIndices;
  int nRecvDests;
  int nSendIndices;
  int *sendSrcIndices;
  int *sendDestRanks;
  int *sendDestIndices;
} GatherDescription;
 63 | 
// Index tables for a gather, linked to its GatherDescription through gd.
// NOTE(review): the code using this struct is not in this file; judging by
// the names the receive and send sides are each grouped per rank with sizes
// and offsets -- confirm.
typedef struct {
  GatherDescription *gd;
  int myRank;
  int nIndices;
  int *srcIndices;
  int nRecvRanks;
  int *recvRanks;
  int *recvRankSizes;
  int *recvRankOffsets;
  int recvSize;
  int nRecvDests;
  int *recvDestIndices;
  int *recvBufIndices;
  int nSendRanks;
  int *sendRanks;
  int *sendRankSizes;
  int *sendRankOffsets;
  int sendSize; // same as nSendIndices
  int nSendIndices;
  int *sendIndices;
} GatherIndices;
 85 | 
 86 | // per gather:
 87 | //  pidx
 88 | //  recv
 89 | // combined:
 90 | //  send*
 91 | 
// Index tables for one shift, linked to the underlying GatherIndices
// through gi.  NOTE(review): the code using this struct is not in this file;
// the meaning of the "...1" fields and of vv/perm/pack/blend must be
// confirmed there.
typedef struct {
  GatherIndices *gi;
  int *disp;
  int *sidx;
  int *pidx;
  int nRecvRanks;
  int *recvRanks;
  int *recvRankSizes;
  int *recvRankSizes1;
  int *recvRankOffsets;
  int *recvRankOffsets1;
  int nRecvSites;
  int nRecvSites1;
  int nRecvDests;
  int *recvDests;
  int *recvLocalSrcs;
  int *recvRemoteSrcs;
  int nSendRanks;
  int *sendRanks;
  int *sendRankSizes;
  int *sendRankSizes1;
  int *sendRankOffsets;
  int *sendRankOffsets1;
  int nSendSites;
  int nSendSites1;
  int *sendSites;
  int vv;
  //int offr, lenr, nthreads;
  int perm;
  int pack;
  int blend;
  // The QMP handles below are commented out here; ShiftBuf declares members
  // with the same names.
  //QMP_msgmem_t sqmpmem;
  //QMP_msghandle_t smsg;
  //QMP_msgmem_t rqmpmem;
  //QMP_msghandle_t rmsg;
  //QMP_msghandle_t pairmsg;
} ShiftIndices;
129 | 
// Communication state for a shift: QMP message memory and handles plus raw
// buffers with their sizes.  NOTE(review): presumably the s*/r* prefixes
// stand for send/receive; the roles of first, offr, lenr and nthreads are
// not visible in this file -- confirm.
typedef struct {
  QMP_msgmem_t sqmpmem;
  QMP_msghandle_t smsg;
  QMP_msgmem_t rqmpmem;
  QMP_msghandle_t rmsg;
  QMP_msghandle_t pairmsg;
  char *sbuf;
  char *rbuf;
  int sbufSize;
  int rbufSize;
  int first;
  int *offr;
  int *lenr;
  int *nthreads;
} ShiftBuf;
145 | 
// Function type for a gather map callback.  NOTE(review): presumably writes
// the source rank/index for the destination (dstRank, *dstIdx), with args as
// user data -- confirm against the gather code.
typedef void GatherMap(int *srcRank, int *srcIdx, int dstRank, int *dstIdx, void *args);
147 | 


--------------------------------------------------------------------------------
/demo3/qexLite/metaUtils.nim:
--------------------------------------------------------------------------------
  1 | import macros
  2 | import strUtils
  3 | 
  4 | proc symToIdent*(x: NimNode): NimNode =
  5 |   case x.kind:
  6 |     of nnkCharLit..nnkUInt64Lit:
  7 |       result = newNimNode(x.kind)
  8 |       result.intVal = x.intVal
  9 |     of nnkFloatLit..nnkFloat64Lit:
 10 |       result = newNimNode(x.kind)
 11 |       result.floatVal = x.floatVal
 12 |     of nnkStrLit..nnkTripleStrLit:
 13 |       result = newNimNode(x.kind)
 14 |       result.strVal = x.strVal
 15 |     of nnkIdent, nnkSym:
 16 |       result = newIdentNode($x)
 17 |     of nnkOpenSymChoice:
 18 |       result = newIdentNode($x[0])
 19 |     else:
 20 |       result = newNimNode(x.kind)
 21 |       for c in x:
 22 |         result.add symToIdent(c)
 23 | 
 24 | macro getConst*(x: static[int]): auto =
 25 |   return newLit(x)
 26 | #macro getConst*(x: typed): auto =
 27 |   #echo x.treerepr
 28 |   #result = newLit(3)
 29 |   #result = newLit(x.intVal)
 30 | 
 31 | macro delayExpansion*(x:untyped):auto = result = x
 32 | 
 33 | macro `$`*(t:typedesc):auto =
 34 |   result = newLit(t.getType[1].repr)
 35 | 
 36 | macro echoType*(x:typed):auto =
 37 |   result = newEmptyNode()
 38 |   let t1 = x.getType
 39 |   echo t1.treeRepr
 40 |   echo t1.getType.treeRepr
 41 | macro echoType*(x:typedesc):auto =
 42 |   result = newEmptyNode()
 43 |   let t1 = x.getType
 44 |   echo t1.treeRepr
 45 |   echo t1[1].getType.treeRepr
 46 | 
 47 | macro treerep*(x:typed):auto =
 48 |   echo x.treeRepr
 49 |   newEmptyNode()
 50 | 
 51 | macro echoAst*(x:untyped):untyped =
 52 |   echo x.treeRepr
 53 |   x
 54 | 
 55 | #template dump*(x:untyped):untyped =
 56 | #  echo $(x)
 57 | #  echo astToStr(x)
 58 | #  echo repr(x)
macro dump*(x:untyped):untyped =
  ## Expands to `echo x, ": ", <expr>`, where `<expr>` is the expression
  ## parsed from the string value of `x[0]`.
  ## NOTE(review): this indexes the first child of `x`, so `x` must be a node
  ## whose first child is a string literal -- confirm how callers pass it.
  let s = x[0].strVal
  #echo s
  let v = parseExpr(s)
  #echo v.treeRepr
  #echo v.toStrLit.treeRepr
  result = quote do:
    echo `x`, ": ", `v`
 67 | 
macro toId*(s:static[string]):untyped =
  ## Prints `s` at compile time and expands to an identifier named `s`.
  echo s
  newIdentNode(!s)

macro toId*(s:typed):untyped =
  ## Debug variant for non-static arguments: only prints the argument tree.
  ## NOTE(review): `result` is never assigned (the identifier construction
  ## is commented out) -- confirm that this overload is meant to be usable.
  echo s.treeRepr
  #newIdentNode(!s)
 75 | 
 76 | macro toString*(id:untyped):untyped =
 77 |   #echo id.repr
 78 |   echo id.treeRepr
 79 |   if id.kind==nnkSym:
 80 |     result = newLit($id)
 81 |   else:
 82 |     result = newLit($id[0])
 83 | 
 84 | macro catId*(x:varargs[untyped]):auto =
 85 |   #echo x.repr
 86 |   var s = ""
 87 |   for i in 0..<x.len:
 88 |      s &= x[i].repr
 89 |   result = ident(s)
 90 | 
macro setType*(x:untyped; s:static[string]):auto =
  ## Expands to `type x* = <s>`: an exported alias named `x` for the type
  ## whose name is the compile-time string `s`.
  let t = ident(s)
  result = quote do:
    type `x`* = `t`
 95 | 
macro map*(a:tuple; f:untyped; p:varargs[untyped]):untyped =
  ## Applies `f` to every field of the tuple `a`.  Expands to a parenthesised
  ## list `(name0: f(a.name0, p...), name1: f(a.name1, p...), ...)` that
  ## reuses the field names found in the tuple's type; the extra arguments
  ## `p` are appended to each call.
  #echo a.treeRepr
  #echo f.treeRepr
  #echo p.treeRepr
  let ti = a.getTypeImpl  # one child per tuple field
  #echo ti.treeRepr
  let nargs = ti.len
  #echo nargs
  result = newPar()
  for i in 0..<nargs:
    #let c = newCall(f,newTree(nnkBracketExpr,a,newLit(i)))
    let c = newCall(f,newDotExpr(a,ti[i][0]))  # ti[i][0] is the field name
    for pp in p: c.add(pp)
    #result.add(newColonExpr(ident("field" & $i),c))
    result.add(newColonExpr(ti[i][0],c))
  #echo result.repr
112 | 
macro makeCall*(op:static[string],a:tuple):untyped =
  ## Expands to a call of the routine named `op`, passing `a[i][1]` for each
  ## of the `a.getType.len - 1` tuple elements.  Prints `op`, the argument
  ## and the generated call at compile time.
  ## NOTE(review): `a[i][1]` assumes every child of `a` is a pair-like node
  ## whose second child is the value -- confirm with the callers.
  echo op
  echo a.repr
  #echo a[0].repr
  echo a.treeRepr
  result = newCall(!op)
  let nargs = a.getType.len - 1
  for i in 0..<nargs:
    result.add(a[i][1])
  echo result.repr
  #echo result.treeRepr
124 | 
125 | #macro makeCall*(op:static[string]; a:typed):untyped =
macro makeCall*(op:static[string],a:typed,idx:typed):untyped =
  ## Expands to `op(a[0][idx])` or `op(a[0][idx], a[1][idx])` depending on
  ## whether `a.getType.len - 1` is 1 or 2.  Any other count stops
  ## compilation through `quit` with an error message.
  #echo op
  #echo a.repr
  #echo a.treeRepr
  #echo a.getType.treeRepr
  #echo a.getType.len
  var opid = !op
  let nargs = a.getType.len - 1
  case nargs
    of 1:
      return quote do:
        `opid`(`a`[0][`idx`])
    of 2:
      return quote do:
        `opid`(`a`[0][`idx`],`a`[1][`idx`])
    else:
      quit("makeCall: unhandled number of arguments " & $nargs)
143 | 
proc evalBackticR(body:NimNode):NimNode =
  ## Copies `body`, replacing every backtick-quoted node by one identifier
  ## whose name is the concatenated `repr` of the quoted parts.
  if body.kind != nnkAccQuoted:
    result = copyNimNode(body)
    for child in body.children:
      result.add(evalBackticR(child))
  else:
    var name = ""
    for piece in body:
      name.add $piece.repr
    result = newIdentNode(name)

macro evalBacktic*(body:untyped):untyped =
  ## Applies `evalBackticR` to the whole untyped argument.
  evalBackticR(body)
159 | 
proc replace(id,val,body:NimNode):NimNode =
  ## Returns `body` with every subtree that compares equal to `id` replaced
  ## by `val`.  All other nodes are copied, so `body` itself is not modified.
  if body != id:
    result = copyNimNode(body)
    for child in body.children:
      result.add(replace(id, val, child))
  else:
    result = val
172 | 
macro makeTyped*(x:typed):auto =
  ## Returns the argument unchanged after it has been type-checked.
  result = x
macro makeUntyped*(x:untyped):auto =
  ## Returns the argument unchanged, without type-checking it first.
  result = x
175 | 
macro echoTyped*(x:typed):auto =
  ## Prints the `repr` of the type-checked argument at compile time; expands
  ## to nothing.
  echo x.repr
  result = newEmptyNode()
macro echoTypedTree*(x:typed):auto =
  ## Prints the tree of the type-checked argument at compile time; expands
  ## to nothing.
  echo x.treeRepr
  result = newEmptyNode()
182 | 
macro teeTyped*(x:typed):auto =
  ## Prints the `repr` of the type-checked argument at compile time and
  ## returns the argument unchanged.
  echo x.repr
  result = x

macro teeTypedTree*(x:typed):auto =
  ## Prints the tree of the type-checked argument at compile time and
  ## returns the argument unchanged.
  echo x.treeRepr
  result = x
190 | 
proc dumpTyped(r:var NimNode; x:NimNode) =
  ## Rewrites `r` into a statement list that first passes `x`, wrapped in a
  ## `block`, to `echoTyped` and then contains the previous value of `r`.
  r = quote do:
    echoTyped:
      block:
        `x`
    `r`
197 | 
# Compile-time counter used to build unique identifier names.
var idNum{.compiletime.} = 1
macro makeUnique*(x:varargs[untyped]):auto =
  ## The last argument is the body; every earlier argument is an identifier
  ## that gets replaced throughout the body by a new identifier made of its
  ## name minus the last two characters, the current `idNum`, and "_".
  ## Each original name is printed at compile time, and the result is
  ## wrapped by `dumpTyped`, so the body is also printed through `echoTyped`.
  result = x[^1]
  #echo result.repr
  for i in 0..(x.len-2):
    echo x[i].repr
    let v = ident(($x[i])[0..^3] & $idNum & "_")
    idNum.inc
    result = replace(x[i], v, result)
  #echo result.repr
  result.dumpTyped(result)
209 | 
macro subst*(x:varargs[untyped]):auto =
  ## Arguments come as `name, replacement` pairs followed by the body.
  ## Every occurrence of `name` in the body is replaced by its replacement;
  ## a replacement written as `_` stands for a fresh identifier
  ## `<name>_<idNum>_`.
  let count = x.len
  result = x[count-1]
  for k in countup(0, count-3, 2):
    let target = x[k]
    var substitute = x[k+1]
    if substitute.repr == "_":
      substitute = ident($target & "_" & $idNum & "_")
      idNum.inc
    result = replace(target, substitute, result)
225 | 
proc separateStmtListExpr(st: var NimNode, stex: NimNode): NimNode =
  ## Hoists the leading statements of every statement-list expression inside
  ## `stex` into `st` and returns the expression that remains, with the same
  ## treatment applied recursively to its children.
  if stex.kind != nnkStmtListExpr:
    result = copyNimNode(stex)
    for child in stex:
      result.add separateStmtListExpr(st, child)
  else:
    for k in 0 ..< stex.len-1:
      st.add stex[k]
    result = separateStmtListExpr(st, stex[^1])
235 | 
# Expands to `let a = b`; used by `lets` through getAst to bind a generated
# symbol to an expression.
template newLet(a,b: untyped): untyped =
  let a = b
  #mixin simpleAssign
  #var a{.noInit.}:type(b)
  #simpleAssign(a, b)
241 | 
macro lets*(x:varargs[untyped]):auto =
  ## Arguments come as `name, expr` pairs followed by the body.  Each `name`
  ## in the body is replaced by its `expr`, with two adjustments:
  ##   * statements found in statement-list expressions inside `expr` are
  ##     moved in front of the body;
  ##   * when what remains is an infix expression it is bound once to a
  ##     generated `let` symbol and the symbol is substituted instead (a
  ##     "let: ..." line is printed at compile time).
  var prestmts = newStmtList()  # statements emitted before the body
  result = x[^1]
  #echo "begin lets: ", result.repr
  for i in countup(0, x.len-3, 2):
    #echo x[i].repr, " ", x[i+1].repr
    var t = separateStmtListExpr(prestmts, x[i+1])
    #echo t.repr
    if t.kind == nnkInfix:
      echo "let: ", t.repr
      let v = genSym(nskLet, $x[i])
      prestmts.add getAst(newLet(v,t))
      t = v
    result = replace(x[i], t, result)
  result = newStmtList(prestmts, result)
  #echo result.repr
  #echo "lets: "
  #result.dumpTyped(result)
260 | 
macro forStaticX2(a,b:static[int]; index,body:untyped):untyped =
  ## Unrolls `body` for every integer in `a..b`: each copy has `index`
  ## replaced by the literal value and is wrapped in its own block.
  result = newStmtList()
  for k in a..b:
    let unrolled = replace(index, newIntLitNode(k), body)
    result.add(newBlockStmt(unrolled))
271 | 
macro forStaticX(slice: Slice[int]; index,body: untyped): untyped =
  ## Unrolls `body` over a slice whose bounds are read as integer literals
  ## from `slice[1][1]` and `slice[1][2]`; each copy has `index` replaced by
  ## the literal value and is wrapped in its own block.
  let bounds = slice[1]
  let first = bounds[1].intVal
  let last = bounds[2].intVal
  result = newStmtList()
  for k in first..last:
    let unrolled = replace(index, newIntLitNode(k), body)
    result.add(newBlockStmt(unrolled))
286 | 
287 | #template forStatic*(index,slice,body:untyped):untyped =
288 | #  bind forStaticX
289 | #  forStaticX(slice, index, body)
290 | 
291 | # template forStatic*(index,i0,i1,body:untyped):untyped =
292 | #   bind forStaticX2
293 | #   forStaticX2(i0, i1, index, body)
294 | # Nim bug prevents it to work in simd...
# Ordinary runtime loop over i0..i1; the macro-based unrolling variants above
# are commented out.
template forStatic*(index,i0,i1,body:untyped):untyped =
  for index in i0..i1: body
macro staticFor*(index:untyped,a,b:static[int],body:untyped):untyped =
  ## Unrolls `body` for every integer in `a..b`: each copy has `index`
  ## replaced by the literal value and is wrapped in its own block.
  result = newStmtList()
  for k in a..b:
    let unrolled = replace(index, newIntLitNode(k), body)
    result.add(newBlockStmt(unrolled))
302 | 
# Loop over r0..r1 with body b.  When both bounds can be bound to constants
# (checked with `compiles`) the loop goes through `forStatic`, otherwise a
# plain `for` is used.
template forOpt*(i,r0,r1,b:untyped):untyped =
  when compiles((const x=r0;const y=r1;x)):
    forStatic i, r0, r1:
      b
  else:
    for i in r0..r1:
      b
310 | 
# Tree-walking helpers for macro bodies.  All three are {.dirty.}: they
# define `recurse` in the caller's scope, and `action` can refer to the
# current node as `it`.

# Copies `body` into the caller's `result`; for every child `it`, `action`
# runs before the child itself is copied recursively.
template depthFirst*(body:untyped; action:untyped):untyped {.dirty.} =
  proc recurse(body:NimNode):NimNode =
    #echo body.treeRepr
    result = copyNimNode(body)
    for it in body:
      #echo "it: ", it.treeRepr
      action
      result.add recurse(it)
    #echo result.repr
  result = recurse(body)
  #echo result.treeRepr
  #echo result.repr
# Like depthFirst, but `action` runs on the node itself (including the root)
# before it is copied, and `it` is a `var`, so `action` may reassign it.
template depthFirst2*(body:untyped; action:untyped):untyped {.dirty.} =
  proc recurse(it:var NimNode):NimNode =
    action
    result = copyNimNode(it)
    for c in it:
      var cc = c
      result.add recurse(cc)
  var b{.genSym.} = body
  result = recurse(b)
# Visit only: runs `action` on every node starting at the root; builds no
# result.
template depthFirst3*(body:untyped; action:untyped):untyped {.dirty.} =
  proc recurse(it:NimNode) =
    action
    for c in it:
      recurse(c)
  recurse(body)
338 | 
macro addImportC(prefix=""; body:untyped):auto =
  ## For every proc definition visited by `depthFirst` in `body`, adds the
  ## pragma `importC: "<prefix><proc name>"`, creating the pragma list when
  ## the proc has none.
  #echo body.treeRepr
  let p = prefix.strVal
  depthFirst(body):
    if it.kind==nnkProcDef:
      if it.pragma.kind == nnkEmpty:
        it.pragma = newNimNode(nnkPragma)
      it.pragma.add newColonExpr(ident("importC"), newLit(p & $it.name))
macro addPragma(prg:string; body:untyped):auto =
  ## Parses the string `prg` as an expression and copies its children into
  ## the pragma list of every proc definition visited by `depthFirst` in
  ## `body`, creating the pragma list when the proc has none.
  #echo prg.repr
  let p = parseExpr(prg.strVal)
  #echo p.treerepr
  depthFirst(body):
    if it.kind==nnkProcDef:
      if it.pragma.kind == nnkEmpty:
        it.pragma = newNimNode(nnkPragma)
      p.copyChildrenTo it.pragma
macro addReturnType(t:untyped; body:untyped):auto =
  ## Sets the return type of every proc definition visited by `depthFirst`
  ## in `body` to `t`, by overwriting slot 0 of the proc's formal parameters
  ## (child 3 of the definition).
  #echo t.repr
  #echo t.treerepr
  let tt = t
  depthFirst(body):
    if it.kind==nnkProcDef:
      it[3][0] = tt
macro addArgTypes(t:varargs[untyped]; body:untyped):auto =
  ## Appends one parameter per type in `t` to every proc definition visited
  ## by `depthFirst` in `body`.  The parameters are named `a`, `b`, `c`, ...
  ## in the order of the given types.
  #echo t.repr
  #echo t.treerepr
  # (an unused copy of `t` was dropped here; nothing read it)
  var a = newSeq[NimNode]()
  for i in 0..<t.len:
    a.add newIdentDefs(ident($chr(ord('a')+i)),t[i])
  depthFirst(body):
    if it.kind==nnkProcDef:
      # child 3 of a proc definition holds its formal parameters
      for s in a: it[3].add s
373 | 
374 | #nnkPostfix(nnkIdent(!"*"), nnkIdent(!"hello"))
375 | 
macro neverInit*(p:untyped):auto =
  ## Pragma-style macro for a routine definition: emits
  ## `#define memset(a,b,c)` at the start of the routine body and
  ## `#undef memset` at its end, so memset calls generated in between expand
  ## to nothing in the C output.
  #echo p.treeRepr
  result = p
  template def = {.emit:"#define memset(a,b,c)".}
  template undef = {.emit:"#undef memset".}
  insert(result.body, 0, getAst(def()))
  add(result.body, getAst(undef()))
  #echo result.treeRepr
384 | 


--------------------------------------------------------------------------------
/demo3/qexLite/omp.nim:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
# OpenMP bindings.  With -d:noOpenmp everything collapses to single-thread
# stubs; otherwise the real functions from omp.h are used and OMPFlag
# (default "-fopenmp", overridable at compile time) is passed to both the
# compiler and the linker.
when defined(noOpenmp):
  template omp_set_num_threads*(x: cint) = discard
  template omp_get_num_threads*(): cint = 1
  template omp_get_max_threads*(): cint = 1
  template omp_get_thread_num*(): cint = 0
  template ompPragma(p:string):untyped = discard
  template setupGc = discard
else:
  const OMPFlag {.strDefine.} = "-fopenmp"
  {. passC: OMPFlag .}
  {. passL: OMPFlag .}
  {. pragma: omp, header:"omp.h" .}
  proc omp_set_num_threads*(x: cint) {.omp.}
  proc omp_get_num_threads*(): cint {.omp.}
  proc omp_get_max_threads*(): cint {.omp.}
  proc omp_get_thread_num*(): cint {.omp.}
  # Emits "#pragma omp <p>" on a line of its own in the generated C code.
  template ompPragma(p:string):untyped =
    {. emit:"\n#pragma omp " & p .}
  # Calls setupForeignThreadGc on every thread except thread 0.
  template setupGc =
    if(omp_get_thread_num()!=0): setupForeignThreadGc()
# Emits "#pragma omp barrier".
template ompBarrier* = ompPragma("barrier")
# Emits "#pragma omp <p>" immediately followed by `body` in its own block.
template ompBlock(p:string; body:untyped):untyped =
  ompPragma(p)
  block:
    body
29 | 
# Runs `body` under "#pragma omp parallel"; setupGc() is called first inside
# the region.
template ompParallel*(body:untyped):untyped =
  ompBlock("parallel"):
    setupGc()
    body
# Wrappers that run `body` under the "master", "single" and "critical"
# OpenMP pragmas respectively.
template ompMaster*(body:untyped):untyped =
  ompBlock("master"):
    body
template ompSingle*(body:untyped):untyped =
  ompBlock("single"):
    body
template ompCritical*(body:untyped):untyped =
  ompBlock("critical"):
    body
37 | 
# Smoke test: prints thread number / thread count outside the parallel
# region, then inside it before and after master, single and critical
# sections.
when isMainModule:
  proc test =
    echo "main: ", ompGetThreadNum(), "/", ompGetNumThreads()
    ompParallel:
      echo "parallel: ", ompGetThreadNum(), "/", ompGetNumThreads()
      ompBarrier()
      ompMaster:
        echo "master: ", ompGetThreadNum(), "/", ompGetNumThreads()
      echo "parallel: ", ompGetThreadNum(), "/", ompGetNumThreads()
      ompSingle:
        echo "single: ", ompGetThreadNum(), "/", ompGetNumThreads()
      echo "parallel: ", ompGetThreadNum(), "/", ompGetNumThreads()
      ompCritical:
        echo "critical: ", ompGetThreadNum(), "/", ompGetNumThreads()
      echo "parallel: ", ompGetThreadNum(), "/", ompGetNumThreads()
      ompBarrier()
  test()
55 | 


--------------------------------------------------------------------------------
/demo3/qexLite/simd.nim:
--------------------------------------------------------------------------------
 1 | #import simdGcc
 2 | #export simdGcc
 3 | 
 4 | when defined(SSE) or defined(AVX) or defined(AVX512):
 5 |   import simd/simdX86
 6 |   export simdX86
 7 | elif defined(QPX):
 8 |   import simd/simdQpx
 9 |   export simdQpx
10 | 
11 | #import simd/simdGeneric
12 | #export simdGeneric
13 | 
14 | 
# Lane-wise conversions between the 4-wide double and single vector types,
# available only when SimdS4 is declared.
when declared(SimdS4):
  proc toSingle*(x: SimdD4): SimdS4 {.inline,noInit.} =
    ## Copies each of the four lanes of `x` into the result.
    for lane in 0..3:
      result[lane] = x[lane]
  proc toDouble*(x: SimdS4): SimdD4 {.inline,noInit.} =
    ## Copies each of the four lanes of `x` into the result.
    for lane in 0..3:
      result[lane] = x[lane]
  proc inorm2*(r:var SimdD4; x:SimdS4) {.inline.} =
    ## Converts `x` to double first, then forwards to the SimdD4 `inorm2`.
    let widened = toDouble(x)
    inorm2(r, widened)
25 | 
# Assignments between 4-wide single and double vectors go through
# toSingle / toDouble.
# NOTE(review): unlike the procs above, these templates are not guarded by
# `when declared(SimdS4)` -- confirm that SimdS4/SimdD4 always exist here.
template assign*(r: array[4,float32], x: SimdD4): untyped =
  assign(r, toSingle(x))
template assign*(r: SimdS4, x: SimdD4): untyped =
  r = toSingle(x)
template assign*(r: SimdD4, x: SimdS4): untyped =
  r = toDouble(x)
32 | 
# Mixed-precision helpers for the 8-wide types, available only when both
# SimdD8 and SimdS8 are declared.  Single-precision operands are widened
# with toDouble before the double-precision operation is called.
# NOTE(review): toDouble for SimdS8 is not defined in this file -- it
# presumably comes from the imported SIMD module; confirm.
when declared(SimdD8) and declared(SimdS8):
  proc toSingle*(x: SimdD8): SimdS8 {.inline,noInit.} =
    for i in 0..<8:
      result[i] = x[i]
  template assign*(r: SimdS8, x: SimdD8): untyped =
    r = toSingle(x)
  template assign*(r: SimdD8, x: SimdS8): untyped =
    r = toDouble(x)
  template isub*(r: SimdD8, x: SimdS8): untyped =
    isub(r, toDouble(x))
  template imadd*(r: SimdD8, x: SimdD8, y: SimdS8): untyped =
    imadd(r, x, toDouble(y))
  template imsub*(r: SimdD8, x: SimdD8, y: SimdS8): untyped =
    imsub(r, x, toDouble(y))
  template eval*(x: SimdD8): auto = x
  template `-=`*(r: SimdD8, x: SimdD8): untyped = isub(r, x)
49 | 
# `eval` is the identity on these SIMD values.
# The SimdD16 overload is guarded by the type it actually mentions, so it
# is available in every configuration that declares SimdD16 and is skipped
# (instead of failing to compile) in any configuration that does not.
when declared(SimdD16):
  template eval*(x: SimdD16): auto = x

template eval*(x: SimdD4): auto = x
54 | 
# 4-lane precision helpers.
when declared(SimdS4):
  # Implicit SimdS4 -> SimdD4 conversion; uses the mixed `assign` template,
  # which stores toDouble(x) into the result.
  converter promote*(x: SimdS4): SimdD4 {.inline,noInit.} =
    assign(result, x)
  # toSingleImpl/toDoubleImpl convert only when the argument is not already
  # of the requested precision; otherwise they pass it through unchanged.
  template toSingleImpl*(x: SimdS4): untyped = x
  template toSingleImpl*(x: SimdD4): untyped = toSingle(x)
  template toDoubleImpl*(x: SimdS4): untyped = toDouble(x)
  template toDoubleImpl*(x: SimdD4): untyped = x
62 | 
# 8-lane precision helpers (same layout as the 4-lane group).
when declared(SimdS8):
  # Implicit SimdS8 -> SimdD8 conversion via the mixed `assign` template.
  converter promote*(x: SimdS8): SimdD8 =
    assign(result, x)
  # Pass-through when the precision already matches, conversion otherwise.
  template toSingleImpl*(x: SimdS8): untyped = x
  template toSingleImpl*(x: SimdD8): untyped = toSingle(x)
  template toDoubleImpl*(x: SimdS8): untyped = toDouble(x)
  template toDoubleImpl*(x: SimdD8): untyped = x
70 | 
# 16-lane precision helpers, only compiled when SimdS16 exists.
when declared(SimdS16):
  proc toSingle*(x: SimdD16): SimdS16 {.inline,noInit.} =
    # Lane-by-lane copy of the sixteen elements.
    for i in 0..<16:
      result[i] = x[i]
  template assign*(r: SimdS16, x: SimdD16): untyped =
    r = toSingle(x)
  template assign*(r: SimdD16, x: SimdS16): untyped =
    r = toDouble(x)
  # Implicit SimdS16 -> SimdD16 conversion via the mixed `assign` template.
  converter promote*(x: SimdS16): SimdD16 =
    assign(result, x)
  # Pass-through when the precision already matches, conversion otherwise.
  template toSingleImpl*(x: SimdS16): untyped = x
  template toSingleImpl*(x: SimdD16): untyped = toSingle(x)
  template toDoubleImpl*(x: SimdS16): untyped = toDouble(x)
  template toDoubleImpl*(x: SimdD16): untyped = x
85 | 


--------------------------------------------------------------------------------
/demo3/qexLite/simd/simdX86.nim:
--------------------------------------------------------------------------------
 1 | import macros
 2 | import ../metaUtils
 3 | #import ../basicOps
 4 | # import base
 5 | import simdX86Types
 6 | export simdX86Types
 7 | 
 8 | import simdX86Ops
 9 | export simdX86Ops
10 | 
11 | import simdArray
12 | 
# Invoke makeSimdArray(T, L, B) only if T is not declared yet and the base
# type B is declared.
# NOTE(review): presumably makeSimdArray declares T as L copies of B; its
# definition (simdArray module) is not visible here.
template tryArray(T,L,B:untyped):untyped =
  when (not declared(T)) and declared(B):
    makeSimdArray(T, L, B)
# For precision letter P and lane count N, emit one tryArray attempt for
# each candidate base width N/2, N/4, ..., 1 (in that order), targeting the
# type named "Simd" & P & N with L = N div (base width).
macro makeArray(P,N:untyped):auto =
  let n = N.intVal
  let t = ident("Simd" & $P & $n)
  var m = n div 2
  result = newStmtList()
  while m>0:
    let b = ident("Simd" & $P & $m)
    let l = n div m
    result.add getAst(tryArray(t,newLit(l),b))
    m = m div 2
  #echo result.repr

# Attempted from the widest type down; each call only has an effect when
# the native type of that width is not already declared.
makeArray(D, 16)
makeArray(D,  8)
makeArray(D,  4)
31 | 
32 | #when defined(SSE):
33 | #proc toDoubleA*(x:SimdS4):array[2,SimdD2] {.inline,noInit.} =
34 | #  result[0] = mm_cvtps_pd(x)
35 | #  var y{.noInit.}:SimdS4
36 | #  perm2(y, x)
37 | #  result[1] = mm_cvtps_pd(y)
38 | 
# AVX without AVX512: SimdS8 -> SimdD8 conversion built from toDoubleA,
# whose result is assigned to the return value with `:=`.
when defined(AVX):
  when not defined(AVX512):
    proc toDouble*(x:SimdS8):SimdD8 {.inline,noInit.} =
      #result = SimdD8(toDoubleA(x))
      result := toDoubleA(x)
44 | 
45 | #when declared(SimdS4):
46 | #  proc toDouble*(x:SimdS4):SimdD4 {.inline,noInit.} =
47 | #    result = SimdD4(toDoubleA(x))
48 | #  proc inorm2*(r:var SimdD4; x:SimdS4) {.inline.} =
49 | #    let y = toDouble(x)
50 | #    inorm2(r, y)
51 | 
when declared(SimdS8):
  # Mixed-precision accumulators: each SimdS8 operand is converted with
  # toDouble and the SimdD8 operation of the same name does the work.
  proc inorm2*(r:var SimdD8; x:SimdS8) {.inline.} =
    let xd = toDouble(x)
    inorm2(r, xd)
  proc imadd*(r:var SimdD8; x,y:SimdS8) {.inline.} =
    let xd = toDouble(x)
    let yd = toDouble(y)
    imadd(r, xd, yd)
  proc imsub*(r:var SimdD8; x,y:SimdS8) {.inline.} =
    let xd = toDouble(x)
    let yd = toDouble(y)
    imsub(r, xd, yd)
66 | 
when declared(SimdS16):
  proc toDouble*(x:SimdS16):SimdD16 {.inline,noInit.} =
    ## Wraps the pair of vectors returned by `toDoubleA` in a `SimdD16`.
    result = SimdD16(v: toDoubleA(x))
  # Mixed-precision accumulators: each SimdS16 operand is converted with
  # toDouble and the SimdD16 operation of the same name does the work.
  proc inorm2*(r:var SimdD16; x:SimdS16) {.inline.} =
    inorm2(r, toDouble(x))
  proc imadd*(r:var SimdD16; x,y:SimdS16) {.inline.} =
    let xd = toDouble(x)
    let yd = toDouble(y)
    imadd(r, xd, yd)
  proc imsub*(r:var SimdD16; x,y:SimdS16) {.inline.} =
    let xd = toDouble(x)
    let yd = toDouble(y)
    imsub(r, xd, yd)
80 | 
# toDouble on a value that is already double precision returns it unchanged.
when declared(SimdD4):
  template toDouble*(x:SimdD4):untyped = x
when declared(SimdD8):
  template toDouble*(x:SimdD8):untyped = x
when declared(SimdD16):
  template toDouble*(x:SimdD16):untyped = x
87 | 
# Self-test: convert a SimdS8 to double precision, print it, then
# accumulate inorm2 of the original into the converted value and print
# the result.
when isMainModule:
  var s8:SimdS8
  assign(s8, [0,1,2,3,4,5,6,7])
  var d8 = toDouble(s8)
  echo d8
  inorm2(d8, s8)
  echo d8
95 | 


--------------------------------------------------------------------------------
/demo3/qexLite/simd/simdX86Ops.nim:
--------------------------------------------------------------------------------
  1 | {. deadCodeElim: on .}
  2 | 
  3 | import simdX86Types
  4 | import simdSse
  5 | import simdAvx
  6 | import simdAvx512
  7 | #import ../basicOps
  8 | # import base
  9 | # import math
 10 | import macros
 11 | 
# Generators for overloads that accept a plain number where a SIMD value of
# type T is expected. Each generated template named op1 converts the scalar
# with `.to(T)` and forwards to op2.
# binaryMixed: value-returning binary op with a scalar on either side.
template binaryMixed(T,op1,op2:untyped):untyped =
  template op1*(x:SomeNumber; y:T):T = op2(x.to(T),y)
  template op1*(x:T; y:SomeNumber):T = op2(x,y.to(T))
# unaryMixedVar: in-place op `op1(r, scalar)`.
template unaryMixedVar(T,op1,op2:untyped):untyped =
  template op1*(r:var T; x:SomeNumber) = op2(r,x.to(T))
# binaryMixedVar: destination plus two operands, one of them a scalar.
template binaryMixedVar(T,op1,op2:untyped):untyped =
  template op1*(r:var T; x:SomeNumber; y:T) = op2(r,x.to(T),y)
  template op1*(r:var T; x:T; y:SomeNumber) = op2(r,x,y.to(T))
# trinaryMixedVar: destination plus three operands, exactly one a scalar.
template trinaryMixedVar(T,op1,op2:untyped):untyped =
  template op1*(r:var T; x:SomeNumber; y:T; z:T) = op2(r,x.to(T),y,z)
  template op1*(r:var T; x:T; y:SomeNumber; z:T) = op2(r,x,y.to(T),z)
  template op1*(r:var T; x:T; y:T; z:SomeNumber) = op2(r,x,y,z.to(T))
# Generates `op(x:T):T` by applying the scalar `op` to each of the N
# elements: the vector is converted with toArray, mapped element by
# element, and assigned back into the result.
template map1(T,N,op:untyped):untyped {.dirty.} =
  proc op*(x:T):T {.inline,noInit.} =
    let t = x.toArray
    var r{.noInit.}:type(t)
    for i in 0..<N:
      r[i] = op(t[i])
    assign(result, r)
 31 | 
 32 | template basicDefs(T,F,N,P,S:untyped):untyped {.dirty.} =
 33 |   template numberType*(x:typedesc[T]):typedesc = F
 34 |   template numberType*(x:T):typedesc = F
 35 |   template numNumbers*(x:typedesc[T]):untyped = N
 36 |   template numNumbers*(x:T):untyped = N
 37 |   template simdType*(x:typedesc[T]):typedesc = T
 38 |   template simdType*(x:T):typedesc = T
 39 |   template simdLength*(x:T):untyped = N
 40 |   template simdLength*(x:typedesc[T]):untyped = N
 41 |   template load1*(x:T):untyped = x
 42 |   proc assign*(r:ptr F; x:T) {.inline.} =
 43 |     `P "_storeu_" S`(r, x)
 44 |   proc assign*(r:var T; x:ptr SomeNumber) {.inline.} =
 45 |     when x[] is F:
 46 |       r = `P "_loadu_" S`(x)
 47 |     else:
 48 |       let y = cast[ptr array[N,type(x[])]](x)
 49 |       var t{.noInit.}:array[N,F]
 50 |       for i in 0..<N: t[i] = F(y[][i])
 51 |       assign(r, t)
 52 |   template toSimd*(x:array[N,F]):untyped =
 53 |     `P "_loadu_" S`(cast[ptr F](unsafeAddr(x)))
 54 |   proc toArray*(x:T):array[N,F] {.inline,noInit.} =
 55 |     `P "_storeu_" S`(cast[ptr F](result.addr), x)
 56 |   template to*(x:SomeNumber; t:typedesc[T]):untyped =
 57 |     `P "_set1_" S`(F(x))
 58 |   template to*(x:array[N,F]; t:typedesc[T]):untyped =
 59 |     toSimd(x)
 60 |   proc to*(x:T; t:typedesc[array[N,F]]):array[N,F] {.inline,noInit.} =
 61 |     `P "_storeu_" S`(cast[ptr F](result.addr), x)
 62 |   proc assign1*(r:var T; x:SomeNumber) {.inline.} =
 63 |     r = `P "_set1_" S`(F(x))
 64 |   template setX:untyped = `P "_setr_" S`()
 65 |   template setF(x):untyped = F(x)
 66 |   macro assign*(r:var T; x:varargs[SomeNumber]):auto =
 67 |     if x.len==1:
 68 |       result = newCall(!"assign1", r, x[0])
 69 |     else:
 70 |       result = newStmtList()
 71 |       var call = getAst(setX())[0]
 72 |       for i in 0..<N:
 73 |         call.add getAst(setF(x[i mod x.len]))
 74 |       template asgn(r,c:untyped):untyped = r = c
 75 |       result = getAst(asgn(r, call))
 76 |     #echo result.treerepr
 77 |   proc assign*(r:var T; x:array[N,SomeNumber]) {.inline.} =
 78 |     when x[0] is F:
 79 |       r = `P "_loadu_" S`(cast[ptr F](unsafeAddr(x)))
 80 |     else:
 81 |       var t{.noInit.}:array[N,F]
 82 |       for i in 0..<N: t[i] = F(x[i])
 83 |       assign(r, t)
 84 |   #proc assign*(r:var T; x:T) {.inline.} =
 85 |   #  r = x
 86 |   #template `=`*(r: var T; x: T) = {.emit: [r, " = ", x].}
 87 |   template assign*(r:var T; x:T) =
 88 |     r = x
 89 |   #proc assign*(r:var array[N,F]; x:T) {.inline.} =
 90 |   #  assign(cast[ptr F](r.addr), x)
 91 |   proc assign*(r:var array[N,F]; x:T) {.inline.} =
 92 |     assign(r[0].addr, x)
 93 |   proc `[]`*(x:T; i:SomeInteger):F {.inline,noInit.} =
 94 |     toArray(x)[i]
 95 |   proc `[]=`*(r:var T; i:SomeInteger; x:SomeNumber) {.inline,noInit.} =
 96 |     var a = toArray(r)
 97 |     a[i] = F(x)
 98 |     assign(r, a)
 99 |   proc `$`*(x:T):string =
100 |     result = "[" & $x[0]
101 |     for i in 1..<N:
102 |       result &= "," & $x[i]
103 |     result &= "]"
104 |   proc prefetch*(x:ptr T) {.inline.} =
105 |     mm_prefetch(cast[cstring](x), 3)
106 | 
107 |   template add*(x,y:T):T = `P "_add_" S`(x,y)
108 |   template sub*(x,y:T):T = `P "_sub_" S`(x,y)
109 |   template mul*(x,y:T):T = `P "_mul_" S`(x,y)
110 |   template divd*(x,y:T):T = `P "_div_" S`(x,y)
111 |   template neg*(x:T):T = sub(`P "_setzero_" S`(), x)
112 | 
113 |   binaryMixed(T, add, add)
114 |   binaryMixed(T, sub, sub)
115 |   binaryMixed(T, mul, mul)
116 |   binaryMixed(T, divd, divd)
117 | 
118 |   template neg*(r:var T; x:T) = r = neg(x)
119 |   template add*(r:var T; x,y:T) = r = add(x,y)
120 |   template sub*(r:var T; x,y:T) = r = sub(x,y)
121 |   template mul*(r:var T; x,y:T) = r = mul(x,y)
122 |   template divd*(r:var T; x,y:T) = r = divd(x,y)
123 | 
124 |   binaryMixedVar(T, add, add)
125 |   binaryMixedVar(T, sub, sub)
126 |   binaryMixedVar(T, mul, mul)
127 |   binaryMixedVar(T, divd, divd)
128 | 
129 |   template iadd*(r:var T; x:T) = add(r,r,x)
130 |   template isub*(r:var T; x:T) = sub(r,r,x)
131 |   template imul*(r:var T; x:T) = mul(r,r,x)
132 |   template idiv*(r:var T; x:T) = divd(r,r,x)
133 |   template imadd*(r:var T; x,y:T) = iadd(r,mul(x,y))
134 |   template imsub*(r:var T; x,y:T) = isub(r,mul(x,y))
135 |   template madd*(r:var T; x,y,z:T) = add(r,mul(x,y),z)
136 |   template msub*(r:var T; x,y,z:T) = sub(r,mul(x,y),z)
137 | 
138 |   unaryMixedVar(T, iadd, iadd)
139 |   unaryMixedVar(T, isub, isub)
140 |   unaryMixedVar(T, imul, imul)
141 |   unaryMixedVar(T, idiv, idiv)
142 |   binaryMixedVar(T, imadd, imadd)
143 |   binaryMixedVar(T, imsub, imsub)
144 |   trinaryMixedVar(T, madd, madd)
145 |   trinaryMixedVar(T, msub, msub)
146 | 
147 |   template `-`*(x:T):T = neg(x)
148 |   template `+`*(x,y:T):T = add(x,y)
149 |   template `-`*(x,y:T):T = sub(x,y)
150 |   template `*`*(x,y:T):T = mul(x,y)
151 |   template `/`*(x,y:T):T = divd(x,y)
152 | 
153 |   binaryMixed(T, `+`, add)
154 |   binaryMixed(T, `-`, sub)
155 |   binaryMixed(T, `*`, mul)
156 |   binaryMixed(T, `/`, divd)
157 | 
158 |   template `:=`*(r:var T; x:T) = assign(r,x)
159 |   template `+=`*(r:var T, x:T) = iadd(r,x)
160 |   template `-=`*(r:var T, x:T) = isub(r,x)
161 |   template `*=`*(r:var T, x:T) = imul(r,x)
162 |   template `/=`*(r:var T, x:T) = idiv(r,x)
163 | 
164 |   unaryMixedVar(T, `:=`, assign)
165 |   template `:=`*(r:var T; x:openArray[SomeNumber]) = assign(r,x)
166 |   unaryMixedVar(T, `+=`, iadd)
167 |   unaryMixedVar(T, `-=`, isub)
168 |   unaryMixedVar(T, `*=`, imul)
169 |   #unaryMixedVar(T, `/=`, idiv)
170 |   template `/=`*(r:var T; x:SomeNumber) = idiv(r,x.to(T))
171 | 
172 |   proc trace*(x:T):T {.inline,noInit.}= x
173 |   proc norm2*(x:T):T {.inline,noInit.} = mul(x,x)
174 |   proc norm2*(r:var T; x:T) {.inline.} = mul(r,x,x)
175 |   proc inorm2*(r:var T; x:T) {.inline.} = imadd(r,x,x)
176 |   proc max*(x,y:T):T {.inline,noInit.} = `P "_max_" S`(x,y)
177 |   proc abs*(x:T):T {.inline,noInit.} = max(x,neg(x))
178 |   proc sqrt*(x:T):T {.inline,noInit.} = `P "_sqrt_" S`(x)
179 |   proc rsqrt*(x:T):T {.inline,noInit.} = divd(sqrt(x),x)
180 |   proc rsqrt*(r:var T; x:T) {.inline.} = r = rsqrt(x)
181 |   # map1(T,N, sin)
182 |   # map1(T,N, cos)
183 |   # map1(T,N, acos)
184 | 
185 | basicDefs(m128,  float32,  4, mm, ps)
186 | basicDefs(m128d, float64,  2, mm, pd)
187 | basicDefs(m256,  float32,  8, mm256, ps)
188 | basicDefs(m256d, float64,  4, mm256, pd)
189 | basicDefs(m512,  float32, 16, mm512, ps)
190 | basicDefs(m512d, float64,  8, mm512, pd)
191 | 
192 | 
# Horizontal sum of all elements of x, converted to the type of r.
# m128: two horizontal adds, then the lowest element is stored and
# converted.
proc simdReduce*(r:var SomeNumber; x:m128) {.inline.} =
  let y = mm_hadd_ps(x, x)
  let z = mm_hadd_ps(y, y)
  var t{.noInit.}:float32
  mm_store_ss(t.addr, z)
  r = (type(r))(t)
# m256: the vector is first combined with a copy whose 128-bit halves are
# swapped (permute2f128 with selector 1); element 0 of the final horizontal
# add is the result.
proc simdReduce*(r:var SomeNumber; x:m256) {.inline.} =
  let y = mm256_hadd_ps(x, mm256_permute2f128_ps(x, x, 1))
  let z = mm256_hadd_ps(y, y)
  let w = mm256_hadd_ps(z, z)
  r = (type(r))(w[0])
# m256d: same scheme with one fewer horizontal add (four elements).
proc simdReduce*(r:var SomeNumber; x:m256d) {.inline.} =
  let y = mm256_hadd_pd(x, mm256_permute2f128_pd(x, x, 1))
  let z = mm256_hadd_pd(y, y)
  r = (type(r))(z[0])
proc simdReduce*(r:var SomeNumber; x:m512) {.inline.} =
  ## Sums the 16 elements of `x` into `r`, accumulating in the type of `r`.
  #r = (type(r))(mm512_reduce_add_ps(x))
  # Every `x[i]` converts the whole vector with toArray, so do that once up
  # front. Elements are converted explicitly with (type(r)), matching the
  # narrower overloads, so any SomeNumber destination is accepted.
  let lanes = toArray(x)
  r = (type(r))(lanes[0])
  for i in 1..<16:
    r += (type(r))(lanes[i])
proc simdReduce*(r:var SomeNumber; x:m512d) {.inline.} =
  ## Sums the 8 elements of `x` into `r`, accumulating in the type of `r`.
  #r = (type(r))(mm512_reduce_add_pd(x))
  # Same scheme as the m512 overload: one toArray, explicit conversion.
  let lanes = toArray(x)
  r = (type(r))(lanes[0])
  for i in 1..<8:
    r += (type(r))(lanes[i])
# Value-returning forms: reduce into `result`, typed as the element type
# of the vector.
proc simdReduce*(x:m128):float32 {.inline,noInit.} = simdReduce(result, x)
proc simdReduce*(x:m256):float32 {.inline,noInit.} = simdReduce(result, x)
proc simdReduce*(x:m256d):float64 {.inline,noInit.} = simdReduce(result, x)
proc simdReduce*(x:m512):float32 {.inline,noInit.} = simdReduce(result, x)
proc simdReduce*(x:m512d):float64 {.inline,noInit.} = simdReduce(result, x)

# simdSum is an alias for simdReduce in both calling forms.
template simdSum*(x:m128):untyped = simdReduce(x)
template simdSum*(x:m256):untyped = simdReduce(x)
template simdSum*(x:m256d):untyped = simdReduce(x)
template simdSum*(x:m512):untyped = simdReduce(x)
template simdSum*(x:m512d):untyped = simdReduce(x)
template simdSum*(r:var SomeNumber; x:m128) = simdReduce(r, x)
template simdSum*(r:var SomeNumber; x:m256) = simdReduce(r, x)
template simdSum*(r:var SomeNumber; x:m256d) = simdReduce(r, x)
template simdSum*(r:var SomeNumber; x:m512) = simdReduce(r, x)
template simdSum*(r:var SomeNumber; x:m512d) = simdReduce(r, x)
236 | 
237 | # include perm, pack and blend
238 | include simdX86Ops1
239 | 
# Single -> double conversions built on the cvtps_pd intrinsics.
when defined(AVX):
  when defined(AVX512):
    # With AVX512 one instruction converts all eight elements.
    proc toDouble*(x:SimdS8):SimdD8 {.inline,noInit.} =
      result = mm512_cvtps_pd(x)
  else:
    # Without AVX512 the two 128-bit halves are extracted and converted
    # separately; result[0] comes from half 0, result[1] from half 1.
    proc toDoubleA*(x:SimdS8):array[2,SimdD4] {.inline,noInit.} =
      result[0] = mm256_cvtps_pd(mm256_extractf128_ps(x,0))
      result[1] = mm256_cvtps_pd(mm256_extractf128_ps(x,1))
      #for i in 0..3: result[0][i] = x[i]
      #for i in 0..3: result[1][i] = x[4+i]

when defined(AVX512):
  # result[0] converts the low 256 bits of x; result[1] converts the low
  # 256 bits of perm8(x).
  # NOTE(review): presumably perm8 swaps the two 256-bit halves; it is
  # defined in the included simdX86Ops1 file, not visible here.
  proc toDoubleA*(x:SimdS16):array[2,SimdD8] {.inline,noInit.} =
    result[0] = mm512_cvtps_pd(mm512_castps512_ps256(x))
    var y{.noInit.}:SimdS16
    perm8(y, x)
    result[1] = mm512_cvtps_pd(mm512_castps512_ps256(y))
257 | 
# Half-precision conversions (cvtps_ph / cvtph_ps intrinsics).
# The guards use `declared`, which tests whether the SIMD type exists;
# `defined` would test for a `-d:SimdS4` style compiler switch and leave
# these helpers permanently disabled, although the self-test at the end of
# this module uses them.
when declared(SimdS4):
  proc mm_cvtph_ps(x:m128i):m128
    {.importC:"_mm_cvtph_ps",header:"f16cintrin.h".}
  proc mm_cvtps_ph(x:m128,y:cint):m128i
    {.importC:"_mm_cvtps_ph",header:"f16cintrin.h".}
  # The second argument of mm_cvtps_ph (rounding control) is passed as 0,
  # as in the 8- and 16-element versions below.
  # NOTE(review): SimdH4 is `distinct int64` while these intrinsics work on
  # m128i, so the two 4-element templates still need an m128i <-> int64
  # step before they can be instantiated; confirm the intended
  # representation.
  template toHalf(x:SimdS4):SimdH4 = SimdH4(mm_cvtps_ph(x,0))
  template toSingle(x:SimdH4):SimdS4 = mm_cvtph_ps(x)
when declared(SimdS8):
  proc mm256_cvtph_ps(x:m128i):m256
    {.importC:"_mm256_cvtph_ps",header:"f16cintrin.h".}
  proc mm256_cvtps_ph(x:m256,y:cint):m128i
    {.importC:"_mm256_cvtps_ph",header:"f16cintrin.h".}
  template toHalf(x:SimdS8):SimdH8 = SimdH8(mm256_cvtps_ph(x,0))
  template toSingle(x:SimdH8):SimdS8 = mm256_cvtph_ps(m128i(x))
when declared(SimdS16):
  # mm512_cvtps_ph / mm512_cvtph_ps are not declared in this block; they
  # must come from one of the imported intrinsic modules.
  template toHalf(x:SimdS16):SimdH16 = SimdH16(mm512_cvtps_ph(x,0))
  template toSingle(x:SimdH16):SimdS16 = mm512_cvtph_ps(m256i(x))
275 | 
276 | # toSingle, toDouble, to(x,float32), to(x,float64)
277 | discard """
278 | template `lid`(x:untyped):untyped = to(x,`id`)
279 | """
280 | 
# Self-test, run only when this module is compiled as the main program.
# It prints results for visual inspection; nothing is asserted.
when isMainModule:
  var x,y,z:m256d
  var d:float64
  var a = [1.0,2.0,3.0,4.0]
  assign(x, 1)
  assign(y, 2)
  assign(z, 0)

  # Arithmetic and horizontal reduction.
  echo z
  z = x+y
  echo z
  simdReduce(d, z)
  echo d
  assign(z, a)
  echo z
  simdReduce(d, z)
  echo d
  # Permutations (perm1/perm2 come from the included simdX86Ops1 file).
  perm1(y, z)
  echo y
  echo z
  perm2(y, z)
  echo y

  # Variadic assign: the given values are cycled to fill all elements.
  assign(x, 1)
  echo x
  assign(x, 1, 2)
  echo x
  assign(x, 1, 2, 3)
  echo x
  assign(x, 1, 2, 3, 4)
  echo x
  assign(x, 1, 2, 3, 4, 5)
  echo x

  assign(x, a[0], a[1], a[2], a[3])
  echo x

  var s:m256
  assign(s, a[0], a[1], a[2], a[3])
  echo s

  # Single -> double conversion in two halves.
  var s8:SimdS8
  assign(s8, [0,1,2,3,4,5,6,7])
  var d8 = toDoubleA(s8)
  echo d8[0]
  echo d8[1]

  # Half-precision round trip.
  var h:SimdH8
  s = toSingle(h)
  h = toHalf(s)
  assign(s,[1,2,3,4,5,6,7,8])
  h = toHalf(s)
  s8 = toSingle(h)
  echo s8

  when declared(SimdS16):
    var s16:SimdS16
    assign(s16, [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
    var h16 = toHalf(s16)
    var t16 = toSingle(h16)
    echo t16
342 | 


--------------------------------------------------------------------------------
/demo3/qexLite/simd/simdX86Types.nim:
--------------------------------------------------------------------------------
# Opaque Nim names for the C vector and mask types from immintrin.h.
{.pragma: imm, header:"immintrin.h".}
type
  m64*   {.importc: "__m64"  , imm.} = object
  m128*  {.importc: "__m128" , imm.} = object
  m128d* {.importc: "__m128d", imm.} = object
  m128i* {.importc: "__m128i", imm.} = object
  m256*  {.importc: "__m256" , imm.} = object
  m256d* {.importc: "__m256d", imm.} = object
  m256i* {.importc: "__m256i", imm.} = object
  m512*  {.importc: "__m512" , imm.} = object
  m512d* {.importc: "__m512d", imm.} = object
  m512i* {.importc: "__m512i", imm.} = object
  mmask8*  {.importc: "__mmask8" , imm.} = object
  mmask16* {.importc: "__mmask16", imm.} = object
  mmask32* {.importc: "__mmask32", imm.} = object
  mmask64* {.importc: "__mmask64", imm.} = object

# Width-named aliases, enabled by the SSE / AVX / AVX512 compile-time
# defines. Naming: Simd<kind><count>, with kind S, D, I or H.
# The H types are distinct wrappers around an integer type.
when defined(SSE):
  type
    SimdS4* = m128
    SimdD2* = m128d
    SimdI4* = m128i
    SimdH4* = distinct int64
when defined(AVX):
  type
    SimdS8* = m256
    SimdD4* = m256d
    SimdI8* = m256i
    SimdH8* = distinct SimdI4
when defined(AVX512):
  type
    SimdS16* = m512
    SimdD8*  = m512d
    SimdI16* = m512i
    SimdH16* = distinct SimdI8
36 | 


--------------------------------------------------------------------------------
/demo3/qexLite/stdUtils.nim:
--------------------------------------------------------------------------------
  1 | import macros
  2 | import strUtils
  3 | import metaUtils
  4 | import os
  5 | 
type
  # Array type declared with the `unchecked` pragma and a nominal length
  # of 0.
  cArray*{.unchecked.}[T] = array[0,T]
# Both helpers yield the address of element 0.
template `[]`*(x: cArray): untyped = addr x[0]
template `&`*(x: ptr cArray): untyped = addr x[0]
 10 | 
 11 | template ptrInt*(x:untyped):untyped = cast[ByteAddress](x)
 12 | template addrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
 13 | template unsafeAddrInt*(x:untyped):untyped = cast[ByteAddress](addr(x))
 14 | 
 15 | iterator range*[T: SomeInteger](count: T): T =
 16 |   var res = T(0)
 17 |   while res < count:
 18 |     yield res
 19 |     inc res
 20 | 
 21 | template makeTypeParam(name,typ,deflt,cnvrt: untyped): untyped {.dirty.} =
 22 |   proc name*(s: string, d=deflt): typ =
 23 |     result = d
 24 |     let n = paramCount()
 25 |     for i in 1..n:
 26 |       let p = paramstr(i)
 27 |       if p.startsWith('-'&s&':'):
 28 |         let ll = s.len + 2
 29 |         result = cnvrt(p[ll..^1])
 30 | 
 31 | makeTypeParam(intParam, int, 0, parseInt)
 32 | makeTypeParam(floatParam, float, 0.0, parseFloat)
 33 | makeTypeParam(strParam, string, "", string)
 34 | 
 35 | proc intSeqParam*(s: string, d: seq[int] = @[]): seq[int] =
 36 |   result = d
 37 |   let n = paramCount()
 38 |   for i in 1..n:
 39 |     let p = paramstr(i)
 40 |     if p.startsWith('-'&s&':'):
 41 |       let ll = s.len + 2
 42 |       for c in split(p[ll..^1], ','):
 43 |         result.add parseInt(c)
 44 | 
# Overrides field `n` of `p` from the command line. The option name is
# `prefix` followed by the field name; the current field value is passed
# as the default. The first of strParam / intParam / floatParam /
# intSeqParam that compiles for the field is used, and compilation fails
# with a fatal error if none does. When the value changed, `runifset` is
# executed and the change is echoed.
template CLIset*(p:typed, n:untyped, prefix:string, runifset:untyped) =
  mixin echo
  let
    o = p.n
    s = prefix & astToStr(n)
  when compiles(strParam(s, p.n)):
    p.n = type(p.n)strParam(s, p.n)
  elif compiles(intParam(s, p.n)):
    p.n = type(p.n)intParam(s, p.n)
  elif compiles(floatParam(s, p.n)):
    p.n = type(p.n)floatParam(s, p.n)
  elif compiles(intSeqParam(s, p.n)):
    p.n = type(p.n)intSeqParam(s, p.n)
  else:
    {.fatal:"Cannot set argument "&s&" of "&astToStr(p)&" for command line.".}
  if o != p.n:
    runifset
    echo "Customize $# : $# -> $#"%[s, $o, $p.n]
# Convenience form with no action to run on change.
template CLIset*(p:typed, n:untyped, prefix = "") =
  p.CLIset n, prefix:
    discard
 66 | 
# Hexadecimal string of the address of `x` (via unsafeAddrInt).
template `$&`*(x: untyped): string =
  toHex(unsafeAddrInt(x))
 69 | 
 70 | proc `|`*(s: string, d: tuple[w:int,c:char]): string =
 71 |   let p = abs(d.w) - len(s)
 72 |   let pad = if p>0: repeat(d.c, p) else: ""
 73 |   if d.w >= 0:
 74 |     result = pad & s
 75 |   else:
 76 |     result = s & pad
 77 | proc `|`*(s: string, d: int): string =
 78 |   s | (d,' ')
 79 | proc `|`*(x: int, d: int): string =
 80 |   ($x) | d
 81 | proc `|`*(f: float, d: tuple[w,p: int]): string =
 82 |   if d.p<0:
 83 |     formatFloat(f, ffDecimal, -d.p) | d.w
 84 |   else:
 85 |     formatFloat(f, ffDefault, d.p) | d.w
 86 | proc `|`*(f: float, d: int): string =
 87 |   f | (d,d)
 88 | template `|-`*(x:SomeNumber, y: int): untyped =
 89 |   x | -y
 90 | 
 91 | proc indexOf*[T](x: openArray[T], y: any): int =
 92 |   let n = x.len
 93 |   while result<n and x[result]!=y: inc result
 94 | 
 95 | proc `*`*[T](x:openArray[T], y:int):auto {.inline.} =
 96 |   let n = x.len
 97 |   var r:array[n,T]
 98 |   for i in 0..<n:
 99 |     r[i] = x[i]
100 |   r
101 | proc `+`*[T](x,y:openArray[T]):auto {.inline.} =
102 |   let n = x.len
103 |   var r:array[n,T]
104 |   for i in 0..<n:
105 |     r[i] = x[i] + y[i]
106 |   r
proc `+=`*[T](x:var openArray[T], y: openArray[T]) {.inline.} =
  ## Adds `y` to `x` element by element, in place, over the length of `x`.
  for i in low(x)..high(x):
    x[i] += y[i]
# Generates elementwise helpers for fixed-size arrays of length n:
#   `+`  : elementwise sum of two arrays
#   `*`  : every element multiplied by the integer y (converted to T)
#   `:=` : elementwise converting copy from array[n,T2] into array[n,T1]
template makeArrayOverloads(n:int):untyped =
  proc `+`*[T](x,y:array[n,T]):array[n,T] {.inline.} =
    for i in 0..<x.len:
      result[i] = x[i] + y[i]
  proc `*`*[T](x:array[n,T], y:int):array[n,T] {.inline.} =
    for i in 0..<x.len:
      result[i] = x[i] * T(y)
  proc `:=`*[T1,T2](r:var array[n,T1]; x:array[n,T2]) =
    for i in 0..<r.len:
      r[i] = T1(x[i])
# Instantiated for lengths 4, 8 and 16.
makeArrayOverloads(4)
makeArrayOverloads(8)
makeArrayOverloads(16)
124 | 
125 | #proc sum*[T](x:openArray[T]): auto =
126 | #  result = x[0]
127 | #  for i in 1..<x.len: result += x[i]
128 | 
# Prints its arguments while the macro is being expanded (the `echo` runs
# inside the macro body) and expands to an empty node. String literals are
# printed as they are; any other argument is printed as its source text.
macro echoImm*(s:varargs[typed]):auto =
  result = newEmptyNode()
  #echo s.treeRepr
  var t = ""
  for c in s.children():
    if c.kind == nnkStrLit:
      t &= c.strVal
    else:
      t &= c.toStrLit.strVal
  echo t
139 | 
# Passes "ctrace: " and the instantiation info of the call site to echoImm.
template ctrace* =
  const ii = instantiationInfo()
  echoImm "ctrace: ", ii
143 | 
# Declares `v` as a C array of `n` elements of type `t` by emitting the C
# declaration directly. On the Nim side `v` has a distinct wrapper type
# whose `len`, `[]` and `[]=` are provided below; indexing goes through a
# cast to ptr cArray[t].
template declareVla(v,t,n:untyped):untyped =
  type Vla{.gensym.} = distinct t
  #var v{.noInit,codeGenDecl:"$# $#[" & n.astToStr & "]".}:Vla
  #var v{.noInit,codeGenDecl:"$# $#[`n`]".}:Vla
  # noDecl suppresses the generated declaration; the emit below supplies it.
  var v{.noInit,noDecl.}:Vla
  {.emit:"`Vla` `v`[`n`];".}
  template len(x:Vla):untyped = n
  template `[]`(x:Vla; i:untyped):untyped =
    (cast[ptr cArray[t]](unsafeAddr(x)))[][i]
  template `[]=`(x:var Vla; i,y:untyped):untyped =
    (cast[ptr cArray[t]](addr(x)))[][i] = y
155 | 
156 | #[
157 | proc `$`*[T](x:openArray[T]):string =
158 |   var t = newSeq[string]()
159 |   var len = 0
160 |   for e in x:
161 |     let s = $e
162 |     t.add(s)
163 |     len += s.len
164 |   #echo len
165 |   #echo t[0]
166 |   if len < 60:
167 |     result = t.join(" ")
168 |   else:
169 |     result = ""
170 |     for i,v in t:
171 |       result &= ($i & ":" & v & "\n")
172 | ]#
173 | 
# Turns a compile-time string into a string literal node.
macro toLit*(s:static[string]):auto =
  result = newLit(s)
176 | 
# Prints a two-line warning: the file name and line of the call site, then
# the arguments (each converted with `$`) joined into one string.
template warn*(s:varargs[string,`$`]) =
  let ii = instantiationInfo()
  echo "warning (", ii.filename, ":", ii.line, "):"
  echo "  ", s.join
181 | 
proc factor*(n: int): seq[int] =
  ## Prime factorisation of `n`, factors in ascending order.
  ## A negative `n` contributes a leading -1; 0 and 1 are returned as
  ## themselves (so `factor(-1) == @[-1, 1]`).
  result = @[]
  var x = n
  if x < 0:
    result.add(-1)
    x = -x
  if x < 2:
    result.add x
    return
  # Strip all factors of two first.
  while (x and 1) == 0:
    result.add 2
    x = x shr 1
  # Trial division by odd candidates. The candidate is carried over between
  # factors instead of restarting at 3, and the search stops at sqrt(x)
  # (written as k <= x div k to avoid overflow in k*k).
  var k = 3
  while k <= x div k:
    if x mod k == 0:
      result.add k
      x = x div k
    else:
      k += 2
  # Whatever remains above 1 is a prime factor.
  if x > 1:
    result.add x
196 | 
197 | 
# Self-test: prints factor(i) for i in -2..20. The declareVla test is
# commented out.
when isMainModule:
  #[
  proc test(n:int) =
    declareVla(x, float, n)
    let n2 = n div 2
    block:
      declareVla(y, float, n2)
      #{.emit:"""printf("%p\n", &x[0]);""".}
      x[0] = 1
      echo x[0]
      echo x.len
      echo y.len
  test(10)
  test(20)
  ]#

  template testFactor(n: int) =
    echo "factor(", n, ") = ", factor(n)
  for i in -2..20:
    testFactor(i)
218 | 


--------------------------------------------------------------------------------
/demo3/qexLite/threading.nim:
--------------------------------------------------------------------------------
  1 | import times
  2 | import strUtils
  3 | import stdUtils
  4 | import macros
  5 | import omp
  6 | import metaUtils
  7 | 
type
  # One slot per thread in the shared table: a pointer and a counter.
  # The counter is what the t0waitX / twait0X spin barriers increment and
  # poll.
  ThreadShare* = object
    p*:pointer
    counter*:int
  # Per-thread view: this thread's index, the team size, and a pointer to
  # the shared table of ThreadShare slots.
  ThreadObj* = object
    threadNum*:int
    numThreads*:int
    share*:ptr cArray[ThreadShare]

# Thread-local copies of the thread index, the team size and the
# ThreadObj.
var threadNum*{.threadvar.}:int
var numThreads*{.threadvar.}:int
var threadLocals*{.threadvar.}:ThreadObj
# Set to true by `init`; a plain module-level variable, not a threadvar.
var inited = false
 21 | 
# Fills threadLocals for the calling thread from the current
# threadNum/numThreads, points `share` at the first element of `ts`, and
# clears this thread's own slot. `ts` must be non-empty and must outlive
# every use of threadLocals.share, because only a raw pointer is kept.
template initThreadLocals*(ts:seq[ThreadShare]):untyped =
  threadLocals.threadNum = threadNum
  threadLocals.numThreads = numThreads
  threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr)
  threadLocals.share[threadNum].p = nil
  threadLocals.share[threadNum].counter = 0
 28 | proc init =
 29 |   inited = true
 30 |   threadNum = 0
 31 |   numThreads = 1
 32 |   var ts = newSeq[ThreadShare](numThreads)
 33 |   initThreadLocals(ts)
# Runs `init` the first time it is reached.
template threadsInit* =
  if not inited:
    init()
# Currently the same as threadsInit; the commented-out alternative
# reported an error and quit instead of initialising on demand.
template checkInit* =
  threadsInit()
  #if not inited:
    #let ii = instantiationInfo()
    #let ln = ii.line
    #let fn = ii.filename[0 .. ^5]
    #echo format("error: $#($#): threads not initialized",fn,ln)
    #quit(-1)
 45 | 
# Runs `body` inside an ompParallel region.
# The caller's threadNum, numThreads and threadLocals are saved first and
# restored afterwards. Inside the region every thread records its own
# index and the team size; thread 0 allocates the shared table, and a
# barrier makes sure it exists before each thread initialises its
# threadLocals. A second barrier follows `body`.
template threads*(body:untyped):untyped =
  checkInit()
  let tidOld = threadNum
  let nidOld = numThreads
  let tlOld = threadLocals
  #proc tproc2{.genSym,inline.} =
  #  body
  # The region is wrapped in a generated proc that is called right away.
  proc tproc{.genSym.} =
    var ts:seq[ThreadShare]
    ompParallel:
      threadNum = ompGetThreadNum()
      numThreads = ompGetNumThreads()
      if threadNum==0: ts.newSeq(numThreads)
      threadBarrierO()
      initThreadLocals(ts)
      #echoAll threadNum, " s: ", ptrInt(threadLocals.share)
      body
      #tproc2()
      threadBarrierO()
  tproc()
  threadNum = tidOld
  numThreads = nidOld
  threadLocals = tlOld
# Variant of `threads` that hands the variable `x0` to the generated proc
# as a `var` parameter; inside `body`, `x0` is replaced by that parameter
# through `subst`. Setup and restore are the same as in the one-argument
# form. There is no explicit barrier after `body` in this variant.
template threads*(x0:untyped;body:untyped):untyped =
  checkInit()
  let tidOld = threadNum
  let nidOld = numThreads
  let tlOld = threadLocals
  proc tproc(xx:var type(x0)) {.genSym.} =
    var ts:seq[ThreadShare]
    ompParallel:
      threadNum = ompGetThreadNum()
      numThreads = ompGetNumThreads()
      if threadNum==0: ts.newSeq(numThreads)
      threadBarrierO()
      initThreadLocals(ts)
      #echoAll threadNum, " s: ", ptrInt(threadLocals.share)
      subst(x0,xx):
        body
  tproc(x0)
  threadNum = tidOld
  numThreads = nidOld
  threadLocals = tlOld
 89 | 
 90 | template getMaxThreads*() = ompGetMaxThreads()
 91 | template threadBarrierO* = ompBarrier
 92 | template threadMaster*(x:untyped) = ompMaster(x)
 93 | template threadSingle*(x:untyped) = ompSingle(x)
 94 | template threadCritical*(x:untyped) = ompCritical(x)
 95 | 
# Bounds of the calling thread's share when x..y is split evenly over
# numThreads: Low uses threadNum, High uses threadNum+1, so one thread's
# High equals the next thread's Low.
template threadDivideLow*(x,y: untyped): untyped =
  x + (threadNum*(y-x)) div numThreads
template threadDivideHigh*(x,y: untyped): untyped =
  x + ((threadNum+1)*(y-x)) div numThreads
100 | 
101 | 
# Builds the thread-partitioned loop: the inclusive range i0..i1
# (d = 1+i1-i0 iterations) is split evenly over numThreads, and the
# calling thread runs `body` for its half-open slice ti0 ..< ti1.
proc tForX*(index,i0,i1,body:NimNode):NimNode =
  return quote do:
    let d = 1+`i1` - `i0`
    let ti0 = `i0` + (threadNum*d) div numThreads
    let ti1 = `i0` + ((threadNum+1)*d) div numThreads
    for `index` in ti0 ..< ti1:
      `body`
# tFor i, a, b: body -- explicit inclusive bounds.
macro tFor*(index,i0,i1: untyped; body: untyped): untyped =
  result = tForX(index, i0, i1, body)
# tFor i, a..b: body -- bounds taken from the slice expression. The two
# operands sit at child positions 1 and 2, one level deeper when the
# argument arrives wrapped in a statement-list expression.
macro tFor*(index: untyped; slice: Slice; body: untyped): untyped =
  #echo index.treeRepr
  #echo treeRepr(slice)
  var i0,i1: NimNode
  #echo slice.kind
  if slice.kind == nnkStmtListExpr:
    i0 = slice[1][1]
    i1 = slice[1][2]
  else:
    i0 = slice[1]
    i1 = slice[2]
  result = tForX(index, i0, i1, body)
123 | 
124 | discard """
125 | iterator `.|`*[S, T](a: S, b: T): T {.inline.} =
126 |   mixin threadNum
127 |   var d = b - T(a)
128 |   var res = T(a) + (threadNum*d) div numThreads
129 |   var bb = T(a) + ((threadNum+1)*d) div numThreads
130 |   while res <= bb:
131 |     yield res
132 |     inc(res)
133 | """
134 | 
135 | template t0wait* = threadBarrier()
136 | template t0waitX* =
137 |   if threadNum==0:
138 |     inc threadLocals.share[0].counter
139 |     let tbar0 = threadLocals.share[0].counter
140 |     for b in 1..<numThreads:
141 |       let p{.volatile.} = threadLocals.share[b].counter.addr
142 |       while true:
143 |         if p[] >= tbar0: break
144 |   else:
145 |     inc threadLocals.share[threadNum].counter
146 |     #fence()
147 | 
# `twait0` is currently just a full `threadBarrier()`.
template twait0* = threadBarrier()
# Counter-based variant in which every thread except thread 0 waits.
# Thread 0 only bumps its own counter and continues; every other thread bumps
# its own counter and then spins until thread 0's counter has reached at
# least that value.
# NOTE(review): the `{.volatile.}` pragma is attached to the local pointer
# variable `p`, and the fence call is commented out — confirm the spin loop
# is guaranteed to observe thread 0's update.
template twait0X* =
  if threadNum==0:
    inc threadLocals.share[0].counter
    #fence()
  else:
    inc threadLocals.share[threadNum].counter
    let tbar0 = threadLocals.share[threadNum].counter  # value thread 0 must reach
    let p{.volatile.} = threadLocals.share[0].counter.addr
    while true:  # busy-wait on thread 0's counter
      if p[] >= tbar0: break
159 | 
# Barrier used by the rest of this module. It currently expands to
# `ompBarrier`; the counter-based `t0wait`/`twait0` alternatives are left
# commented out.
template threadBarrier* =
  #t0wait
  #twait0
  ompBarrier
164 | 
macro threadSum*(a:varargs[untyped]):auto =
  ## Replaces every argument, on every thread, with the sum of that argument
  ## over all threads.
  ##
  ## For argument number `i` the generated code, wrapped in one `block`:
  ## 1. declares a `{.global.}` array `g<i>` with 512 slots of the
  ##    argument's type and `deepCopy`s this thread's value into slot
  ##    `threadNum` (times the stride `p`, currently 1);
  ## 2. calls `threadBarrier()`;
  ## 3. assigns slot 0 to the argument and adds slots `1 ..< numThreads`;
  ## 4. calls `threadBarrier()` again so no thread overwrites a slot while
  ##    another is still summing.
  ##
  ## The array length is fixed at 512 slots, so `numThreads` must not exceed
  ## that. Each argument must be assignable and support `+=`.
  #echo a.treeRepr
  result = newNimNode(nnkStmtList)
  var sum = newNimNode(nnkStmtList)
  let tid = ident("threadNum")
  let nid = ident("numThreads")
  let p = newLit(1)  # stride between consecutive threads' slots
  for i in 0..<a.len:
    # `ident` replaces the deprecated `!` operator and matches how `tid` and
    # `nid` are created above.
    let gi = ident("g" & $i)
    let ai = a[i]
    result.add(quote do:
      var `gi`{.global.}:array[`p`*512,type(`ai`)]
      #`gi`[`p`*`tid`] = `ai`
      deepCopy(`gi`[`p`*`tid`], `ai`)
      )
    let s = quote do:
      `ai` = `gi`[0]
      for i in 1..<`nid`:
        `ai` += `gi`[`p`*i]
    sum.add(s)
  let m = quote do:
    threadBarrier()
    `sum`
    threadBarrier()
  result.add(m)
  result = newBlockStmt(result)
  #echo result.treeRepr
macro threadSum2*(a:varargs[untyped]):auto =
  ## Alternative to `threadSum` that accumulates each argument into a single
  ## shared `{.global.}` variable `g<i>` instead of a per-thread array.
  ##
  ## Generated sequence: thread 0 stores its value into the shared variable;
  ## all threads meet at a barrier; every other thread adds its value inside
  ## a block preceded by an emitted `#pragma omp critical`; all threads meet
  ## at a second barrier; finally every thread copies the shared total back
  ## into its argument.
  ##
  ## NOTE(review): no barrier follows the final copy-back, so on a second
  ## call thread 0 can overwrite the shared variable while other threads are
  ## still reading the previous total. Confirm callers separate consecutive
  ## calls with a barrier.
  #echo a.treeRepr
  result = newNimNode(nnkStmtList)
  var g0 = newNimNode(nnkStmtList)  # thread 0: shared = arg
  var gp = newNimNode(nnkStmtList)  # other threads: shared += arg
  var a0 = newNimNode(nnkStmtList)  # all threads: arg = shared
  for i in 0..<a.len:
    # `ident` replaces the deprecated `!` operator.
    let gi = ident("g" & $i)
    let ai = a[i]
    # Each `quote` result below is indexed with `[0]` to take its first child.
    let t = quote do:
      var `gi`{.global.}:type(`ai`)
    result.add(t[0])
    let x0 = quote do:
      `gi` = `ai`
    g0.add(x0[0])
    #echo g0.treeRepr
    let xp = quote do:
      `gi` += `ai`
    gp.add(xp[0])
    #echo gp.treeRepr
    let ax = quote do:
      `ai` = `gi`
    a0.add(ax[0])
    #echo a0.treeRepr
  #echo result.treeRepr
  let m = quote do:
    if threadNum==0:
      `g0`
      threadBarrier()
      threadBarrier()
    else:
      threadBarrier()
      {.emit:"#pragma omp critical"}
      block:
        `gp`
      threadBarrier()
    `a0`
  result.add(m)
  #echo result.treeRepr
231 | 
# Self-test and micro-benchmark, compiled only when this file is the main
# module.
when isMainModule:
  threadsInit()
  echo threadNum, "/", numThreads
  threads:
    echo threadNum, "/", numThreads
    let n = numThreads
    let s = (n*(n-1)) div 2  # 0+1+...+(n-1): expected sum of all thread numbers
    var x = threadNum
    threadSum(x)
    echo threadNum, ": ", x, "  ", s
    # every thread now holds the first total, so summing again should give n*s
    threadSum(x)
    echo threadNum, ": ", x, "  ", n*s

    let nrep = 1000

    # Average cost of one threadBarrier over nrep repetitions.
    threadBarrier()
    var t0 = epochTime()
    for i in 1..nrep:
      threadBarrier()
    var t1 = epochTime()
    echo "threadBarrier time: ", int(1e9*(t1-t0)/nrep.float), " ns"

    # Average cost of one threadSum on a float over nrep repetitions.
    var f = 0.1
    threadBarrier()
    t0 = epochTime()
    for i in 1..nrep:
      threadSum(f)
    t1 = epochTime()
    echo "threadSum(float) time: ", int(1e9*(t1-t0)/nrep.float), " ns"
261 | 


--------------------------------------------------------------------------------
/demo3/timing.nim:
--------------------------------------------------------------------------------
 1 | include system/timers
 2 | 
# Runs `s` `nn` times on the calling thread and yields the elapsed time
# averaged over threads.
#   rep: running repetition counter; increased by `nn` inside `threadSingle`
#   nn:  number of times to execute `s` (evaluated once into `n`)
#   s:   the code to time
# Each thread's elapsed time is summed across threads with `threadSum` and
# divided by `getNumThreads()`. The average is kept in a `{.global.}`
# variable that is assigned only inside the `threadSingle` block, and that
# variable is the template's value.
template timex*(rep:var int, nn:int, s:untyped): float =
  let n = nn
  var dt {.global.}:float
  let t = getTicks()
  for i in 0..<n: s             # repeats the expression, `s`, `n` times
  var dtt = 1e-9*float(getTicks()-t) # seconds elapsed
  threadSum(dtt)
  threadSingle:
    dt = dtt/getNumThreads().float
    rep += n
  dt
14 | 


--------------------------------------------------------------------------------
/expr.nim:
--------------------------------------------------------------------------------
  1 | import macros
  2 | 
  3 | var ignore: seq[NimNode]
  4 | proc addIfNewSym(s: var seq[NimNode], x: NimNode): int =
  5 |   let sx = $x
  6 |   for i in 0..<ignore.len:
  7 |     if ignore[i].eqIdent sx: return -1
  8 |   for i in 0..<s.len:
  9 |     if s[i].eqIdent sx: return i
 10 |   result = s.len
 11 |   s.add x
 12 | 
 13 | proc cpNimNode(x: NimNode): NimNode =
 14 |   result = newNimNode(x.kind)
 15 |   case x.kind
 16 |   of nnkCharLit..nnkUInt64Lit:
 17 |     result.intVal = x.intVal
 18 |   of nnkFloatLit..nnkFloat64Lit:
 19 |     result.floatVal = x.floatVal
 20 |   of nnkStrLit..nnkTripleStrLit:
 21 |     result.strVal = x.strVal
 22 |   of nnkIdent:
 23 |     #result.ident = ident(x.repr)
 24 |     result = newIdentNode($x)
 25 |   of {nnkSym,nnkOpenSymChoice}:
 26 |     #echo "got sym"
 27 |     #quit -1
 28 |     #result = newIdentNode($x)
 29 |     result = x.copy
 30 |   else:
 31 |    discard
 32 | 
# Walks the AST `x`, collects every distinct identifier/symbol it finds into
# `v`, and returns a rewritten copy of `x` in which each collected identifier
# is replaced by the call `a(i)`, where `i` is its index in `v`.
#   v: receives the collected identifier nodes (appended in first-seen order)
#   x: tree to scan
#   a: callee node used for the replacement calls
# Names declared by a `var`/`let` section inside `x` are put on the module's
# `ignore` list, so later uses of them are copied instead of replaced.
proc getVars*(v: var seq[NimNode], x,a: NimNode): NimNode =
  proc recurse(it: NimNode, vars: var seq[NimNode], a: NimNode): NimNode =
    # r0..r1 is the range of children that gets rewritten recursively;
    # children outside it are copied with `cpNimNode` only.
    var r0 = 0
    var r1 = it.len - 1
    case it.kind
    of {nnkSym, nnkIdent}:
      let i = vars.addIfNewSym(it)
      if i>=0:
        let ii = newLit(i)
        return newCall(a,ii)
      # i < 0: ignored name, falls through to the plain copy below
    of nnkCallKinds: r0 = 1  # child 0 (the callee) is not substituted
    of nnkDotExpr: r1 = 0    # only the left-hand side is rewritten
    of {nnkVarSection,nnkLetSection}:
      result = it.cpNimNode
      for c in it:
        result.add c.cpNimNode
        # all children except the last two are the declared names
        for i in 0..(c.len-3):
          ignore.add c[i]
          result[^1].add c[i].cpNimNode
        # second-to-last child is copied as is; the last one is rewritten
        result[^1].add c[^2].cpNimNode
        result[^1].add recurse(c[^1], vars, a)
      return
    else: discard
      #echo it.treerepr
    result = it.cpNimNode
    for i in 0..<r0:
      result.add it[i].cpNimNode
    for i in r0..r1:
      result.add recurse(it[i], vars, a)
    for i in (r1+1)..<it.len:
      result.add it[i].cpNimNode
  # start every scan with an empty ignore list
  ignore.newSeq(0)
  result = recurse(x, v, a)
 66 | 
 67 | macro packVarsStmt*(x: untyped, f: untyped): auto =
 68 |   #echo x.treerepr
 69 |   var v = newSeq[NimNode](0)
 70 |   let a = ident("foo")
 71 |   let e = getVars(v, x, a)
 72 |   var p = newStmtList()
 73 |   for vs in v:
 74 |     p.add newCall(f,vs)
 75 |   result = p
 76 |   #echo result.treerepr
 77 | 
 78 | macro packVars*(x: untyped, f: untyped): auto =
 79 |   #echo x.treerepr
 80 |   var v = newSeq[NimNode](0)
 81 |   let a = ident("foo")
 82 |   let e = getVars(v, x, a)
 83 |   var p = newPar()
 84 |   if v.len==0:
 85 |     p.add newNimNode(nnkExprColonExpr).add(ident("Field0"),newLit(1))
 86 |   elif v.len==1:
 87 |     let vi = ident($v[0])
 88 |     p.add newNimNode(nnkExprColonExpr).add(ident("Field0"),newCall(f,vi))
 89 |   else:
 90 |     for vs in v:
 91 |       p.add newCall(f,vs)
 92 |   result = p
 93 |   #echo result.treerepr
 94 | 
 95 | macro substVars*(x: untyped, a: untyped): auto =
 96 |   #echo x.treerepr
 97 |   var v = newSeq[NimNode](0)
 98 |   let e = getVars(v, x, a)
 99 |   result = e
100 |   #echo result.treerepr
101 | 
# Demo, compiled only when this file is the main module.
when isMainModule:
  # `test` packs the address of every variable used in `x` into a tuple `v`,
  # then runs the rewritten body inside a proc `foo` that receives the tuple
  # and reaches each variable through `xx[i][]`.
  template test(x) =
    template getref(t: untyped): untyped = addr(t)
    let v = packVars(x,getref)
    proc foo(xx: type(v)) =
      template deref(i: int): untyped = xx[i][]
      substVars(x, deref)
    foo(v)

  # Prints the typed form of `x` at compile time and passes it through.
  macro dump(x: typed): auto =
    echo x.repr
    x

  var x,y,z: float

  dump:
    test:
      x = 1
      y = 2
      z = x + y
  echo x, y, z
123 | 


--------------------------------------------------------------------------------
/genkernel.nim:
--------------------------------------------------------------------------------
 1 | import cuda
 2 | 
# Typed handle around a raw pointer: a distinct `ptr array[0,T]`.
# The declared length is 0, so the array type carries no size information.
type gpuArray[T] = distinct ptr array[0,T]
# Element read: converts back to the underlying pointer type and indexes it.
template `[]`(x: gpuArray, i: SomeInteger): untyped =
  (ptr array[0,x.T])(x)[][i]
# Element write, through the same conversion.
template `[]=`(x: gpuArray, i: SomeInteger, y: untyped): untyped =
  (ptr array[0,x.T])(x)[][i] = y
 8 | 
# Calls `cudaMalloc` for `n` elements of `a`'s element type and prints the
# returned status unconditionally (also when the call succeeded).
proc alloc(a: var gpuArray, n: int) =
  let err = cudaMalloc(a, n*sizeof(a.T))
  echo "alloc err: ", err
# Allocates room for `n` elements of type `T` with `cudaMalloc` and returns
# the pointer wrapped as a `gpuArray[T]`.
# If the status returned by `cudaMalloc` tests true, it is printed and the
# program quits with exit code -1.
proc newGpuArray[T](n: int): gpuArray[T] =
  var p: pointer
  let err = cudaMalloc(p.addr, n*sizeof(T))
  let pa = cast[ptr array[0,T]](p)
  result = (type(result))(pa)
  if err:
    echo "alloc err: ", err
    quit(-1)
20 | 
# Kernel (marked with the `cudaGlobal` pragma from the `cuda` module) that
# doubles one element of `a` per thread.
#   a: array to modify in place
#   n: number of elements; threads whose index is >= n do nothing
proc timesTwo[T](a: gpuArray[T]; n: int32) {.cudaGlobal.} =
  # global index of this thread across all blocks
  var i = blockDim.x * blockIdx.x + threadIdx.x
  if i < n:
    a[i] *= T(2)
25 | 
# Driver: allocate a float32 and a float64 array of `n` elements each and
# launch `timesTwo` once per array. The arrays are never initialised, copied
# back or freed in this file.
var
  n = 10000.int32
  a = newGpuArray[float32](n)
  b = newGpuArray[float64](n)

var threadsPerBlock: int32 = 256
# ceil(n / threadsPerBlock), so every element gets a thread
var blocksPerGrid: int32 = (n + threadsPerBlock - 1) div threadsPerBlock

timesTwo<<(blocksPerGrid,threadsPerBlock)>>(a,n)

timesTwo<<(blocksPerGrid,threadsPerBlock)>>(b,n)
37 | 


--------------------------------------------------------------------------------
/opts.c2nim:
--------------------------------------------------------------------------------
1 | #def __global__
2 | 


--------------------------------------------------------------------------------
/runc2nim:
--------------------------------------------------------------------------------
#!/bin/sh
# Currently disabled: the unconditional `exit` stops the script right here.
exit

# Rewrite each CUDA launch `kernel<<<grid,block>>>(args` into
# `cudaLaunch(kernel,grid,block,args` and hand the result to c2nim.
src="vectorAdd.cu"
dst="vectorAdd.cup"

sed 's/\([a-zA-Z0-9_]*\)<<</cudaLaunch(\1,/;s/>>>(/,/' <"$src" >"$dst"
c2nim opts.c2nim "$dst"
9 | 


--------------------------------------------------------------------------------
/test/config.nims:
--------------------------------------------------------------------------------
1 | --path:".."
2 | 


--------------------------------------------------------------------------------
/test/test:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm -rf nimcache
 4 | 
 5 | if (($#>0));then
 6 |     declare -ar Ts=( "$@" )
 7 | else
 8 |     declare -ar Ts=( $(echo t*.nim) )
 9 | fi
10 | logfile(){ printf "out/%s.log" "${Ts[$1]}"; }
11 | [[ -d out ]] || mkdir out
12 | declare -i j ret
13 | declare -ai F
14 | for ((j=0;j<${#Ts[@]};++j));do
15 |     printf "Testing: % 6d / %d" $j "${#Ts[@]}"
16 |     nim c -r "${Ts[j]}" > "$(logfile $j)" 2>&1
17 |     ret=$?
18 |     if ((ret!=0));then
19 |         printf '\rFail: %s\n' "${Ts[j]}"
20 |         F+=( $j )
21 |     else
22 |         rm "$(logfile $j)"
23 |         printf '\r%60s\r' ' '
24 |     fi
25 |     rm -f "${Ts[j]%.nim}"
26 | done
27 | echo
28 | echo "Total:  ${#Ts[@]}"
29 | echo "Passed: $((${#Ts[@]}-${#F[@]}))"
30 | if ((${#F[@]}>0));then
31 |     echo "Failed: ${#F[@]}"
32 |     echo
33 |     echo "Check log file(s):"
34 |     for j in ${F[@]};do
35 |         echo "$(logfile $j)"
36 |     done
37 | else
38 |     rm -rf out
39 | fi
40 | 
41 | rm -rf nimcache
42 | 


--------------------------------------------------------------------------------
/test/tinline000.nim:
--------------------------------------------------------------------------------
# Test "Basics": calls to small generic procs placed inside `inlineProcs:`
# blocks (`inlineProcs` comes from the `inline` module, not shown here).
import inline

# f1 returns its result through a `var` parameter, f2 through the return value.
proc f1(r: var any; x: any) = r = 2*x
proc f2(x: any): auto = 2*x

proc a1(x: float) =
  inlineProcs:
    var r: float
    var s: type(r)
    f1(r, x)
proc a2(x: float) =
  inlineProcs:
    var r = f2(x)

echo "* Basics"
a1(1.0)
a2(1.0)
18 | 


--------------------------------------------------------------------------------
/test/tinline001.nim:
--------------------------------------------------------------------------------
# Test: procs containing one or more `for` loops, called inside
# `inlineProcs:` blocks one at a time, together, and nested (`loop3` calls
# `loop` and `loop2`). `inlineProcs` comes from the `inline` module, not
# shown here.
import inline

echo "* multiple iterators"
type T = array[3,float]
# x[k] = 3.0 * y[k]
proc loop(x:var T, y:T) =
  echo "loop"
  let n = 3.0
  for k in 0..<x.len:
    x[k] = n * y[k]
# result[k] = 0.1 * (0.1*x[k] + y[k]), computed in two separate loops
proc loop2(x:T,y:T):T =
  echo "loop2"
  let n = 0.1
  for k in 0..<x.len:
    result[k] = n * x[k] + y[k]
  for k in 0..<x.len:
    result[k] = n * result[k]
proc loop3(x:var T,y:T) =
  echo "loop3"
  x.loop y
  x = y.loop2 y
proc cl =
  var x {.noinit.}: T
  var z {.noinit.}: T
  for i in 0..<x.len: x[i] = i.float
  inlineProcs: z.loop x
  for i in 0..<x.len: echo z[i]
  inlineProcs: z = x.loop2 x
  for i in 0..<x.len: echo z[i]
  inlineProcs:
    z.loop x
    z = x.loop2 x
  for i in 0..<x.len: echo z[i]
  inlineProcs: z.loop3 x
  for i in 0..<x.len: echo z[i]
cl()
36 | 


--------------------------------------------------------------------------------
/test/tinline002.nim:
--------------------------------------------------------------------------------
# Test: a top-level `inlineProcs:` block that calls procs (`rec`, `cl`) whose
# bodies themselves contain `inlineProcs:` blocks. `inlineProcs` comes from
# the `inline` module, not shown here.
import inline

type T = array[3,float]
# x[k] = 3.0 * y[k]
proc loop(x:var T, y:T) =
  echo "loop"
  let n = 3.0
  for k in 0..<x.len:
    x[k] = n * y[k]
# result[k] = 0.1 * (0.1*x[k] + y[k]), computed in two separate loops
proc loop2(x:T,y:T):T =
  echo "loop2"
  let n = 0.1
  for k in 0..<x.len:
    result[k] = n * x[k] + y[k]
  for k in 0..<x.len:
    result[k] = n * result[k]
proc loop3(x:var T,y:T) =
  echo "loop3"
  x.loop y
  x = y.loop2 y
proc cl =
  var x {.noinit.}: T
  var z {.noinit.}: T
  for i in 0..<x.len: x[i] = i.float
  inlineProcs: z.loop x
  for i in 0..<x.len: echo z[i]
  inlineProcs: z = x.loop2 x
  for i in 0..<x.len: echo z[i]
  inlineProcs:
    z.loop x
    z = x.loop2 x
  for i in 0..<x.len: echo z[i]
  inlineProcs: z.loop3 x
  for i in 0..<x.len: echo z[i]

echo "* top level inlineProcs calling proc with inlineProcs"
proc rec =
  var x {.noinit.}: T
  var z {.noinit.}: T
  for i in 0..<x.len: x[i] = i.float
  inlineProcs: z.loop x
  for i in 0..<x.len: echo i," ",z[i]
inlineProcs:
  rec()
  cl()
45 | 


--------------------------------------------------------------------------------
/test/tinline003.nim:
--------------------------------------------------------------------------------
# Test "avoid duplicate computations": argument expressions (`x.loop2 x`,
# `0.1*i.float`) are passed to procs that use the parameter more than once,
# all inside a top-level `inlineProcs:` block. `inlineProcs` comes from the
# `inline` module, not shown here.
import inline

type T = array[3,float]
# x[k] = 3.0 * y[k]
proc loop(x:var T, y:T) =
  echo "loop"
  let n = 3.0
  for k in 0..<x.len:
    x[k] = n * y[k]
# result[k] = 0.1 * (0.1*x[k] + y[k]), computed in two separate loops
proc loop2(x:T,y:T):T =
  echo "loop2"
  let n = 0.1
  for k in 0..<x.len:
    result[k] = n * x[k] + y[k]
  for k in 0..<x.len:
    result[k] = n * result[k]

echo "* avoid duplicate computations"
# reads `y` twice and updates `x` twice
proc inplace(x:var float, y:float) =
  x = x + y
  x = x + 1000*y
inlineProcs:
  var x {.noinit.}: T
  var z {.noinit.}: T
  for i in 0..<x.len: x[i] = i.float
  z.loop(x.loop2 x)
  for i in 0..<x.len:
    z[i].inplace(0.1*i.float)
    echo i," ",z[i]
29 | 


--------------------------------------------------------------------------------
/test/tinline004.nim:
--------------------------------------------------------------------------------
# Test "redeclaration of formal params": `redecl` shadows its own parameters
# `x` and `y` with local declarations. The same call is run once normally and
# once inside `inlineProcs:` so the two outputs can be compared.
# `inlineProcs` comes from the `inline` module, not shown here.
import inline

echo "* redeclaration of formal params"
proc redecl(x:var float, y:float) =
  block:
    # shadows both parameters inside this block only
    var x = x
    x += y
    var y = y
    y += 1
    echo x," ",y
  x += 3     # updates the caller's variable
  let x = x  # from here on `x` is a local copy
  var y = y
  y += x
  echo x," ",y
block:
  echo "Without inlining:"
  var x = 1.0
  var y = 0.1
  x.redecl(y+0.01)
  echo x," ",y
inlineProcs:
  echo "With inlining:"
  var x = 1.0
  var y = 0.1
  x.redecl(y+0.01)
  echo x," ",y
28 | 


--------------------------------------------------------------------------------
/test/tinline005.nim:
--------------------------------------------------------------------------------
# Test "Generic parameters": a proc generic over element type `T` and static
# length `N`, called inside `inlineProcs:`. `inlineProcs` comes from the
# `inline` module, not shown here.
import inline

type T = array[3,float]
echo "* Generic parameters"
# Prints the elements of `x` and its size in bytes (N*sizeof(T)).
proc g[T;N:static[int]](x:array[N,T]) =
  var s = ""
  for i in 0..<N:
    if i>0: s &= " , "
    s &= $x[i]
  echo "x = [ ",s," ] has size ",N*sizeof(T)
block:
  inlineProcs:
    var v = [0,1,2,3]
    g v
15 | 


--------------------------------------------------------------------------------
/test/tinline006.nim:
--------------------------------------------------------------------------------
# Test "object construction": a proc that declares a local object type and
# returns an instance of it, called inside `inlineProcs:`. `inlineProcs`
# comes from the `inline` module, not shown here.
import inline

echo "* object construction"
proc oc(x:int):auto =
  # the field name is the same as the parameter name
  type A = object
    x:int
  return A(x:x)
block:
  inlineProcs:
    var x = 3
    echo oc(x).x
12 | 


--------------------------------------------------------------------------------
/test/tinline007.nim:
--------------------------------------------------------------------------------
# Test "Types with generic parameters": inside the generic proc `gt`, a local
# generic object type `M[N]` with its own `g`, `[]` and `[]=` procs is used
# within `inlineProcs:`. `inlineProcs` comes from the `inline` module, not
# shown here.
import inline

# Prints the elements of `x` and its size in bytes (N*sizeof(T)).
proc g[T;N:static[int]](x:array[N,T]) =
  var s = ""
  for i in 0..<N:
    if i>0: s &= " , "
    s &= $x[i]
  echo "x = [ ",s," ] has size ",N*sizeof(T)
echo "* Types with generic parameters"
proc gt[T] =
  type
    M[N:static[int]] = object
      d:array[N,T]
  var A:M[3]
  # local overload of `g` that forwards to the module-level one
  proc g[N:static[int]](x:M[N]) = x.d.g
  proc `[]`[N:static[int]](x:M[N],i:int):T = x.d[i]
  proc `[]=`[N:static[int]](x:var M[N],i:int,y:T) = x.d[i] = y
  inlineProcs:
    for i in 0..<A.N:
      A[i] = T(i)
    g(A)
gt[float]()
gt[int]()                     # Note github issue #6126
24 | 


--------------------------------------------------------------------------------
/test/tinline008.nim:
--------------------------------------------------------------------------------
# Test "Proc return an auto generic type": the identity proc `rg` has return
# type `auto` and is applied to a generic object inside `inlineProcs:`.
# `inlineProcs` comes from the `inline` module, not shown here.
import inline

echo "* Proc return an auto generic type"
proc rg[T](x:T):auto = x
type
  M[N:static[int],T] = object
    d:array[N,T]
proc rgt =
  var x,y {.noinit.}:M[3,float]
  for i in 0..<x.N: x.d[i] = 0.1+i.float
  inlineProcs:
    y = x.rg
  for i in 0..<y.N: echo i," ",y.d[i]
rgt()
15 | 


--------------------------------------------------------------------------------
/test/tinline009.nim:
--------------------------------------------------------------------------------
# Test "Object wrappers of generic types": conversions between nested generic
# wrapper objects, called inside `inlineProcs:`. `inlineProcs` comes from the
# `inline` module, not shown here.
import inline

type
  M[N:static[int],T] = object
    d:array[N,T]
echo "* Object wrappers of generic types"
type
  W[T] = object
    o:T
  Walt[T] = object
    o:W[T]
# wraps a W[S] in a Walt[S]
proc toAlt[S](x:W[S]):auto = Walt[S](o:x)
# removes one W layer from a W[W[S]] and wraps the inner W[S] in a Walt[S]
proc toAlt2[S](x:W[W[S]]):auto = Walt[S](o:x.o)
block:
  var A {.noinit.} :M[3,float]
  for i in 0..<A.d.len: A.d[i] = i.float
  var w = W[type(A)](o:A)
  inlineProcs:
    var walt = w.toAlt
  for i in 0..<walt.o.o.d.len: echo walt.o.o.d[i]
  var w2 = W[type(w)](o:w)
  inlineProcs:
    var walt2 = w2.toAlt2
  for i in 0..<walt2.o.o.d.len: echo walt2.o.o.d[i]
25 | 


--------------------------------------------------------------------------------
/test/tinline010.nim:
--------------------------------------------------------------------------------
# Test "Proc with local proc/template": `lp` declares a local `$` proc and a
# local template `go`, uses both, and is itself called from a top-level
# `inlineProcs:` block. `inlineProcs` comes from the `inline` module, not
# shown here.
import inline

echo "* Proc with local proc/template"
type Mt[F] = object
  m:array[3,F]
proc len[F](m:Mt[F]):int = m.m.len
template `[]`[F](x:Mt[F],i:int):F = x.m[i]
iterator items[F](m:Mt[F]):F =
  var i = 0
  while i < m.len:
    yield m[i]
    inc i
proc lp =
  # local stringification, relies on the `items` iterator above
  proc `$`[F](m:Mt[F]):string =
    result = "Mt["
    for x in m: result &= " " & $x
    result &= " ]"
  # element-wise x += y
  template go[F](x:Mt[F],y:untyped) =
    for i in 0..<x.len: x[i] += y[i]
  var x = Mt[float](m:[1.0,2.0,3.0])
  var y = [0.1,0.2,0.3]
  for i in 0..<y.len: y[i] *= 0.1
  x.go y
  echo x
inlineProcs:
  lp()
27 | 


--------------------------------------------------------------------------------
/test/tinline011.nim:
--------------------------------------------------------------------------------
# Test "varargs": a proc taking `varargs[float, square]` is called with
# arguments of three different types inside `inlineProcs:`. `inlineProcs`
# comes from the `inline` module, not shown here.
import inline

echo "* varargs"
# converts `x` to float and returns its square
proc square[T](x:T):float =
  let y = float(x)
  y*y
# adds every element of `xs` to `z`; `square` is the varargs conversion proc
proc fv(z:var float, xs:varargs[float, square]) =
  for x in xs: z += x
block:
  inlineProcs:
    var
      s = 0.0
      x = 1
      y = 2.2
      z:float32 = 3.3
    s.fv(x,y,z)
    echo s
18 | 


--------------------------------------------------------------------------------
/test/tinline012.nim:
--------------------------------------------------------------------------------
# Test "noinit": a local generic proc marked `{.noinit.}` that writes every
# field of its result, reached through a top-level `inlineProcs:` block.
# `inlineProcs` comes from the `inline` module, not shown here.
import inline

echo "* noinit"
proc fr =
  type
    R[K:static[int]] = object
      a:array[K,float]
      s:float
  # inner `fr` has the same name as the enclosing proc;
  # it fills a[i] = i and s = sum of a
  proc fr[K:static[int]]:R[K] {.noinit.} =
    result.s = 0
    for i in 0..<K:
      result.a[i] = i.float
      result.s += result.a[i]
  var v = fr[5]()
  for x in v.a: echo x
  echo v.s
inlineProcs:
  fr()
19 | 


--------------------------------------------------------------------------------
/test/tinline013.nim:
--------------------------------------------------------------------------------
# Test "static[T]": a proc with a `static[int]` parameter is called inside
# `inlineProcs:`. `inlineProcs` comes from the `inline` module, not shown
# here.
import inline

echo "* static[T]"
proc fs(x:int, y:static[int]):int = x*y
inlineProcs:
  var x = 2
  let y = x.fs 3
  echo y
9 | 


--------------------------------------------------------------------------------
/test/tinline014.nim:
--------------------------------------------------------------------------------
# Test "avoid duplicate computations": `inplace` is applied to the `mitems`
# loop variable with an argument expression (`1000 * s`) that the proc reads
# twice, inside a top-level `inlineProcs:` block. `inlineProcs` comes from
# the `inline` module, not shown here.
import inline

type T = array[3,float]

echo "* avoid duplicate computations"
# reads `y` twice and updates `x` twice
proc inplace(x:var float, y:float) =
  x = x + y
  x = x + 1000*y
inlineProcs:
  var x {.noinit.}: T
  for i in 0..<x.len: x[i] = i.float
  var s = 0.0
  for m in mitems(x):
    s += m
    m.inplace(1000 * s)
  for i in 0..<x.len: echo x[i]
17 | 


--------------------------------------------------------------------------------