├── .gitignore ├── README.md ├── build ├── config.nims ├── cuda.nim ├── demo1 ├── compile ├── config.nims ├── cpugpuarray.nim ├── demo1.rst ├── ex1.nim ├── ex2.nim ├── gpuarray.nim └── linalg.nim ├── demo2 ├── .gitignore ├── coalesced.nim ├── compile ├── config.nims ├── cpugpuarray.nim ├── demo1.rst ├── ex1.nim ├── ex2.nim ├── gpuarray.nim └── linalg.nim ├── demo3 ├── .gitignore ├── bench ├── bench_cpu ├── ccwrapper ├── coalesced.nim ├── compile ├── compile_cpu ├── config.nims ├── cpugpuarray.nim ├── doc │ ├── PP-Nim-metaprogramming-DOE-COE-PP-2017.pdf │ ├── bandwidth_knl.gp │ ├── bandwidth_knl.pdf │ ├── bandwidth_knl.tex │ ├── bandwidth_p100.gp │ ├── bandwidth_p100.pdf │ ├── bandwidth_p100.tex │ ├── readme.html │ └── readme.org ├── ex1.nim ├── ex2.nim ├── gpuarray.nim ├── linalg.nim ├── llbits.h ├── out │ ├── bludhaven.ex2 │ ├── bludhaven.info │ ├── kingly.ex2 │ ├── kingly.info │ ├── neddy.ftm.alcf.anl.gov.ex2 │ └── neddy.ftm.alcf.anl.gov.info ├── qexLite │ ├── alignedMem.nim │ ├── comms │ │ ├── comms.nim │ │ ├── commsQmp.nim │ │ └── qmp.nim │ ├── config.nims │ ├── layout.nim │ ├── layout │ │ ├── layoutX.nim │ │ ├── qgather.c │ │ ├── qlayout.c │ │ ├── qlayout.h │ │ ├── qshifts.c │ │ └── shifts.nim │ ├── metaUtils.nim │ ├── omp.nim │ ├── simd.nim │ ├── simd │ │ ├── simdArray.nim │ │ ├── simdAvx.cnim │ │ ├── simdAvx.nim │ │ ├── simdAvx512.cnim │ │ ├── simdAvx512.nim │ │ ├── simdQpx.nim │ │ ├── simdSse.cnim │ │ ├── simdSse.nim │ │ ├── simdX86.nim │ │ ├── simdX86Ops.nim │ │ ├── simdX86Ops1.nim │ │ └── simdX86Types.nim │ ├── stdUtils.nim │ └── threading.nim ├── timing.nim └── vectorized.nim ├── expr.nim ├── genkernel.nim ├── inline.nim ├── opts.c2nim ├── runc2nim └── test ├── config.nims ├── test ├── tinline000.nim ├── tinline001.nim ├── tinline002.nim ├── tinline003.nim ├── tinline004.nim ├── tinline005.nim ├── tinline006.nim ├── tinline007.nim ├── tinline008.nim ├── tinline009.nim ├── tinline010.nim ├── tinline011.nim ├── tinline012.nim ├── tinline013.nim └── 
tinline014.nim /.gitignore: -------------------------------------------------------------------------------- 1 | /cuda 2 | /expr 3 | /genkernel 4 | /inline 5 | nimcache/ 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cudanim 2 | CUDA for Nim 3 | 4 | initial proof of concept 5 | -------------------------------------------------------------------------------- /build: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #nvcc="/usr/local/cuda/bin/nvcc" 4 | #$nvcc vectorAdd.cu 5 | 6 | #nim cpp -c -d:release cuda 7 | 8 | #/usr/local/cuda/bin/nvcc -I$HOME/tmp/nim/Nim/lib nimcache/cuda.cpp 9 | 10 | #nim cpp -d:release cuda 11 | 12 | #nim cpp -d:release vectorAdd 13 | nim cpp -d:release genkernel 14 | -------------------------------------------------------------------------------- /config.nims: -------------------------------------------------------------------------------- 1 | switch("cc", "gcc") 2 | switch("gcc.cpp.exe", "/usr/local/cuda/bin/nvcc") 3 | switch("gcc.cpp.linkerexe", "/usr/local/cuda/bin/nvcc") 4 | #switch("gcc.cpp.options.always", "--x cu -ccbin=gcc-5") 5 | switch("gcc.cpp.options.always", "--x cu") 6 | switch("gcc.cpp.options.speed", "-O3") 7 | -------------------------------------------------------------------------------- /cuda.nim: -------------------------------------------------------------------------------- 1 | import macros 2 | import inline 3 | import expr 4 | 5 | #macro dumpType(x:typed): auto = 6 | # result = newEmptyNode() 7 | # echo x.getType.treerepr 8 | proc addChildrenFrom*(dst,src: NimNode): NimNode = 9 | for c in src: dst.add(c) 10 | result = dst 11 | macro procInst*(p: typed): auto = 12 | #echo "begin procInst:" 13 | #echo p.treerepr 14 | result = p[0] 15 | macro makeCall*(p: proc, x: tuple): NimNode = 16 | result = newCall(p).addChildrenFrom(x) 17 | 18 
| type 19 | CudaDim3* {.importc:"dim3",header:"cuda_runtime.h".} = object 20 | x*, y*, z*: cint 21 | cudaError_t* {.importc,header:"cuda_runtime.h".} = object 22 | cudaMemcpyKind* {.importc,header:"cuda_runtime.h".} = object 23 | var 24 | cudaSuccess*{.importC,header:"cuda_runtime.h".}: cudaError_t 25 | cudaErrorNotSupported*{.importC,header:"cuda_runtime.h".}: cudaError_t 26 | cudaMemcpyHostToDevice*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind 27 | cudaMemcpyDeviceToHost*{.importC,header:"cuda_runtime.h".}: cudaMemcpyKind 28 | 29 | #template toPointer*(x: pointer): pointer = x 30 | #template toPointer*[T](x: ptr T): pointer = pointer(x) 31 | #template toPointer*(x: seq): pointer = toPointer(x[0]) 32 | #template toPointer*(x: not (pointer|seq)): pointer = pointer(unsafeAddr(x)) 33 | template toPointer*(x: typed): pointer = 34 | #dumpType: x 35 | when x is pointer: x 36 | elif x is ptr: x 37 | elif x is seq: toPointer(x[0]) 38 | else: pointer(unsafeAddr(x)) 39 | template dataAddr*(x: typed): pointer = 40 | #dumpType: x 41 | when x is seq: dataAddr(x[0]) 42 | elif x is array: dataAddr(x[0]) 43 | #elif x is ptr: x 44 | else: pointer(unsafeAddr(x)) 45 | #else: x 46 | 47 | proc cudaGetLastError*(): cudaError_t 48 | {.importC,header:"cuda_runtime.h".} 49 | proc cudaGetErrorStringX*(error: cudaError_t): ptr char 50 | {.importC:"cudaGetErrorString",header:"cuda_runtime.h".} 51 | proc cudaGetErrorString*(error: cudaError_t): cstring = 52 | var s {.codegendecl:"const $# $#".} = cudaGetErrorStringX(error) 53 | result = s 54 | proc `$`*(error: cudaError_t): string = 55 | let s = cudaGetErrorString(error) 56 | result = $s 57 | converter toBool*(e: cudaError_t): bool = 58 | cast[cint](e) != cast[cint](cudaSuccess) 59 | 60 | proc cudaMalloc*(p:ptr pointer, size: csize): cudaError_t 61 | {.importC,header:"cuda_runtime.h".} 62 | template cudaMalloc*(p:pointer, size: csize): cudaError_t = 63 | cudaMalloc((ptr pointer)(p.addr), size) 64 | proc cudaFree*(p: pointer): 
cudaError_t 65 | {.importC,header:"cuda_runtime.h".} 66 | proc cudaMallocManaged*(p: ptr pointer, size: csize): cudaError_t 67 | {.importC,header:"cuda_runtime.h".} 68 | 69 | proc cudaMemcpyX*(dst,src: pointer, count: csize, kind: cudaMemcpyKind): 70 | cudaError_t {.importC:"cudaMemcpy",header:"cuda_runtime.h".} 71 | template cudaMemcpy*(dst,src: typed, count: csize, 72 | kind: cudaMemcpyKind): cudaError_t = 73 | let pdst = toPointer(dst) 74 | let psrc = toPointer(src) 75 | cudaMemcpyX(pdst, psrc, count, kind) 76 | 77 | proc cudaLaunchKernel(p:pointer, gd,bd: CudaDim3, args: ptr pointer): 78 | cudaError_t {.importC,header:"cuda_runtime.h".} 79 | 80 | proc cudaDeviceReset*(): cudaError_t 81 | {.importC,header:"cuda_runtime.h".} 82 | proc cudaDeviceSynchronize*(): cudaError_t 83 | {.importC,header:"cuda_runtime.h".} 84 | 85 | #proc printf*(fmt:cstring):cint {.importc,varargs,header:"",discardable.} 86 | #proc fprintf*(stream:ptr FILE,fmt:cstring):cint {.importc,varargs,header:"".} 87 | #proc malloc*(size: csize):pointer {.importc,header:"".} 88 | 89 | template cudaDefs(body: untyped): untyped {.dirty.} = 90 | var gridDim{.global,importC,noDecl.}: CudaDim3 91 | var blockIdx{.global,importC,noDecl.}: CudaDim3 92 | var blockDim{.global,importC,noDecl.}: CudaDim3 93 | var threadIdx{.global,importC,noDecl.}: CudaDim3 94 | template getGridDim: untyped {.used.} = gridDim 95 | template getBlockIdx: untyped {.used.} = blockIdx 96 | template getBlockDim: untyped {.used.} = blockDim 97 | template getThreadIdx: untyped {.used.} = threadIdx 98 | template getThreadNum: untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x 99 | template getNumThreads: untyped {.used.} = gridDim.x * blockDim.x 100 | bind inlineProcs 101 | inlineProcs: 102 | body 103 | 104 | template cudaLaunch*(p: proc; blocksPerGrid,threadsPerBlock: SomeInteger; 105 | arg: varargs[pointer,dataAddr]) = 106 | var pp: proc = p 107 | var gridDim, blockDim: CudaDim3 108 | gridDim.x = blocksPerGrid 109 | gridDim.y = 1 
110 | gridDim.z = 1 111 | blockDim.x = threadsPerBlock 112 | blockDim.y = 1 113 | blockDim.z = 1 114 | var args: array[arg.len, pointer] 115 | for i in 0..>`*(px: tuple, y: any): auto = 133 | #echo "begin >>:" 134 | #echo px.treerepr 135 | #echo "kernel type:" 136 | #echo px[0].getTypeImpl.treerepr 137 | #echo "kernel args:" 138 | #echo y.treerepr 139 | #var a = y 140 | #if y.kind != nnkPar: a = newNimNode(nnkPar).addChildrenFrom(y) 141 | result = newCall(ident("cudaLaunch")) 142 | let krnl = newCall(px[0]).addChildrenFrom(y) 143 | #echo "kernel inst call:" 144 | #echo krnl.treerepr 145 | result.add getAst(getInst(krnl))[0] 146 | result.add px[1][0] 147 | result.add px[1][1] 148 | for c in y: 149 | result.add c 150 | #echo "kernel launch body:" 151 | #echo result.treerepr 152 | 153 | macro cuda*(s,p: untyped): auto = 154 | #echo "begin cuda:" 155 | #echo s.treerepr 156 | let ss = s.strVal 157 | #echo "proc:" 158 | #echo p.treerepr 159 | p.expectKind nnkProcDef 160 | result = p 161 | # if p.kind == nnkProcDef: 162 | # result = p 163 | # else: 164 | # result = p[0] 165 | result.addPragma parseExpr("{.codegenDecl:\""&ss&" $# $#$#\".}")[0] 166 | result.body = getAst(cudaDefs(result.body)) 167 | var sl = newStmtList() 168 | sl.add( quote do: 169 | {.push checks: off.} 170 | {.push stacktrace: off.} ) 171 | sl.add result 172 | result = sl 173 | #echo "end cuda:" 174 | #echo result.treerepr 175 | template cudaGlobal*(p: untyped): auto = cuda("__global__",p) 176 | 177 | template onGpu*(nn,tpb: untyped, body: untyped): untyped = 178 | block: 179 | var v = packVars(body, getGpuPtr) 180 | type ByCopy {.bycopy.} [T] = object 181 | d: T 182 | proc kern(xx: ByCopy[type(v)]) {.cudaGlobal.} = 183 | template deref(k: int): untyped = xx.d[k] 184 | substVars(body, deref) 185 | let ni = nn.int32 186 | let threadsPerBlock = tpb.int32 187 | let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock 188 | #echo "launching kernel" 189 | cudaLaunch(kern, blocksPerGrid, threadsPerBlock, 
v) 190 | discard cudaDeviceSynchronize() 191 | template onGpu*(nn: untyped, body: untyped): untyped = onGpu(nn, 64, body) 192 | template onGpu*(body: untyped): untyped = onGpu(512*64, 64, body) 193 | 194 | when isMainModule: 195 | type FltArr = array[0,float32] 196 | proc vectorAdd*(A: FltArr; B: FltArr; C: var FltArr; n: int32) 197 | {.cudaGlobal.} = 198 | var i = blockDim.x * blockIdx.x + threadIdx.x 199 | if i < n: 200 | C[i] = A[i] + B[i] 201 | 202 | proc test = 203 | var n = 50000.cint 204 | var 205 | a = newSeq[float32](n) 206 | b = newSeq[float32](n) 207 | c = newSeq[float32](n) 208 | var threadsPerBlock: cint = 256 209 | var blocksPerGrid: cint = (n + threadsPerBlock - 1) div threadsPerBlock 210 | 211 | cudaLaunch(vectorAdd, blocksPerGrid, threadsPerBlock, a, b, c, n) 212 | 213 | template getGpuPtr(x: int): untyped = x 214 | template getGpuPtr[T](x: seq[T]): untyped = addr(x[0]) 215 | template `[]`(x: ptr SomeNumber, i: SomeInteger): untyped {.used.} = 216 | cast[ptr array[0,type(x[])]](x)[][i] 217 | template `[]=`(x: ptr SomeNumber, i: SomeInteger, y:untyped): untyped {.used.} = 218 | cast[ptr array[0,type(x[])]](x)[][i] = y 219 | onGpu(n): 220 | let i = getBlockDim().x * getBlockIdx().x + getThreadIdx().x 221 | if i < n: 222 | c[i] = a[i] + b[i] 223 | 224 | test() 225 | -------------------------------------------------------------------------------- /demo1/compile: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | f="$1" 4 | if [ -z "$f" ]; then 5 | f="ex1.nim" 6 | fi 7 | 8 | nim cpp -d:release $f 9 | -------------------------------------------------------------------------------- /demo1/config.nims: -------------------------------------------------------------------------------- 1 | switch("cc", "gcc") 2 | switch("gcc.cpp.exe", "/usr/local/cuda/bin/nvcc") 3 | switch("gcc.cpp.linkerexe", "/usr/local/cuda/bin/nvcc") 4 | switch("gcc.cpp.options.always", "--x cu") 5 | switch("gcc.cpp.options.speed", "-O3 
-Xcompiler -march=native,-fPIC") 6 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -mcpu=native,-mtune=native,-fPIC") 7 | # switch("gcc.cpp.options.always", "--x cu -ccbin=g++-4.9") 8 | # switch("gcc.cpp.options.speed", "-O3 -Xcompiler -march=haswell,-fPIC") 9 | 10 | #switch("gcc.cpp.options.speed", "-O3 -march=haswell") 11 | -------------------------------------------------------------------------------- /demo1/cpugpuarray.nim: -------------------------------------------------------------------------------- 1 | import gpuarray 2 | export gpuarray 3 | import macros 4 | import ../cuda 5 | export cuda 6 | import ../expr 7 | import linalg 8 | export linalg 9 | include system/ansi_c 10 | 11 | #template onGpu*(x: untyped): untyped = x 12 | #template onGpu*(a,b,x: untyped): untyped = x 13 | 14 | type 15 | ArrayObj*[T] = object 16 | p*: ptr array[0,T] 17 | n*: int 18 | g*: GpuArrayObj[T] 19 | lastOnGpu*: bool 20 | unifiedMem*: bool 21 | ArrayRef*[T] = ref ArrayObj[T] 22 | Array*[T] = ArrayRef[T] 23 | Arrays* = ArrayObj | ArrayRef 24 | Arrays2* = ArrayObj | ArrayRef 25 | Arrays3* = ArrayObj | ArrayRef 26 | 27 | proc init[T](r: var ArrayObj[T], n: int) = 28 | var p: ptr T 29 | r.unifiedMem = true 30 | if r.unifiedMem: 31 | let err = cudaMallocManaged(cast[ptr pointer](addr p), n*sizeof(T)) 32 | else: 33 | p = createSharedU(T, n) 34 | r.n = n 35 | r.p = cast[type(r.p)](p) 36 | proc init[T](r: var ArrayRef[T], n: int) = 37 | r.new 38 | r[].init(n) 39 | 40 | proc newArrayObj[T](r: var ArrayObj[T], n: int) = 41 | r.init(n) 42 | proc newArrayObj[T](n: int): ArrayObj[T] = 43 | result.init(n) 44 | 45 | proc newArrayRef*[T](r: var ArrayRef[T], n: int) = 46 | r.init(n) 47 | proc newArrayRef*[T](n: int): ArrayRef[T] = 48 | result.init(n) 49 | 50 | proc toGpu*(x: var Arrays) = 51 | if x.unifiedMem: 52 | if x.g.n==0: 53 | x.g.n = x.n 54 | x.g.p = cast[type(x.g.p)](x.p) 55 | else: 56 | if not x.lastOnGpu: 57 | x.lastOnGpu = true 58 | if x.g.n==0: x.g.init(x.n) 59 | let err = 
cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice) 60 | if err: echo err 61 | 62 | proc toCpu*(x: var Arrays) = 63 | if not x.unifiedMem: 64 | if x.lastOnGpu: 65 | x.lastOnGpu = false 66 | let err = cudaMemcpy(x.p, x.g.p, x.n*sizeof(x.T), cudaMemcpyDeviceToHost) 67 | if err: echo err 68 | 69 | template getGpuPtr*(x: var Arrays): untyped = 70 | toGpu(x) 71 | x.g 72 | 73 | template indexArray*(x: Arrays, i: SomeInteger): untyped = 74 | x.p[][i] 75 | #template `[]=`(x: ArrayObj, i: SomeInteger, y: untyped): untyped = 76 | # x.p[][i] = y 77 | 78 | macro indexArray*(x: Arrays{call}, y: SomeInteger): untyped = 79 | #echo "call[", y.repr, "]" 80 | #echo x.treerepr 81 | #if siteLocalsField.contains($x[0]): 82 | result = newCall(ident($x[0])) 83 | for i in 1..", varargs, discardable.} 176 | 177 | when isMainModule: 178 | var N = 100 179 | 180 | proc testfloat = 181 | var x = newArrayRef[float32](N) 182 | var y = newArrayRef[float32](N) 183 | var z = newArrayRef[float32](N) 184 | x := 1 185 | y := 2 186 | z := 3 187 | x += y * z 188 | if (x.n-1) mod getNumThreads() == getThreadNum(): 189 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 190 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 191 | onGpu(1,32): 192 | x += y * z 193 | if (x.n-1) mod getNumThreads() == getThreadNum(): 194 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 195 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 196 | x.toCpu 197 | if (x.n-1) mod getNumThreads() == getThreadNum(): 198 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 199 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 200 | testfloat() 201 | 202 | proc testcomplex = 203 | var x = newArrayRef[Complex[float32]](N) 204 | var y = newArrayRef[Complex[float32]](N) 205 | var z = newArrayRef[Complex[float32]](N) 206 | x := 1 207 | y := 2 208 | z := 3 209 | x += y * z 210 | if (x.n-1) mod getNumThreads() == getThreadNum(): 211 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 212 | cprintf("x[%i]: %g\n", 
x.n-1, x[x.n-1].re) 213 | 214 | onGpu: 215 | x += y * z 216 | x += 1 217 | 218 | x += y * z 219 | if (x.n-1) mod getNumThreads() == getThreadNum(): 220 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 221 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re) 222 | testcomplex() 223 | 224 | proc testcolmat = 225 | var x = newArrayRef[Colmat[float32]](N) 226 | var y = newArrayRef[Colmat[float32]](N) 227 | var z = newArrayRef[Colmat[float32]](N) 228 | x := 1 229 | y := 2 230 | z := 3 231 | x += y * z 232 | if (x.n-1) mod getNumThreads() == getThreadNum(): 233 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 234 | cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re) 235 | 236 | onGpu(N): 237 | x += y * z 238 | 239 | x += y * z 240 | if (x.n-1) mod getNumThreads() == getThreadNum(): 241 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 242 | cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re) 243 | testcolmat() 244 | -------------------------------------------------------------------------------- /demo1/demo1.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | Portable expressions in Nim 3 | =========================== 4 | 5 | :Author: James C. Osborn 6 | 7 | .. contents:: 8 | 9 | Preliminaries 10 | ============= 11 | 12 | This document was created with Nim's built-in documentation generator. 13 | It can parse documentation comments in the source code and also process 14 | separate reStructuredText_ files. 15 | This document was made from a reStructuredText file using Nim's 16 | document generator to try it out and also take advantage of its Nim 17 | code highlighter. 18 | 19 | .. _reStructuredText: https://en.wikipedia.org/wiki/ReStructuredText 20 | 21 | Code portability in Nim 22 | ======================= 23 | 24 | Here's an example of the result (so far) 25 | 26 | .. 
code-block:: Nim 27 | 28 | import cpugpuarray 29 | 30 | let N = 1000 31 | var x = newColorMatrixArray(N) 32 | var y = newColorMatrixArray(N) 33 | var z = newColorMatrixArray(N) 34 | 35 | # set them to diagonal matrices on CPU 36 | x := 1 37 | y := 2 38 | z := 3 39 | 40 | # do something on CPU 41 | x += y * z 42 | 43 | # do something on GPU 44 | onGpu: 45 | x += y * z 46 | z := 4 47 | 48 | # do something on CPU again 49 | x += y * z 50 | 51 | if x[0][0,0].re == 21.0: 52 | echo "yay, it worked!" 53 | echo "do you agree, GPU?" 54 | 55 | onGpu: 56 | if getThreadNum()==0: 57 | if x[0][0,0].re == 21.0: 58 | printf("yes, I agree!\n") 59 | 60 | # outputs: 61 | # yay, it worked! 62 | # do you agree, GPU? 63 | # yes, I agree! 64 | 65 | 66 | The above can be compiled and run with 67 | 68 | :: 69 | 70 | nim cpp -d:release -r ex1.nim 71 | 72 | 73 | This is basically the main interface that the average user would need to 74 | deal with; the rest is just details for the curious. 75 | 76 | 77 | Implementation details 78 | ====================== 79 | 80 | The main container object in the example above is an array that can live 81 | on the CPU and also the GPU. This is defined as 82 | 83 | .. code-block:: Nim 84 | 85 | type 86 | ArrayObj*[T] = object 87 | p*: ptr array[0,T] 88 | n*: int 89 | g*: GpuArrayObj[T] 90 | lastOnGpu*: bool 91 | 92 | GpuArrayObj*[T] = object 93 | p*: ptr array[0,T] 94 | n*: int 95 | 96 | ``ArrayObj[T]`` is a generic array-like object parameterized on the type ``T``. 97 | This is similar to a templated type declaration in C++ with ``T`` being the template parameter (Nim uses ``[T]`` instead of ``<T>`` for generics). 98 | The ``*`` (star) after all the type and field names above means that they are exported from this module (file). 99 | They will be visible to another module that ``import``'s this module (otherwise they would be private to this module).
100 | 101 | The ``ArrayObj`` contains four fields: 102 | 103 | - ``p``: which is a pointer (``ptr``) for the data on the host. \ 104 | This is implemented as a pointer to an array of length ``0`` \ 105 | with elements of type ``T`` for convenience. \ 106 | This should really be marked with an ``{.unchecked.}`` pragma to prevent \ 107 | bounds checking in debug mode (bounds checks are off by default in release mode). 108 | - ``n``: the number of elements in the array. 109 | - ``g``: a GPU array object, defined next. 110 | - ``lastOnGpu``: a Boolean that tells us which pointer is valid. 111 | 112 | The ``GpuArrayObj`` is similar to ``ArrayObj``, but just contains a pointer \ 113 | (which will hold a GPU pointer) and the number of elements. 114 | This is the object we will pass to the GPU, so it contains a copy of the \ 115 | length for convenience. 116 | 117 | 118 | Offloading 119 | ========== 120 | 121 | The offload magic happens in the ``onGpu:`` block. 122 | It is defined like 123 | 124 | .. code-block:: Nim 125 | 126 | # the default total threads (nn=32*256) and threads per block (tpb=256) 127 | # are just for testing, they really should be an educated 128 | # guess made from querying the device 129 | template onGpu*(body: untyped): untyped = onGpu(32*256, 256, body) 130 | 131 | This launches a CUDA kernel using the default number of threads and threads \ 132 | per block. Right now they are hard-coded, but should really come from \ 133 | querying the device (or let the user specify some global default). 134 | 135 | One can override the defaults for a call by explicitly specifying them 136 | 137 | .. code-block:: Nim 138 | 139 | onGpu(x.n, 128): 140 | x += y * z 141 | z := 4 142 | 143 | This would launch one (virtual) thread per element of the array ``x`` and use 144 | 128 threads per block. 145 | 146 | The CUDA kernel gets created here 147 | 148 | .. 
code-block:: Nim 149 | 150 | template onGpu*(nn,tpb: untyped, body: untyped): untyped = 151 | block: 152 | var v = packVars(body, getGpuPtr) 153 | type myt {.bycopy.} = object 154 | d: type(v) 155 | proc kern(xx: myt) {.cudaGlobal.} = 156 | template deref(k: int): untyped = xx.d[k] 157 | substVars(body, deref) 158 | let ni = nn.int32 159 | let threadsPerBlock = tpb.int32 160 | let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock 161 | cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v) 162 | discard cudaDeviceSynchronize() 163 | 164 | This starts a new block scope (``block:``), similar to ``{...}`` in C. 165 | This is done to isolate the defined kernel (``proc kern ...``) from other \ 166 | ``onGpu`` blocks. 167 | 168 | The first major task is to examine the body of the ``onGpu`` block and \ 169 | extract the variables that are used. 170 | This is done by the ``packVars`` macro. 171 | It walks the syntax tree of the code block passed in and keeps track of \ 172 | the (unique) variables it references. 173 | It then spits out a data structure (a tuple_) containing those variables. 174 | It wraps each variable in a call to the function name that was passed in \ 175 | (in this case ``getGpuPtr``). 176 | For the example above, this line would get expanded to 177 | 178 | .. _tuple: https://nim-lang.org/docs/manual.html#types-tuples-and-object-types 179 | 180 | .. code-block:: Nim 181 | 182 | var v = (getGpuPtr(x), getGpuPtr(y), getGpuPtr(z)) 183 | 184 | The function ``getGpuPtr`` can then be defined independently for each type \ 185 | to return a valid GPU object (it actually doesn't have to be a pointer as we'll see next). 186 | For the ``ArrayObj`` type it is defined as 187 | 188 | .. code-block:: Nim 189 | 190 | template getGpuPtr*(x: var ArrayObj): untyped = 191 | toGpu(x) 192 | x.g 193 | 194 | This copies the data to the GPU (if necessary) and then returns the \ 195 | ``GpuArrayObj`` containing the GPU pointer and the length of the array.
196 | This is a (small) object residing in CPU memory, and the CUDA library \ 197 | takes care of copying it to the GPU when passed as an argument. 198 | 199 | Copying the data to the GPU is handled by 200 | 201 | .. code-block:: Nim 202 | 203 | proc toGpu*(x: var ArrayObj) = 204 | if not x.lastOnGpu: 205 | x.lastOnGpu = true 206 | if x.g.n==0: x.g.init(x.n) 207 | let err = cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice) 208 | if err: echo err 209 | 210 | Here we check if this array was last used on the GPU. 211 | If not, we check if it has been initialized yet (``x.g.n==0``) and \ 212 | initialize it if not (which will call cudaMalloc). 213 | We then copy the CPU memory to GPU memory. 214 | Here we could also translate the layout if we wanted. 215 | 216 | Currently I am not distinguishing between read access and write access. 217 | Making that distinction could lead to further optimization. 218 | It should be possible to modify the existing methods to handle that too. 219 | 220 | Next we create the CUDA kernel (``kern``). 221 | The kernel is defined here 222 | 223 | .. code-block:: Nim 224 | 225 | proc kern(xx: myt) {.cudaGlobal.} = 226 | template deref(k: int): untyped = xx.d[k] 227 | substVars(body, deref) 228 | 229 | This is a function taking one argument (which contains the packed \ 230 | ``GpuArrayObj``'s or any other objects used by the expressions). 231 | I originally wrote the procedure definition as 232 | 233 | .. code-block:: Nim 234 | 235 | proc kern(xx: type(v)) {.cudaGlobal.} = 236 | template deref(k: int): untyped = xx[k] 237 | substVars(body, deref) 238 | 239 | but found that Nim decided in some cases to pass the argument of \ 240 | ``kern`` (``xx``) as a pointer, instead of by value. 241 | Nim does this to optimize function calls when it feels it is safe to do so.
242 | To prevent this I wrapped the tuple in another object type (``myt``) that \ 243 | is explicitly declared ``{.bycopy.}``, so that Nim will always pass it by \ 244 | value (which makes a copy). 245 | 246 | In retrospect, another approach may have been to mark the procedure as \ 247 | ``{.exportC.}``, which will also prevent Nim from changing the calling \ 248 | conventions. I would then need to make the procedure names ``kern`` unique \ 249 | on my own since Nim will also not perform name-mangling on ``{.exportC.}`` \ 250 | procedures. 251 | 252 | The main body of the kernel comes from the 253 | 254 | .. code-block:: Nim 255 | 256 | substVars(body, deref) 257 | 258 | macro. 259 | It works similarly to the ``packVars`` macro above, but this time it will \ 260 | identify the variables referenced in the code block and substitute them \ 261 | with a call to the provided function (``deref``) with an integer argument \ 262 | that specifies which position in the kernel argument tuple that variable \ 263 | is in. For the example above this would generate 264 | 265 | .. code-block:: Nim 266 | 267 | deref(0) += deref(1) * deref(2) 268 | deref(2) := 4 269 | 270 | The ``deref`` template then simply expands to the appropriate expression \ 271 | that refers to the kernel argument. 272 | 273 | The rest of the magic needed to transform this procedure into a valid CUDA \ 274 | kernel is handled in the macro ``cudaGlobal`` which is applied to the \ 275 | procedure as a pragma ``{.cudaGlobal.}``. 276 | It also performs function inlining, so that one can still call host functions \ 277 | from the device (and not have to worry about marking them with ``__device__``). 278 | I won't go into the details here. 279 | 280 | The main step left now is to launch the kernel 281 | 282 | ..
code-block:: Nim 283 | 284 | let ni = nn.int32 285 | let threadsPerBlock = tpb.int32 286 | let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock 287 | cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v) 288 | 289 | This selects the blocksPerGrid and threadsPerBlock to be used in the CUDA \ 290 | kernel, then launches the kernel ``kern`` with the argument tuple ``v``. 291 | 292 | Lastly, we synchronize. 293 | 294 | .. code-block:: Nim 295 | 296 | discard cudaDeviceSynchronize() 297 | 298 | This returns an error code, which I really should be checking instead \ 299 | of discarding. 300 | Nim requires you to explicitly discard a return value to be clear that you \ 301 | meant to ignore it and didn't just forget. 302 | We may be able to delay this until we actually use the fields again. 303 | 304 | 305 | Back and forth 306 | ============== 307 | 308 | To get the expression to evaluate correctly on the CPU again we \ 309 | also check on every assignment made on the CPU that the fields are \ 310 | updated there. So in the expression 311 | 312 | .. code-block:: Nim 313 | 314 | # do something on CPU again 315 | x += y * z 316 | 317 | the ``+=`` will do something like ``packVars``, but this time will generate \ 318 | statements containing ``toCpu`` calls on the used variables. 319 | 320 | To do 321 | ===== 322 | 323 | This is just a toy example. 324 | 325 | The next step is to get the vectorization working properly on the GPU \ 326 | arrays. 327 | The explicit copy allows us to use a different vectorization layout between \ 328 | the CPU and GPU. 329 | 330 | The examples here also need to be integrated with the existing ``thread:`` \ 331 | block in QEX_. 332 | One possibility is simply 333 | 334 | .. _QEX: https://github.com/jcosborn/qex 335 | 336 | .. 
code-block:: Nim 337 | 338 | threads: 339 | # do something on CPU 340 | x += y * z 341 | 342 | # do something on GPU 343 | onGpu: 344 | x += y * z 345 | z := 4 346 | 347 | # do something on CPU again 348 | x += y * z 349 | 350 | Other variants are also possible. 351 | -------------------------------------------------------------------------------- /demo1/ex1.nim: -------------------------------------------------------------------------------- 1 | import cpugpuarray 2 | 3 | let N = 100 4 | var x = newColorMatrixArray(N) 5 | var y = newColorMatrixArray(N) 6 | var z = newColorMatrixArray(N) 7 | 8 | # set them to diagonal matrices on CPU 9 | x := 1 10 | y := 2 11 | z := 3 12 | 13 | # do something on CPU 14 | x += y * z 15 | 16 | # do something on GPU 17 | onGpu: 18 | x += y * z 19 | z := 4 20 | 21 | # do something on CPU again 22 | x += y * z 23 | 24 | if x[0][0,0].re == 21.0: 25 | echo "yay, it worked!" 26 | echo "do you agree, GPU?" 27 | 28 | onGpu: 29 | if getThreadNum()==0: 30 | if x[0][0,0].re == 21.0: 31 | printf("yes, I agree!\n") 32 | 33 | # outputs: 34 | # yay, it worked! 35 | # do you agree, GPU? 36 | # yes, I agree! 
37 | -------------------------------------------------------------------------------- /demo1/ex2.nim: -------------------------------------------------------------------------------- 1 | import cpugpuarray 2 | include system/timers 3 | include system/ansi_c 4 | import strUtils 5 | 6 | proc test(N: int) = 7 | echo "=== N: ", N 8 | #var x = newFloatArray(N) 9 | #var y = newFloatArray(N) 10 | #var z = newFloatArray(N) 11 | #var x = newComplexArray(N) 12 | #var y = newComplexArray(N) 13 | #var z = newComplexArray(N) 14 | var x = newColorMatrixArray(N) 15 | var y = newColorMatrixArray(N) 16 | var z = newColorMatrixArray(N) 17 | 18 | var t0,t1: Ticks 19 | template tic = 20 | t0 = getTicks() 21 | template toc = 22 | t1 = getTicks() 23 | #echo "nanos: ", formatFloat((t1-t0).float, precision=0) 24 | cprintf("nanos: %9i\n", t1-t0) 25 | #cprintf("GF/s: %9.3f\n", (2*N).float/(t1-t0).float) 26 | #cprintf("GF/s: %9.3f\n", (8*N).float/(t1-t0).float) 27 | cprintf("GF/s: %9.3f\n", (3*72*N).float/(t1-t0).float) 28 | 29 | # set them to diagonal matrices on CPU 30 | x := 1 31 | y := 2 32 | z := 3 33 | 34 | # do something on CPU 35 | tic() 36 | x += y * z 37 | toc() 38 | tic() 39 | x += y * z 40 | toc() 41 | #for i in 1..10000: 42 | # tic() 43 | # x += y * z 44 | # toc() 45 | 46 | var s = 1.0'f32 47 | template getGpuPtr(x: float): float = x 48 | # do something on GPU 49 | echo "GPU1" 50 | tic() 51 | #onGpu: 52 | onGpu(2*768,64): 53 | #var t = s 54 | x += y * z 55 | #if ff(): discard 56 | #z := 4 57 | toc() 58 | echo "GPU2" 59 | tic() 60 | onGpu(2*768,64): 61 | x += y * z 62 | # #z := 4 63 | toc() 64 | 65 | # do something on CPU again 66 | tic() 67 | x += y * z 68 | toc() 69 | tic() 70 | x += y * z 71 | toc() 72 | 73 | #if x[0][0,0].re == 21.0: 74 | # echo "yay, it worked!" 75 | # echo "do you agree, GPU?" 76 | 77 | #onGpu: 78 | # if getThreadNum()==0: 79 | # if x[0][0,0].re == 21.0: 80 | # printf("yes, I agree!\n") 81 | 82 | # outputs: 83 | # yay, it worked! 84 | # do you agree, GPU? 
85 | # yes, I agree! 86 | 87 | var n = 1000 88 | while n<=1_000_000: 89 | test(n) 90 | n *= 10 91 | -------------------------------------------------------------------------------- /demo1/gpuarray.nim: -------------------------------------------------------------------------------- 1 | when not declared(haveCuda): 2 | const haveCuda = true 3 | 4 | when haveCuda: 5 | import ../cuda 6 | 7 | import macros 8 | include system/ansi_c 9 | import linalg 10 | 11 | type 12 | GpuArrayObj*[T] = object 13 | p*: ptr array[0,T] 14 | n*: int 15 | GpuArrayRef*[T] = ref GpuArrayObj[T] 16 | GpuArray*[T] = GpuArrayRef[T] 17 | GpuArrays* = GpuArrayObj | GpuArrayRef 18 | GpuArrays2* = GpuArrayObj | GpuArrayRef 19 | GpuArrays3* = GpuArrayObj | GpuArrayRef 20 | 21 | proc init*[T](r: var GpuArrayObj[T], n: int) = 22 | var p: pointer 23 | when haveCuda: 24 | let err = cudaMalloc(p.addr, n*sizeof(T)) 25 | if err: 26 | echo "alloc err: ", err 27 | quit(-1) 28 | else: 29 | p = createSharedU(T, n) 30 | r.n = n 31 | r.p = cast[type(r.p)](p) 32 | proc init[T](r: var GpuArrayRef[T], n: int) = 33 | r.new 34 | r[].init(n) 35 | 36 | proc newGpuArrayObj[T](r: var GpuArrayObj[T], n: int) = 37 | r.init(n) 38 | proc newGpuArrayObj[T](n: int): GpuArrayObj[T] = 39 | result.init(n) 40 | 41 | proc newGpuArrayRef[T](r: var GpuArrayRef[T], n: int) = 42 | r.init(n) 43 | proc newGpuArrayRef[T](n: int): GpuArrayRef[T] = 44 | result.init(n) 45 | 46 | template getGpuPtr(x: SomeNumber): untyped = x 47 | #template getGpuPtr(x: GpuArrayObj): untyped = x 48 | template getGpuPtr(x: GpuArrayRef): untyped = x[] 49 | #template getGpuPtr(x: GpuArrayRef): untyped = x.p 50 | #template getGpuPtr(x: GpuArrayRef): untyped = (p:x.p,n:x.n) 51 | 52 | template indexGpuArray(x: GpuArrays, i: SomeInteger): untyped = 53 | x.p[][i] 54 | template `[]=`(x: GpuArrayObj, i: SomeInteger, y: untyped): untyped = 55 | x.p[][i] = y 56 | 57 | macro indexGpuArray*(x: GpuArrays{call}, y: SomeInteger): untyped = 58 | #echo "call[", y.repr, "]" 59 | 
#echo x.treerepr 60 | #if siteLocalsField.contains($x[0]): 61 | result = newCall(ident($x[0])) 62 | for i in 1.. 64 | void check_size() {static_assert(ProvidedSize == RealSize, "newCoalesced got the wrong size!");} 65 | """.} 66 | proc newCoalesced*[T](p:ptr T, n:int):auto {.noinit.} = 67 | when compiles((const size = sizeof(T))): 68 | const size = sizeof(T) 69 | else: 70 | mixin structSize 71 | const size = structSize(T) 72 | {.emit:"check_size<`T`, `size`>();".} 73 | const N = size div (M*sizeof(RegisterWord)) 74 | when N*(M*sizeof(RegisterWord)) != size: {.fatal:"sizeof(T) must be divisible by memory word size."} 75 | if n mod V != 0: 76 | echo "Array length for Coalesced must be multiples of ",V 77 | quit 1 78 | Coalesced[T](p:p, n:n) 79 | proc `[]`*[T](x:Coalesced[T], i:int):auto = CoalescedObj[T](o:x, i:i) 80 | proc len*[T](x:Coalesced[T]):auto = x.n 81 | 82 | type 83 | RWA {.unchecked.} = array[0,RegisterWord] 84 | MWA {.unchecked.} = array[0,MemoryWord] 85 | 86 | proc copy(x:ptr MemoryWord, y:ptr RegisterWord, n:int) = # n is number of MemoryWord in x 87 | let 88 | x = cast[ptr MWA](x) 89 | y = cast[ptr RWA](y) 90 | for i in 0.. 1: 93 | x[i].b = y[M*i+1] 94 | when M > 2: 95 | x[i].c = y[M*i+2] 96 | x[i].d = y[M*i+3] 97 | proc copy(x:ptr RegisterWord, y:ptr MemoryWord, n:int) = # n is number of MemoryWord in y 98 | let 99 | x = cast[ptr RWA](x) 100 | y = cast[ptr MWA](y) 101 | for i in 0.. 
1: 104 | x[M*i+1] = y[i].b 105 | when M > 2: 106 | x[M*i+2] = y[i].c 107 | x[M*i+3] = y[i].d 108 | 109 | converter fromCoalesced*[T](x:CoalescedObj[T]):T {.noinit.} = 110 | mixin structSize 111 | const N = structSize(T) div (M*sizeof(RegisterWord)) 112 | let p = cast[ptr MWA](x.o.p) 113 | var m {.noinit.}: array[N,MemoryWord] 114 | for i in 0..>>>>> indexArray" 102 | # echo "call[", y.repr, "]" 103 | # echo x.treerepr 104 | #if siteLocalsField.contains($x[0]): 105 | result = newCall(ident($x[0])) 106 | for i in 1..", varargs, discardable.} 201 | 202 | when isMainModule: 203 | var N = 100 204 | 205 | proc testfloat = 206 | var x = newArrayRef[float32](N) 207 | var y = newArrayRef[float32](N) 208 | var z = newArrayRef[float32](N) 209 | x := 1 210 | y := 2 211 | z := 3 212 | x += y * z 213 | if (x.n-1) mod getNumThreads() == getThreadNum(): 214 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 215 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 216 | onGpu(1,32): 217 | x += y * z 218 | if (x.n-1) mod getNumThreads() == getThreadNum(): 219 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 220 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 221 | x.toCpu 222 | if (x.n-1) mod getNumThreads() == getThreadNum(): 223 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 224 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1]) 225 | testfloat() 226 | 227 | proc testcomplex = 228 | var x = newArrayRef[Complex[float32]](N) 229 | var y = newArrayRef[Complex[float32]](N) 230 | var z = newArrayRef[Complex[float32]](N) 231 | x := 1 232 | y := 2 233 | z := 3 234 | x += y * z 235 | if (x.n-1) mod getNumThreads() == getThreadNum(): 236 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 237 | cprintf("x[%i]: %g\n", x.n-1, x[x.n-1].re) 238 | 239 | onGpu: 240 | x += y * z 241 | x += 1 242 | 243 | x += y * z 244 | if (x.n-1) mod getNumThreads() == getThreadNum(): 245 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 246 | cprintf("x[%i]: %g\n", x.n-1, 
x[x.n-1].re) 247 | testcomplex() 248 | 249 | proc testcolmat = 250 | var x = newArrayRef[Colmat[3,float32]](N) 251 | var y = newArrayRef[Colmat[3,float32]](N) 252 | var z = newArrayRef[Colmat[3,float32]](N) 253 | x := 1 254 | y := 2 255 | z := 3 256 | x += y * z 257 | if (x.n-1) mod getNumThreads() == getThreadNum(): 258 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 259 | cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re) 260 | 261 | onGpu(N): 262 | x += y * z 263 | 264 | x += y * z 265 | if (x.n-1) mod getNumThreads() == getThreadNum(): 266 | cprintf("thread %i/%i\n", getThreadNum(), getNumThreads()) 267 | cprintf("x[%i][0,0]: %g\n", x.n-1, x[x.n-1].d[0][0].re) 268 | x.free 269 | y.free 270 | z.free 271 | testcolmat() 272 | -------------------------------------------------------------------------------- /demo2/demo1.rst: -------------------------------------------------------------------------------- 1 | =========================== 2 | Portable expressions in Nim 3 | =========================== 4 | 5 | :Author: James C. Osborn 6 | 7 | .. contents:: 8 | 9 | Preliminaries 10 | ============= 11 | 12 | This document was created with Nim's built-in documentation generator. 13 | It can parse documentation comments in the source code and also process 14 | separate reStructuredText_ files. 15 | This document was made from a reStructuredText file using Nim's 16 | document generator to try it out and also take advantage of its Nim 17 | code highlighter. 18 | 19 | .. _reStructuredText: https://en.wikipedia.org/wiki/ReStructuredText 20 | 21 | Code portability in Nim 22 | ======================= 23 | 24 | Here's an example of the result (so far) 25 | 26 | .. 
code-block:: Nim 27 | 28 | import cpugpuarray 29 | 30 | let N = 1000 31 | var x = newColorMatrixArray(N) 32 | var y = newColorMatrixArray(N) 33 | var z = newColorMatrixArray(N) 34 | 35 | # set them to diagonal matrices on CPU 36 | x := 1 37 | y := 2 38 | z := 3 39 | 40 | # do something on CPU 41 | x += y * z 42 | 43 | # do something on GPU 44 | onGpu: 45 | x += y * z 46 | z := 4 47 | 48 | # do something on CPU again 49 | x += y * z 50 | 51 | if x[0][0,0].re == 21.0: 52 | echo "yay, it worked!" 53 | echo "do you agree, GPU?" 54 | 55 | onGpu: 56 | if getThreadNum()==0: 57 | if x[0][0,0].re == 21.0: 58 | printf("yes, I agree!\n") 59 | 60 | # outputs: 61 | # yay, it worked! 62 | # do you agree, GPU? 63 | # yes, I agree! 64 | 65 | 66 | The above can be compiled and run with 67 | 68 | :: 69 | 70 | nim cpp -d:release -r ex1.nim 71 | 72 | 73 | This is basically the main interface that the average user would need to 74 | deal with, the rest is just details for the curious. 75 | 76 | 77 | Implementation details 78 | ====================== 79 | 80 | The main container object in the example above is an array that can live 81 | on the CPU and also the GPU. This is defined as 82 | 83 | .. code-block:: Nim 84 | 85 | type 86 | ArrayObj*[T] = object 87 | p*: ptr array[0,T] 88 | n*: int 89 | g*: GpuArrayObj[T] 90 | lastOnGpu*: bool 91 | 92 | GpuArrayObj*[T] = object 93 | p*: ptr array[0,T] 94 | n*: int 95 | 96 | ``ArrayObj[T]`` is a generic array-like object parameterized on the type ``T``. 97 | This is similar to a templated type declaration in C++ with ``T`` being the template parameter (Nim uses ``[T]`` instead of ``<T>`` for generics). 98 | The ``*`` (star) after all the type and field names above means that they are exported from this module (file). 99 | They will be visible to another module that ``import``'s this module (otherwise they would be private to this module). 
100 | 101 | The ``ArrayObj`` contains four fields: 102 | 103 | - ``p``: which is a pointer (``ptr``) for the data on the host. \ 104 | This is implemented as a pointer to an array of length ``0`` \ 105 | with elements of type ``T`` for convenience. \ 106 | This should really be marked with an ``{.unchecked.}`` pragma to prevent \ 107 | bounds checking in debug mode (bounds checks are off by default in release mode). 108 | - ``n``: the number of elements in the array. 109 | - ``g``: a GPU array object, defined next. 110 | - ``lastOnGpu``: a Boolean that tells us which pointer is valid. 111 | 112 | The ``GpuArrayObj`` is similar to ``ArrayObj``, but just contains a pointer \ 113 | (which will hold a GPU pointer) and the number of elements. 114 | This is the object we will pass to the GPU, so it contains a copy of the \ 115 | length for convenience. 116 | 117 | 118 | Offloading 119 | ========== 120 | 121 | The offload magic happens in the ``onGpu:`` block. 122 | It is defined like 123 | 124 | .. code-block:: Nim 125 | 126 | # the default total threads (nn=32*256) and threads per block (tpb=256) 127 | # are just for testing, they really should be an educated 128 | # guess made from querying the device 129 | template onGpu*(body: untyped): untyped = onGpu(32*256, 256, body) 130 | 131 | This launches a CUDA kernel using the default number of threads and threads \ 132 | per block. Right now they are hard-coded, but should really come from \ 133 | querying the device (or let the user specify some global default). 134 | 135 | One can override the defaults for a call by explicitly specifying them 136 | 137 | .. code-block:: Nim 138 | 139 | onGpu(x.n, 128): 140 | x += y * z 141 | z := 4 142 | 143 | This would launch one (virtual) thread per element of the array ``x`` and use 144 | 128 threads per block. 145 | 146 | The CUDA kernel gets created here 147 | 148 | .. 
code-block:: Nim 149 | 150 | template onGpu*(nn,tpb: untyped, body: untyped): untyped = 151 | block: 152 | var v = packVars(body, getGpuPtr) 153 | type myt {.bycopy.} = object 154 | d: type(v) 155 | proc kern(xx: myt) {.cudaGlobal.} = 156 | template deref(k: int): untyped = xx.d[k] 157 | substVars(body, deref) 158 | let ni = nn.int32 159 | let threadsPerBlock = tpb.int32 160 | let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock 161 | cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v) 162 | discard cudaDeviceSynchronize() 163 | 164 | This starts a new block scope (``block:``), similar to ``{...}`` in C. 165 | This is done to isolate the defined kernel (``proc kern ...``) from other \ 166 | ``onGpu`` blocks. 167 | 168 | The first major task is to examine the body of the ``onGpu`` block and \ 169 | extract the variables that are used. 170 | This is done by the ``packVars`` macro. 171 | It walks the syntax tree of the code block passed in and keeps track of \ 172 | the (unique) variables it references. 173 | It then spits out a data structure (a tuple_) containing those variables. 174 | It wraps each variable in a call to the function name that was passed in \ 175 | (in this case ``getGpuPtr``). 176 | For the example above, this line would get expanded to 177 | 178 | .. _tuple: https://nim-lang.org/docs/manual.html#types-tuples-and-object-types 179 | 180 | .. code-block:: Nim 181 | 182 | var v = (getGpuPtr(x), getGpuPtr(y), getGpuPtr(z)) 183 | 184 | The function ``getGpuPtr`` can then be defined independently for each type \ 185 | to return a valid GPU object (it actually doesn't have to be a pointer as we'll see next). 186 | For the ``ArrayObj`` type it is defined as 187 | 188 | .. code-block:: Nim 189 | 190 | template getGpuPtr*(x: var ArrayObj): untyped = 191 | toGpu(x) 192 | x.g 193 | 194 | This copies the data to the GPU (if necessary) and then returns the \ 195 | ``GpuArrayObj`` containing the GPU pointer and the length of the array. 
196 | This is a (small) object residing in CPU memory, and the CUDA library \ 197 | takes care of copying it to the GPU when passed as an argument. 198 | 199 | Copying the data to the GPU is handled by 200 | 201 | .. code-block:: Nim 202 | 203 | proc toGpu*(x: var ArrayObj) = 204 | if not x.lastOnGpu: 205 | x.lastOnGpu = true 206 | if x.g.n==0: x.g.init(x.n) 207 | let err = cudaMemcpy(x.g.p, x.p, x.n*sizeof(x.T), cudaMemcpyHostToDevice) 208 | if err: echo err 209 | 210 | Here we check if this array was last used on the GPU. 211 | If not we check if it has been initialized yet (``x.g.n==0``) and \ 212 | initialize it if not (which will call cudaMalloc). 213 | We then copy the CPU memory to GPU memory. 214 | Here we could also translate the layout if we wanted. 215 | 216 | Currently I am not distinguishing between read access and write access. 217 | This could lead to further optimization. 218 | It should be possible to modify the existing methods to handle that too. 219 | 220 | Next we create the CUDA kernel (``kern``). 221 | The kernel is defined here 222 | 223 | .. code-block:: Nim 224 | 225 | proc kern(xx: myt) {.cudaGlobal.} = 226 | template deref(k: int): untyped = xx.d[k] 227 | substVars(body, deref) 228 | 229 | This is a function taking one argument (which contains the packed \ 230 | ``GpuArrayObj``'s or any other objects used by the expressions). 231 | I originally wrote the procedure definition as 232 | 233 | .. code-block:: Nim 234 | 235 | proc kern(xx: type(v)) {.cudaGlobal.} = 236 | template deref(k: int): untyped = xx[k] 237 | substVars(body, deref) 238 | 239 | but found that Nim decided in some cases to pass the argument of \ 240 | ``kern`` (``xx``) as a pointer, instead of by value. 241 | Nim does this to optimize function calls when it feels it is safe to do so. 
242 | To prevent this I wrapped the tuple in another object type (``myt``) that \ 243 | is explicitly declared ``{.bycopy.}``, so that Nim will always pass it by \ 244 | value (which makes a copy). 245 | 246 | In retrospect, another approach may have been to mark the procedure as \ 247 | ``{.exportC.}``, which will also prevent Nim from changing the calling \ 248 | conventions. I would then need to make the procedure names ``kern`` unique \ 249 | on my own since Nim will also not perform name-mangling on ``{.exportC.}`` \ 250 | procedures. 251 | 252 | The main body of the kernel comes from the 253 | 254 | .. code-block:: Nim 255 | 256 | substVars(body, deref) 257 | 258 | macro. 259 | It works similarly to the ``packVars`` macro above, but this time it will \ 260 | identify the variables referenced in the code block and substitute them \ 261 | with a call to the provided function (``deref``) with an integer argument \ 262 | that specifies which position in the kernel argument tuple that variable \ 263 | is in. For the example above this would generate 264 | 265 | .. code-block:: Nim 266 | 267 | deref(0) += deref(1) * deref(2) 268 | deref(2) := 4 269 | 270 | The ``deref`` template then simply expands to the appropriate expression \ 271 | that refers to the kernel argument. 272 | 273 | The rest of the magic needed to transform this procedure into a valid CUDA \ 274 | kernel is handled in the macro ``cudaGlobal`` which is applied to the \ 275 | procedure as a pragma ``{.cudaGlobal.}``. 276 | It also performs function inlining, so that one can still call host functions \ 277 | from the device (and not have to worry about marking them with ``__device__``). 278 | I won't go into the details here. 279 | 280 | The main step left now is to launch the kernel 281 | 282 | .. 
code-block:: Nim 283 | 284 | let ni = nn.int32 285 | let threadsPerBlock = tpb.int32 286 | let blocksPerGrid = (ni+threadsPerBlock-1) div threadsPerBlock 287 | cudaLaunch(kern, blocksPerGrid, threadsPerBlock, v) 288 | 289 | This selects the blocksPerGrid and threadsPerBlock to be used in the CUDA \ 290 | kernel, then launches the kernel ``kern`` with the argument tuple ``v``. 291 | 292 | Lastly, we synchronize. 293 | 294 | .. code-block:: Nim 295 | 296 | discard cudaDeviceSynchronize() 297 | 298 | This returns an error code, which I really should be checking instead \ 299 | of discarding. 300 | Nim requires you to explicitly discard a return value to be clear that you \ 301 | meant to ignore it and didn't just forget. 302 | We may be able to delay this until we actually use the fields again. 303 | 304 | 305 | Back and forth 306 | ============== 307 | 308 | To get the expression to evaluate correctly on the CPU again we \ 309 | also check on every assignment made on the CPU that the fields are \ 310 | updated there. So in the expression 311 | 312 | .. code-block:: Nim 313 | 314 | # do something on CPU again 315 | x += y * z 316 | 317 | the ``+=`` will do something like ``packVars``, but this time will generate \ 318 | statements containing ``toCpu`` calls on the used variables. 319 | 320 | To do 321 | ===== 322 | 323 | This is just a toy example. 324 | 325 | The next step is to get the vectorization working properly on the GPU \ 326 | arrays. 327 | The explicit copy allows us to use a different vectorization layout between \ 328 | the CPU and GPU. 329 | 330 | The examples here also need to be integrated with the existing ``thread:`` \ 331 | block in QEX_. 332 | One possibility is simply 333 | 334 | .. _QEX: https://github.com/jcosborn/qex 335 | 336 | .. 
code-block:: Nim 337 | 338 | threads: 339 | # do something on CPU 340 | x += y * z 341 | 342 | # do something on GPU 343 | onGpu: 344 | x += y * z 345 | z := 4 346 | 347 | # do something on CPU again 348 | x += y * z 349 | 350 | Other variants are also possible. 351 | -------------------------------------------------------------------------------- /demo2/ex1.nim: -------------------------------------------------------------------------------- 1 | import cpugpuarray 2 | 3 | let N = 64 4 | var x = newColorMatrixArray(N) 5 | var y = newColorMatrixArray(N) 6 | var z = newColorMatrixArray(N) 7 | 8 | # set them to diagonal matrices on CPU 9 | x := 1 10 | y := 2 11 | z := 3 12 | 13 | # do something on CPU 14 | x += y * z 15 | 16 | # do something on GPU 17 | onGpu: 18 | x += y * z 19 | z := 4 20 | 21 | # do something on CPU again 22 | x += y * z 23 | 24 | if x[0][0,0].re == 21.0: 25 | echo "yay, it worked!" 26 | echo "do you agree, GPU?" 27 | else: 28 | echo x[0][0,0].re 29 | 30 | onGpu: 31 | if getThreadNum()==0: 32 | if x[0][0,0].re == 21.0: 33 | printf("yes, I agree!\n") 34 | 35 | # outputs: 36 | # yay, it worked! 37 | # do you agree, GPU? 38 | # yes, I agree! 39 | -------------------------------------------------------------------------------- /demo2/ex2.nim: -------------------------------------------------------------------------------- 1 | import cpugpuarray 2 | include system/timers 3 | include system/ansi_c 4 | import strUtils 5 | 6 | proc test(N: int) = 7 | var x = newColorMatrixArray(N) 8 | var y = newColorMatrixArray(N) 9 | var z = newColorMatrixArray(N) 10 | 11 | template timeit(s:string, b:untyped) = 12 | let R = 64 13 | let t0 = getTicks() 14 | for i in 0.. 
"out/$n.info" 8 | cat /proc/cpuinfo >> "out/$n.info" 9 | "$c/deviceQuery/deviceQuery" >> "out/$n.info" 10 | "$c/bandwidthTest/bandwidthTest" >> "out/$n.info" 11 | fi 12 | ./compile ex2 13 | ./ex2 > out/$n.ex2 14 | -------------------------------------------------------------------------------- /demo3/bench_cpu: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | n="$(hostname)" 3 | [[ -d out ]] || mkdir out 4 | if [[ ! -s out/$n.info ]];then 5 | gcc --version > "out/$n.info" 6 | numactl -H >> "out/$n.info" 7 | cat /proc/cpuinfo >> "out/$n.info" 8 | fi 9 | ./compile_cpu ex2 10 | numactl -m 1 ./ex2 > out/$n.ex2 11 | -------------------------------------------------------------------------------- /demo3/ccwrapper: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nvcc="/usr/local/cuda/bin/nvcc" 3 | #nvcc=nvcc 4 | cc=g++-4.8 5 | #args=(-arch sm_60 -ccbin $cc) 6 | args=(-arch sm_21 -ccbin $cc) 7 | if (($# == 1)) && [[ $1 == --version ]];then 8 | $nvcc "$@" 9 | $cc "$@" 10 | exit 11 | fi 12 | ccargs="" 13 | #verbosity=1 14 | forcc(){ 15 | case "$1" in 16 | "-pthread") return 0 ;; 17 | "-fopenmp") return 0 ;; 18 | *) return 1 ;; 19 | esac 20 | } 21 | ps(){ if ((verbosity>0));then printf "#";printf " '%s'" "$@";printf "\n";fi;"$@";ret=$?; } 22 | ex(){ # OUTFILE ERRFILE CMD [...] 
23 | local O="$1" E="$2"; shift 2 24 | { { ps "$@" | tee -a "$O"; } 2>&1 1>&3 | tee -a "$E"; } 3>&1 1>&2 25 | } 26 | while (($#>0));do 27 | if forcc "$1";then 28 | ccargs+=",'$1'" 29 | else 30 | args+=("$1") 31 | fi 32 | shift 33 | done 34 | ret=0 35 | ex ccwrapper.{out,err} "$nvcc" "${args[@]}" -Xcompiler "${ccargs#,}" 36 | exit $ret 37 | -------------------------------------------------------------------------------- /demo3/coalesced.nim: -------------------------------------------------------------------------------- 1 | #[ 2 | 3 | Following Nvidia's idea of coalesced_ptr in C++, we use a wrapper 4 | object type to hide the actual coalesced memory layout here. 5 | Original comments from Nvidia's coalesced_ptr.h follows: 6 | 7 | A smart pointer that automatically provide coalesced memory 8 | transcations for arrays of arbtrary structures. Given a structure 9 | T, of size S bytes, e.g., 10 | 11 | struct T { 12 | char a[S]; 13 | } 14 | 15 | in an array with sites elements 16 | 17 | T t[sites]; 18 | 19 | using a coalesced_ptr will split the structure for reading and 20 | writing to memory as an array of structures of array of structures (AoSoAoS), 21 | where: 22 | - the inner structure size is given by memory_word_size 23 | - the inner array size is given by site_vector 24 | - the outer structure size is given by sizeof(T)/memory_word_size 25 | - the outer array size is given by sites/site_vector 26 | 27 | ]# 28 | 29 | import macros, qexLite/metaUtils 30 | 31 | type 32 | Coalesced*[V,M:static[int],T] = object 33 | ## `V`: Inner array length. 34 | ## `M`: Number of RegisterWords in a MemoryWord, the granularity of memory transactions. 
35 | p*: ptr T ## pointer to an array of T 36 | n*: int ## the length of the array being coalesced 37 | CoalescedObj[V,M:static[int],T] = object 38 | o*: Coalesced[V,M,T] 39 | i*: int # the index to which we asks 40 | 41 | const llbits = currentSourcePath()[0..^14] & "llbits.h" 42 | type 43 | RegisterWord* {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 44 | MemoryWord1 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 45 | a*: array[1,RegisterWord] 46 | MemoryWord2 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 47 | a*: array[2,RegisterWord] 48 | MemoryWord4 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 49 | a*: array[4,RegisterWord] 50 | MemoryWord8 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 51 | a*: array[8,RegisterWord] 52 | MemoryWord16 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 53 | a*: array[16,RegisterWord] 54 | MemoryWord32 {.importc, header:llbits.} = object # Word fits in a register, 4 bytes for current GPU 55 | a*: array[32,RegisterWord] 56 | template MemoryWord(M:static[int]):untyped = 57 | when 1 == M: MemoryWord1 58 | elif 2 == M: MemoryWord2 59 | elif 4 == M: MemoryWord4 60 | elif 8 == M: MemoryWord8 61 | elif 16 == M: MemoryWord16 62 | elif 32 == M: MemoryWord32 63 | 64 | template sizeOf*(t:typedesc[RegisterWord]):int = 4 65 | 66 | # Nim doesn't know the size of any struct for sure without the help of a C/C++ compiler. 67 | # So we use a C++ compiler to check if the user has provided a correct size. 68 | # The following C++ code only works with c++11 or later. 
69 | {.emit:""" 70 | #if __cplusplus >= 201103L 71 | template 72 | void coalesced_check_size() {static_assert(ProvidedSize == RealSize, "newCoalesced got the wrong size!");} 73 | #else 74 | #define coalesced_check_size(type,size) typedef char ProvidedWrongSizeForType##type[2*!!(sizeof(type)==(size))-1] 75 | #endif 76 | """.} 77 | template getSize*(T:typedesc):untyped = 78 | when compiles((const size = sizeof(T))): 79 | const size = sizeof(T) 80 | else: 81 | mixin structSize 82 | const size = structSize(T) 83 | {.emit:""" 84 | #if __cplusplus >= 201103L 85 | coalesced_check_size<`T`,`size`>(); 86 | #else 87 | coalesced_check_size(`T`,`size`); 88 | #endif 89 | """.} 90 | size 91 | 92 | # Nim bug as of 8/7/2017, cannot overload init/newCoalesced. 93 | # Overloaded type matching would SIGSEGV. 94 | proc initCoalesced*[V,M:static[int],T](x:var Coalesced[V,M,T], p:ptr T, n:int) = 95 | const 96 | size = getSize(T) 97 | N = size div (M*sizeof(RegisterWord)) 98 | when N*(M*sizeof(RegisterWord)) != size: {.fatal:"sizeof(T) must be divisible by memory word size."} 99 | if n mod V != 0: 100 | echo "Array length for Coalesced must be multiples of V = ",V 101 | quit 1 102 | x.p = p 103 | x.n = n 104 | proc newCoalesced*[T](V,M:static[int], p:ptr T, n:int):auto {.noinit.} = 105 | var r {.noinit.}:Coalesced[V,M,T] 106 | r.initCoalesced(p,n) 107 | r 108 | 109 | template `[]`*(x:Coalesced, ix:int):untyped = CoalescedObj[x.V,x.M,x.T](o:x, i:ix) 110 | template len*(x:Coalesced):untyped = x.n 111 | 112 | template fromCoalesced*(x:CoalescedObj):untyped = 113 | const N = getSize(x.T) div (x.M*sizeof(RegisterWord)) 114 | type A {.unchecked.}= ptr array[0,MemoryWord(x.M)] 115 | var r {.noinit.}: x.T 116 | let offset = (x.i div x.V)*N*x.V + x.i mod x.V 117 | staticfor j, 0, N-1: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V] 118 | #for j in 0.. 7 | #+HTML_HEAD_EXTRA: 14 | #+HTML_HEAD_EXTRA: 15 | 16 | * Code portability in Nim 17 | 18 | Here's a benchmark example. 
19 | 20 | #+BEGIN_SRC nim -n 21 | import timing, cpugpuarray, qexLite/metaUtils, math 22 | 23 | proc test(vecLen, memLen: static[int]; N: int) = 24 | var 25 | x = newColorMatrixArray(vecLen,memLen,N) # array of N 3x3 single prec complex matrices 26 | y = newColorMatrixArray(vecLen,memLen,N) 27 | z = newColorMatrixArray(vecLen,memLen,N) 28 | rep = 0 # accumulates the number of runs 29 | 30 | let 31 | mr = float(3 * 8 * x.T.N * x.T.N * N) / float(1024 * 1024 * 1024) # Resident memory in 2^30 bytes 32 | mt = 4 * mr / 3 # Memory transaction 33 | fp = float(8 * x.T.N * x.T.N * x.T.N * N) * 1e-9 # Floating point op / 10^9 34 | template timeit(label:string, s:untyped) = 35 | var 36 | R {.global.}:int 37 | T {.global.}:float 38 | threadSingle: 39 | R = 128 # Base repeat 40 | T = 1.0 # Time limit 41 | var t = timex(rep, R, s) # Always warm up cache 42 | while true: 43 | threadSingle: 44 | R = min(64*R,max(R,int(R.float*0.8/t))) # set up to run for at least 0.8 sec or 64*R 45 | t = timex(rep, R, s) 46 | threadSingle: T -= t 47 | if T < 0: break 48 | threadSingle: # Use the last R & t for performance measure 49 | printf("%8d %3d %d %-8s rep: %7d KB: %8.0f ms: %8.4f GF/s: %7.2f GB/s: %7.2f\n", 50 | N, vecLen, memLen, label, R, 1024*1024*mr, 1e3*t/R.float, fp*R.float/t, mt*R.float/t) 51 | 52 | threads: # CPU threads 53 | x := 0 # set them to diagonal matrices on CPU 54 | y := 1 55 | z := 2 56 | timeit "CPU": x += y * z 57 | 58 | timeit "GPU5": # includes kernel launching and synchronization 59 | onGpu(N, 32): # Number of threads, threads per block 60 | x += y * z 61 | timeit "GPU6": onGpu(N, 64): x += y * z 62 | timeit "GPU7": onGpu(N, 128): x += y * z 63 | 64 | threads: timeit "CPU": x += y * z # back to CPU threads again 65 | 66 | let scale = 0.5 / (sqrt(3.0) * rep.float) 67 | threads: 68 | x *= scale 69 | var n = x.norm2 70 | threadSingle: echo "# Final scaled x.norm2: ",n," rep: ",rep 71 | x.free 72 | y.free 73 | z.free 74 | 75 | for n in 8..26: 76 | staticFor v, 2, 7: 77 | 
when (1 shl v) >= (structsize(vectorizedElementType(float32)) div sizeof(float32)): 78 | staticFor ml, 1, 2: 79 | test(1 shl v, ml, 1 shl n) 80 | #+END_SRC 81 | 82 | The above can be compiled and run with 83 | 84 | #+BEGIN_SRC sh 85 | nim cpp -d:SSE -d:AVX -d:CPUVLEN=256 -d:release ex2 86 | #+END_SRC 87 | 88 | * Implementation details 89 | 90 | The main container object in the example above is an array that can live 91 | on the CPU and also the GPU. This is defined as 92 | 93 | #+BEGIN_SRC nim -n 94 | when useGPU: 95 | type 96 | ArrayObj*[V,M:static[int],T] = object 97 | p*: Coalesced[V,M,T] 98 | n*: int 99 | g*: GpuArrayObj[V,M,T] 100 | lastOnGpu*: bool 101 | unifiedMem*: bool 102 | mem:pointer ## Pointer to the allocated memory. 103 | else: 104 | type 105 | ArrayObj*[V,M:static[int],T] = object 106 | p*: Coalesced[V,M,T] 107 | n*: int 108 | mem:pointer ## Pointer to the allocated memory. 109 | 110 | type 111 | GpuArrayObj*[V,M:static[int],T] = object 112 | p*: Coalesced[V,M,T] 113 | n*: int 114 | 115 | type 116 | Coalesced*[V,M:static[int],T] = object 117 | ## `V`: Inner array length. 118 | ## `M`: Number of RegisterWords in a MemoryWord, the granularity of memory transactions. 
119 | p*: ptr T ## pointer to an array of T 120 | n*: int ## the length of the array being coalesced 121 | CoalescedObj[V,M:static[int],T] = object 122 | o*: Coalesced[V,M,T] 123 | i*: int # the index to which we asks 124 | 125 | template `[]`*(x:Coalesced, ix:int):untyped = CoalescedObj[x.V,x.M,x.T](o:x, i:ix) 126 | template len*(x:Coalesced):untyped = x.n 127 | 128 | template fromCoalesced*(x:CoalescedObj):untyped = 129 | const N = getSize(x.T) div (x.M*sizeof(RegisterWord)) 130 | type A {.unchecked.}= ptr array[0,MemoryWord(x.M)] 131 | var r {.noinit.}: x.T 132 | let offset = (x.i div x.V)*N*x.V + x.i mod x.V 133 | staticfor j, 0, N-1: cast[A](r.addr)[j] = cast[A](x.o.p)[offset + j*x.V] 134 | r 135 | 136 | type 137 | ShortVector*[V:static[int],E] = object 138 | a*:array[V,E] 139 | ShortVectorIndex* = distinct int 140 | VectorizedObj*[V,M:static[int],T] = object 141 | o*:Coalesced[V,M,T] 142 | i*:ShortVectorIndex 143 | 144 | template `[]`*(x:Coalesced, ix:ShortVectorIndex):untyped = VectorizedObj[x.V,x.M,x.T](o:x,i:ix) 145 | template veclen*(x:Coalesced):untyped = x.n div x.V 146 | #+END_SRC 147 | 148 | * CPU threads 149 | 150 | #+BEGIN_SRC nim -n 151 | import omp 152 | 153 | when defined(noOpenmp): 154 | template omp_set_num_threads*(x: cint) = discard 155 | template omp_get_num_threads*(): cint = 1 156 | template omp_get_max_threads*(): cint = 1 157 | template omp_get_thread_num*(): cint = 0 158 | template ompPragma(p:string):untyped = discard 159 | template setupGc = discard 160 | else: 161 | const OMPFlag {.strDefine.} = "-fopenmp" 162 | {. passC: OMPFlag .} 163 | {. passL: OMPFlag .} 164 | {. pragma: omp, header:"omp.h" .} 165 | proc omp_set_num_threads*(x: cint) {.omp.} 166 | proc omp_get_num_threads*(): cint {.omp.} 167 | proc omp_get_max_threads*(): cint {.omp.} 168 | proc omp_get_thread_num*(): cint {.omp.} 169 | template ompPragma(p:string):untyped = 170 | {. 
emit:"\n#pragma omp " & p .} 171 | template setupGc = 172 | if(omp_get_thread_num()!=0): setupForeignThreadGc() 173 | 174 | template ompBarrier* = ompPragma("barrier") 175 | template ompBlock(p:string; body:untyped):untyped = 176 | ompPragma(p) 177 | block: 178 | body 179 | 180 | template ompParallel*(body:untyped):untyped = 181 | ompBlock("parallel"): 182 | setupGc() 183 | body 184 | template ompMaster*(body:untyped):untyped = ompBlock("master", body) 185 | template ompSingle*(body:untyped):untyped = ompBlock("single", body) 186 | template ompCritical*(body:untyped):untyped = ompBlock("critical", body) 187 | #+END_SRC 188 | 189 | #+BEGIN_SRC nim -n 190 | template threads*(body:untyped):untyped = 191 | checkInit() 192 | let tidOld = threadNum 193 | let nidOld = numThreads 194 | let tlOld = threadLocals 195 | proc tproc{.genSym.} = 196 | var ts:seq[ThreadShare] 197 | ompParallel: 198 | threadNum = ompGetThreadNum() 199 | numThreads = ompGetNumThreads() 200 | if threadNum==0: ts.newSeq(numThreads) 201 | threadBarrierO() 202 | initThreadLocals(ts) 203 | body 204 | threadBarrierO() 205 | tproc() 206 | threadNum = tidOld 207 | numThreads = nidOld 208 | threadLocals = tlOld 209 | #+END_SRC 210 | 211 | * Offloading 212 | 213 | #+BEGIN_SRC nim -n 214 | template cudaDefs(body: untyped): untyped {.dirty.} = 215 | var gridDim{.global,importC,noDecl.}: CudaDim3 216 | var blockIdx{.global,importC,noDecl.}: CudaDim3 217 | var blockDim{.global,importC,noDecl.}: CudaDim3 218 | var threadIdx{.global,importC,noDecl.}: CudaDim3 219 | template getGridDim: untyped {.used.} = gridDim 220 | template getBlockIdx: untyped {.used.} = blockIdx 221 | template getBlockDim: untyped {.used.} = blockDim 222 | template getThreadIdx: untyped {.used.} = threadIdx 223 | template getThreadNum: untyped {.used.} = blockDim.x * blockIdx.x + threadIdx.x 224 | template getNumThreads: untyped {.used.} = gridDim.x * blockDim.x 225 | bind inlineProcs 226 | inlineProcs: 227 | body 228 | 229 | template 
cudaLaunch*(p: proc; blocksPerGrid,threadsPerBlock: SomeInteger; 230 | arg: varargs[pointer,dataAddr]) = 231 | var pp: proc = p 232 | var gridDim, blockDim: CudaDim3 233 | gridDim.x = blocksPerGrid 234 | gridDim.y = 1 235 | gridDim.z = 1 236 | blockDim.x = threadsPerBlock 237 | blockDim.y = 1 238 | blockDim.z = 1 239 | var args: array[arg.len, pointer] 240 | for i in 0..=0: 312 | let ii = newLit(i) 313 | return newCall(a,ii) 314 | of nnkCallKinds: r0 = 1 315 | of nnkDotExpr: r1 = 0 316 | of {nnkVarSection,nnkLetSection}: 317 | result = it.cpNimNode 318 | for c in it: 319 | result.add c.cpNimNode 320 | for i in 0..(c.len-3): 321 | ignore.add c[i] 322 | result[^1].add c[i].cpNimNode 323 | result[^1].add c[^2].cpNimNode 324 | result[^1].add recurse(c[^1], vars, a) 325 | return 326 | else: discard 327 | result = it.cpNimNode 328 | for i in 0..= (structsize(vectorizedElementType(float32)) div sizeof(float32)): 58 | staticFor ml, 1, 2: 59 | test(1 shl v, ml, 1 shl n) 60 | -------------------------------------------------------------------------------- /demo3/gpuarray.nim: -------------------------------------------------------------------------------- 1 | import coalesced 2 | 3 | when not declared(haveCuda): 4 | const haveCuda = true 5 | 6 | when haveCuda: 7 | import ../cuda 8 | 9 | import macros 10 | include system/ansi_c 11 | import linalg 12 | 13 | type 14 | GpuArrayObj*[V,M:static[int],T] = object 15 | p*: Coalesced[V,M,T] 16 | n*: int 17 | # GpuArrayRef*[V,M:static[int],T] = ref GpuArrayObj[V,M,T] 18 | # GpuArray*[V,M:static[int],T] = GpuArrayRef[V,M,T] 19 | # GpuArrays* = GpuArrayObj | GpuArrayRef 20 | # GpuArrays2* = GpuArrayObj | GpuArrayRef 21 | # GpuArrays3* = GpuArrayObj | GpuArrayRef 22 | 23 | # Nim Bug, cannot overload this function with generic static parameters. 
24 | # proc init*(r: var GpuArrayObj, n: int) = 25 | # type T = r.T 26 | # var p: ptr T 27 | # when haveCuda: 28 | # let err = cudaMalloc(cast[ptr pointer](addr p), n*sizeof(T)) 29 | # if err: 30 | # echo "alloc err: ", err 31 | # quit(-1) 32 | # else: 33 | # p = createSharedU(T, n) 34 | # r.n = n 35 | # r.p.newCoalesced(r.V, r.M, p, n) 36 | # proc init[V,M:static[int],T](r: var GpuArrayRef[V,M,T], n: int) = 37 | # r.new 38 | # r[].init(n) 39 | 40 | proc free*(r: var GpuArrayObj) = 41 | when haveCuda: discard r.p.p.cudaFree 42 | # proc free*[V,M:static[int],T](r: GpuArrayRef[V,M,T]) = 43 | # when haveCuda: discard r.p.p.cudaFree 44 | 45 | proc initGpuArrayObj*(r: var GpuArrayObj, n: int) = 46 | type T = r.T 47 | var p: ptr T 48 | when haveCuda: 49 | let err = cudaMalloc(cast[ptr pointer](addr p), n*sizeof(T)) 50 | if err: 51 | echo "alloc err: ", err 52 | quit(-1) 53 | else: 54 | p = createSharedU(T, n) 55 | r.n = n 56 | r.p.initCoalesced(p, n) 57 | # echo "GpuArray init done." 58 | proc newGpuArrayObj*(V,M:static[int], n:int, T:typedesc): auto {.noinit.} = 59 | var z {.noinit.}: GpuArrayObj[V,M,T] 60 | z.initGpuArrayObj(n) 61 | z 62 | 63 | # proc newGpuArrayRef*[V,M:static[int],T](r: var GpuArrayRef[V,M,T], n: int) = 64 | # r.init(n) 65 | # proc newGpuArrayRef*[T](V,M:static[int], n: int): auto {.noinit.} = 66 | # var z {.noinit.}: GpuArrayRef[V,M,T] 67 | # z.init(n) 68 | # z 69 | 70 | template getGpuPtr*(x: SomeNumber): untyped = x 71 | template getGpuPtr*(x: GpuArrayObj): untyped = x 72 | # template getGpuPtr*(x: GpuArrayRef): untyped = x[] 73 | #template getGpuPtr(x: GpuArrayRef): untyped = x.p 74 | #template getGpuPtr(x: GpuArrayRef): untyped = (p:x.p,n:x.n) 75 | 76 | template indexGpuArray*(x: GpuArrayObj, i: SomeInteger): untyped = 77 | x.p[i] 78 | 79 | macro indexGpuArray*(x: GpuArrayObj{call}, y: SomeInteger): untyped = 80 | #echo "call[", y.repr, "]" 81 | #echo x.treerepr 82 | #if siteLocalsField.contains($x[0]): 83 | result = newCall(ident($x[0])) 84 | 
for i in 1.. 4 | // Prepare to break the strict aliasing rule. 5 | typedef uint32_t __attribute__((__may_alias__,__aligned__(4))) RegisterWord; 6 | typedef struct MemoryWord1 {RegisterWord a[1];} __attribute__((__may_alias__,__aligned__(4))) MemoryWord1; 7 | typedef struct MemoryWord2 {RegisterWord a[2];} __attribute__((__may_alias__,__aligned__(8))) MemoryWord2; 8 | typedef struct MemoryWord4 {RegisterWord a[4];} __attribute__((__may_alias__,__aligned__(16))) MemoryWord4; 9 | typedef struct MemoryWord8 {RegisterWord a[8];} __attribute__((__may_alias__,__aligned__(32))) MemoryWord8; 10 | typedef struct MemoryWord16 {RegisterWord a[16];} __attribute__((__may_alias__,__aligned__(64))) MemoryWord16; 11 | typedef struct MemoryWord32 {RegisterWord a[32];} __attribute__((__may_alias__,__aligned__(128))) MemoryWord32; 12 | #endif//_CUDANIM_LLBITS_H_ 13 | -------------------------------------------------------------------------------- /demo3/out/bludhaven.info: -------------------------------------------------------------------------------- 1 | nvcc: NVIDIA (R) Cuda compiler driver 2 | Copyright (c) 2005-2015 NVIDIA Corporation 3 | Built on Tue_Aug_11_14:27:32_CDT_2015 4 | Cuda compilation tools, release 7.5, V7.5.17 5 | g++-4.8 (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4 6 | Copyright (C) 2013 Free Software Foundation, Inc. 7 | This is free software; see the source for copying conditions. There is NO 8 | warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
9 | 10 | processor : 0 11 | vendor_id : GenuineIntel 12 | cpu family : 6 13 | model : 58 14 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 15 | stepping : 9 16 | microcode : 0x12 17 | cpu MHz : 1600.000 18 | cache size : 8192 KB 19 | physical id : 0 20 | siblings : 8 21 | core id : 0 22 | cpu cores : 4 23 | apicid : 0 24 | initial apicid : 0 25 | fpu : yes 26 | fpu_exception : yes 27 | cpuid level : 13 28 | wp : yes 29 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 30 | bogomips : 6784.24 31 | clflush size : 64 32 | cache_alignment : 64 33 | address sizes : 36 bits physical, 48 bits virtual 34 | power management: 35 | 36 | processor : 1 37 | vendor_id : GenuineIntel 38 | cpu family : 6 39 | model : 58 40 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 41 | stepping : 9 42 | microcode : 0x12 43 | cpu MHz : 1600.000 44 | cache size : 8192 KB 45 | physical id : 0 46 | siblings : 8 47 | core id : 1 48 | cpu cores : 4 49 | apicid : 2 50 | initial apicid : 2 51 | fpu : yes 52 | fpu_exception : yes 53 | cpuid level : 13 54 | wp : yes 55 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep 
erms 56 | bogomips : 6784.24 57 | clflush size : 64 58 | cache_alignment : 64 59 | address sizes : 36 bits physical, 48 bits virtual 60 | power management: 61 | 62 | processor : 2 63 | vendor_id : GenuineIntel 64 | cpu family : 6 65 | model : 58 66 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 67 | stepping : 9 68 | microcode : 0x12 69 | cpu MHz : 3000.000 70 | cache size : 8192 KB 71 | physical id : 0 72 | siblings : 8 73 | core id : 2 74 | cpu cores : 4 75 | apicid : 4 76 | initial apicid : 4 77 | fpu : yes 78 | fpu_exception : yes 79 | cpuid level : 13 80 | wp : yes 81 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 82 | bogomips : 6784.24 83 | clflush size : 64 84 | cache_alignment : 64 85 | address sizes : 36 bits physical, 48 bits virtual 86 | power management: 87 | 88 | processor : 3 89 | vendor_id : GenuineIntel 90 | cpu family : 6 91 | model : 58 92 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 93 | stepping : 9 94 | microcode : 0x12 95 | cpu MHz : 1600.000 96 | cache size : 8192 KB 97 | physical id : 0 98 | siblings : 8 99 | core id : 3 100 | cpu cores : 4 101 | apicid : 6 102 | initial apicid : 6 103 | fpu : yes 104 | fpu_exception : yes 105 | cpuid level : 13 106 | wp : yes 107 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid 
sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 108 | bogomips : 6784.24 109 | clflush size : 64 110 | cache_alignment : 64 111 | address sizes : 36 bits physical, 48 bits virtual 112 | power management: 113 | 114 | processor : 4 115 | vendor_id : GenuineIntel 116 | cpu family : 6 117 | model : 58 118 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 119 | stepping : 9 120 | microcode : 0x12 121 | cpu MHz : 1600.000 122 | cache size : 8192 KB 123 | physical id : 0 124 | siblings : 8 125 | core id : 0 126 | cpu cores : 4 127 | apicid : 1 128 | initial apicid : 1 129 | fpu : yes 130 | fpu_exception : yes 131 | cpuid level : 13 132 | wp : yes 133 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 134 | bogomips : 6784.24 135 | clflush size : 64 136 | cache_alignment : 64 137 | address sizes : 36 bits physical, 48 bits virtual 138 | power management: 139 | 140 | processor : 5 141 | vendor_id : GenuineIntel 142 | cpu family : 6 143 | model : 58 144 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 145 | stepping : 9 146 | microcode : 0x12 147 | cpu MHz : 1600.000 148 | cache size : 8192 KB 149 | physical id : 0 150 | siblings : 8 151 | core id : 1 152 | cpu cores : 4 153 | apicid : 3 154 | initial apicid : 3 155 | fpu : yes 156 | fpu_exception : yes 157 | cpuid level : 13 158 | wp : yes 159 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse 
sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 160 | bogomips : 6784.24 161 | clflush size : 64 162 | cache_alignment : 64 163 | address sizes : 36 bits physical, 48 bits virtual 164 | power management: 165 | 166 | processor : 6 167 | vendor_id : GenuineIntel 168 | cpu family : 6 169 | model : 58 170 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 171 | stepping : 9 172 | microcode : 0x12 173 | cpu MHz : 1600.000 174 | cache size : 8192 KB 175 | physical id : 0 176 | siblings : 8 177 | core id : 2 178 | cpu cores : 4 179 | apicid : 5 180 | initial apicid : 5 181 | fpu : yes 182 | fpu_exception : yes 183 | cpuid level : 13 184 | wp : yes 185 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 186 | bogomips : 6784.24 187 | clflush size : 64 188 | cache_alignment : 64 189 | address sizes : 36 bits physical, 48 bits virtual 190 | power management: 191 | 192 | processor : 7 193 | vendor_id : GenuineIntel 194 | cpu family : 6 195 | model : 58 196 | model name : Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz 197 | stepping : 9 198 | microcode : 0x12 199 | cpu MHz : 1600.000 200 | cache size : 8192 KB 201 | physical id : 0 202 | siblings : 8 203 | core id : 3 204 | cpu cores : 4 205 | apicid : 7 206 | 
initial apicid : 7 207 | fpu : yes 208 | fpu_exception : yes 209 | cpuid level : 13 210 | wp : yes 211 | flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb xsaveopt pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms 212 | bogomips : 6784.24 213 | clflush size : 64 214 | cache_alignment : 64 215 | address sizes : 36 bits physical, 48 bits virtual 216 | power management: 217 | 218 | /homes/xjin/tmp/samples/1_Utilities/deviceQuery/deviceQuery Starting... 219 | 220 | CUDA Device Query (Runtime API) version (CUDART static linking) 221 | 222 | Detected 1 CUDA Capable device(s) 223 | 224 | Device 0: "Quadro 600" 225 | CUDA Driver Version / Runtime Version 8.0 / 7.5 226 | CUDA Capability Major/Minor version number: 2.1 227 | Total amount of global memory: 964 MBytes (1010761728 bytes) 228 | ( 2) Multiprocessors, ( 48) CUDA Cores/MP: 96 CUDA Cores 229 | GPU Max Clock rate: 1280 MHz (1.28 GHz) 230 | Memory Clock rate: 800 Mhz 231 | Memory Bus Width: 128-bit 232 | L2 Cache Size: 131072 bytes 233 | Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65535), 3D=(2048, 2048, 2048) 234 | Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers 235 | Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers 236 | Total amount of constant memory: 65536 bytes 237 | Total amount of shared memory per block: 49152 bytes 238 | Total number of registers available per block: 32768 239 | Warp size: 32 240 | Maximum number of threads per multiprocessor: 1536 241 | Maximum number of threads per block: 1024 242 | Max dimension size of a thread block (x,y,z): (1024, 1024, 64) 
243 | Max dimension size of a grid size (x,y,z): (65535, 65535, 65535) 244 | Maximum memory pitch: 2147483647 bytes 245 | Texture alignment: 512 bytes 246 | Concurrent copy and kernel execution: Yes with 1 copy engine(s) 247 | Run time limit on kernels: Yes 248 | Integrated GPU sharing Host Memory: No 249 | Support host page-locked memory mapping: Yes 250 | Alignment requirement for Surfaces: Yes 251 | Device has ECC support: Disabled 252 | Device supports Unified Addressing (UVA): Yes 253 | Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0 254 | Compute Mode: 255 | < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > 256 | 257 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 8.0, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = Quadro 600 258 | Result = PASS 259 | [CUDA Bandwidth Test] - Starting... 260 | Running on... 261 | 262 | Device 0: Quadro 600 263 | Quick Mode 264 | 265 | Host to Device Bandwidth, 1 Device(s) 266 | PINNED Memory Transfers 267 | Transfer Size (Bytes) Bandwidth(MB/s) 268 | 33554432 6488.5 269 | 270 | Device to Host Bandwidth, 1 Device(s) 271 | PINNED Memory Transfers 272 | Transfer Size (Bytes) Bandwidth(MB/s) 273 | 33554432 6483.5 274 | 275 | Device to Device Bandwidth, 1 Device(s) 276 | PINNED Memory Transfers 277 | Transfer Size (Bytes) Bandwidth(MB/s) 278 | 33554432 20514.2 279 | 280 | Result = PASS 281 | 282 | NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled. 
283 | -------------------------------------------------------------------------------- /demo3/qexLite/alignedMem.nim: -------------------------------------------------------------------------------- 1 | import strUtils 2 | import stdUtils 3 | 4 | type 5 | alignedMem*[T] = object 6 | len*: int 7 | align*: int 8 | stride*: int 9 | bytes*: int 10 | mem*: ref cArray[char] 11 | data*: ptr cArray[T] 12 | 13 | proc unsafeNewU*[T](a: var ref T, size: Natural) = 14 | {.emit: "N_NIMCALL(void*, newObjNoInit)(TNimType* typ0, NI size0);".} 15 | {.emit: "#define newObj newObjNoInit".} 16 | unsafeNew(a, size) 17 | {.emit: "#undef newObj".} 18 | 19 | proc ptrAlign[T](p:ptr T; a:int):ptr T = 20 | let x = cast[ByteAddress](p) 21 | let a1 = a - 1 22 | let y = x + (a1-((x+a1) mod a)) 23 | #echo x, ":", y 24 | result = cast[type(result)](y) 25 | 26 | proc new*[T](t:var alignedMem[T], n:int, align:int=64) = 27 | t.len = n 28 | t.align = align 29 | t.stride = sizeof(T) 30 | t.bytes = t.len * t.stride + t.align 31 | unsafeNew(t.mem, t.bytes) 32 | t.data = ptrAlign(cast[ptr cArray[T]](t.mem[0].addr), align) 33 | proc newU*[T](t:var alignedMem[T], n:int, align:int=64) = 34 | t.len = n 35 | t.align = align 36 | t.stride = sizeof(T) 37 | t.bytes = t.len * t.stride + t.align 38 | unsafeNewU(t.mem, t.bytes) 39 | t.data = ptrAlign(cast[ptr cArray[T]](t.mem[0].addr), align) 40 | proc newAlignedMem*[T](t:var alignedMem[T], n:int, align:int=64) = 41 | new(t, n, align) 42 | proc newAlignedMem*[T](n:int, align:int=64): alignedMem[T] = 43 | newAlignedMem[T](result, n, align) 44 | proc newAlignedMemU*[T](t:var alignedMem[T], n:int, align:int=64) = 45 | newU(t, n, align) 46 | proc newAlignedMemU*[T](n:int, align:int=64): alignedMem[T] = 47 | newAlignedMemU[T](result, n, align) 48 | 49 | template low*(s:alignedMem):untyped = 0 50 | template high*(s:alignedMem):untyped = s.len-1 51 | proc `[]`*[T](s:alignedMem[T], i:SomeInteger):var T = 52 | result = s.data[i] 53 | #template `[]`*[T](s:alignedMem[T], 
i:SomeInteger):untyped = s.data[i] 54 | template `[]`*[T](s:var alignedMem[T], i:SomeInteger):untyped = s.data[i] 55 | template `[]=`*[T](s:var alignedMem[T], i:SomeInteger, v:untyped) = 56 | s.data[i] = v 57 | 58 | when isMainModule: 59 | var x: alignedMem[float] 60 | newAlignedMem(x, 10) 61 | let c0 = cast[ByteAddress](x.mem[0].addr) 62 | echo c0, " ", toHex(c0,8) 63 | let x0 = cast[ByteAddress](x[0].addr) 64 | echo x0, " ", toHex(x0,8) 65 | 66 | for i in x.low..x.high: 67 | x[i] = float(i) 68 | for i in 0..".} 34 | #proc printfOrdered( 35 | macro printf*(fmt:string; args:varargs[untyped]):auto = 36 | var call = newCall(ident("cprintf"), fmt) 37 | result = evalArgs(call, args) 38 | result.add(quote do: 39 | if myRank==0 and threadNum==0: 40 | `call` 41 | ) 42 | proc echoRaw*(x: varargs[typed, `$`]) {.magic: "Echo".} 43 | macro echoAll*(args:varargs[untyped]):auto = 44 | var call = newCall(bindSym"echoRaw") 45 | result = evalArgs(call, args) 46 | result.add(quote do: 47 | `call` 48 | ) 49 | macro echoRank*(args:varargs[untyped]):auto = 50 | var call = newCall(bindSym"echoRaw") 51 | call.add ident"myRank" 52 | call.add newLit"/" 53 | call.add ident"nRanks" 54 | call.add newLit": " 55 | result = evalArgs(call, args) 56 | template f(x:untyped):untyped = 57 | if threadNum==0: x 58 | result.add getAst(f(call)) 59 | macro echo0*(args: varargs[untyped]): auto = 60 | var call = newCall(bindSym"echoRaw") 61 | result = evalArgs(call, args) 62 | result.add(quote do: 63 | if myRank==0 and threadNum==0: 64 | `call` 65 | ) 66 | macro makeEchos(n:static[int]):auto = 67 | template ech(x,y: untyped): untyped = 68 | template echo*(): untyped = 69 | when nimvm: 70 | x 71 | else: 72 | y 73 | result = newStmtList() 74 | var er = newCall(bindSym"echoRaw") 75 | var e0 = newCall(bindSym"echo0") 76 | var ea = newSeq[NimNode](0) 77 | for i in 1..n: 78 | let ai = ident("a" & $i) 79 | er.add ai 80 | e0.add ai 81 | ea.add 
newNimNode(nnkIdentDefs).add(ai).add(ident"untyped").add(newEmptyNode()) 82 | var t = getAst(ech(er,e0)) 83 | #echo t.treerepr 84 | for j in 0.. 2 | #include 3 | #include "qlayout.h" 4 | 5 | #define myalloc malloc 6 | #define PRINTV(s,f,v,n) do { printf(s); \ 7 | for(int _i=0; _ibegin = 0; 14 | s->end = l->nSites; 15 | s->beginOuter = 0; 16 | s->endOuter = l->nSitesOuter; 17 | if(sub[0]=='e') { 18 | s->end = l->nEven; 19 | s->endOuter = l->nEvenOuter; 20 | } else if(sub[0]=='o') { 21 | s->begin = l->nOdd; 22 | s->beginOuter = l->nOddOuter; 23 | } 24 | } 25 | 26 | void 27 | layoutSetup(Layout *l) 28 | { 29 | int nd = l->nDim; 30 | l->outerGeom = myalloc(nd*sizeof(int)); 31 | l->localGeom = myalloc(nd*sizeof(int)); 32 | int pvol=1, lvol=1, ovol=1, icb=0, icbd=-1; 33 | for(int i=0; ilocalGeom[i] = l->physGeom[i]/l->rankGeom[i]; 35 | l->outerGeom[i] = l->localGeom[i]/l->innerGeom[i]; 36 | pvol *= l->physGeom[i]; 37 | lvol *= l->localGeom[i]; 38 | ovol *= l->outerGeom[i]; 39 | if(l->innerGeom[i]>1 && (l->outerGeom[i]&1)==1) icb++; 40 | if(l->innerGeom[i]==1 && (l->outerGeom[i]&1)==0) icbd = i; 41 | } 42 | if(icb==0) { 43 | icbd = 0; 44 | } else { 45 | if(icbd<0) { 46 | if(l->myrank==0) { 47 | printf("not enough 2's in localGeom\n"); 48 | PRINTV("physGeom:", "%i", l->physGeom, nd); 49 | PRINTV("rankGeom:", "%i", l->rankGeom, nd); 50 | PRINTV("localGeom:", "%i", l->localGeom, nd); 51 | PRINTV("outerGeom:", "%i", l->outerGeom, nd); 52 | PRINTV("innerGeom:", "%i", l->innerGeom, nd); 53 | } 54 | exit(-1); 55 | } 56 | icb = l->outerGeom[icbd]/2; 57 | if((icb&1)==0) { 58 | if(l->myrank==0) { 59 | printf("error in cb choice\n"); 60 | PRINTV("physGeom:", "%i", l->physGeom, nd); 61 | PRINTV("rankGeom:", "%i", l->rankGeom, nd); 62 | PRINTV("localGeom:", "%i", l->localGeom, nd); 63 | PRINTV("outerGeom:", "%i", l->outerGeom, nd); 64 | PRINTV("innerGeom:", "%i", l->innerGeom, nd); 65 | printf("innerCb: %i\n", icb); 66 | printf("innerCbDir: %i\n", icbd); 67 | } 68 | exit(-1); 69 | } 
70 | } 71 | l->physVol = pvol; 72 | l->nSites = lvol; 73 | l->nOdd = lvol/2; 74 | l->nEven = lvol - l->nOdd; 75 | l->nSitesOuter = ovol; 76 | l->nOddOuter = ovol/2; 77 | l->nEvenOuter = ovol - l->nOddOuter; 78 | l->nSitesInner = l->nSites/l->nSitesOuter; 79 | l->innerCb = icb; 80 | l->innerCbDir = icbd; 81 | if(l->myrank==0) { 82 | printf("#innerCb: %i\n", icb); 83 | printf("#innerCbDir: %i\n", icbd); 84 | } 85 | } 86 | 87 | static void 88 | lex_x(int *x, int l, int *s, int ndim) 89 | { 90 | for(int i=0; i=0; --i) { 92 | x[i] = l % s[i]; 93 | l = l / s[i]; 94 | } 95 | } 96 | 97 | // x[0] is fastest 98 | static int 99 | lex_i(int *x, int *s, int *d, int ndim) 100 | { 101 | int l = 0; 102 | for(int i=ndim-1; i>=0; --i) { 103 | int xx = x[i]; 104 | if(d) xx /= d[i]; 105 | l = l*s[i] + (xx%s[i]); 106 | } 107 | return l; 108 | } 109 | 110 | #if 0 111 | // x[0] is slowest 112 | static int 113 | lexr_i(int *x, int *s, int *d, int ndim) 114 | { 115 | int l = 0; 116 | for(int i=0; inDim; 129 | int ri = lex_i(coords, l->rankGeom, l->localGeom, nd); 130 | int ii = lex_i(coords, l->innerGeom, l->outerGeom, nd); 131 | int ib = 0; 132 | for(int i=0; iouterGeom[i]; 134 | int li = xi % l->innerGeom[i]; 135 | ib += li * l->outerGeom[i]; 136 | } 137 | ib &= 1; 138 | coords[l->innerCbDir] += l->innerCb * ib; 139 | int oi = lex_i(coords, l->outerGeom, NULL, nd); 140 | coords[l->innerCbDir] -= l->innerCb * ib; 141 | int p = 0; 142 | for(int i=0; inSitesOuter)/2; 145 | li->rank = ri; 146 | li->index = oi2*l->nSitesInner + ii; 147 | } 148 | 149 | void 150 | layoutCoord(Layout *l, int *coords, LayoutIndex *li) 151 | { 152 | int nd = l->nDim; 153 | int cr[nd]; 154 | lex_x(cr, li->rank, l->rankGeom, nd); 155 | int p = 0; 156 | int ll = li->index % l->nSitesInner; 157 | int ib = 0; 158 | for(int i=0; iinnerGeom[i]; 160 | int wl = l->outerGeom[i]; 161 | int k = ll % w; 162 | int c = l->localGeom[i]*cr[i] + k*wl; 163 | cr[i] = c; 164 | //printf("cr[%i]: %i\n", i, c); 165 | p += c; 166 | ll = 
ll / w; 167 | ib += k*wl; 168 | } 169 | ib &= 1; 170 | int ii = li->index / l->nSitesInner; 171 | if(ii>=l->nEvenOuter) { 172 | ii -= l->nEvenOuter; 173 | p++; 174 | } 175 | ii *= 2; 176 | for(int i=0; iouterGeom[i]; 178 | int k = ii % wl; 179 | if(i==l->innerCbDir) k = (k + l->innerCb * ib)%wl; 180 | coords[i] = k; 181 | //printf("coords[%i]: %i\n", i, k); 182 | p += k; 183 | ii = ii / wl; 184 | } 185 | if(p&1) { 186 | for(int i=0; iouterGeom[i]; 188 | if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl; 189 | coords[i]++; 190 | if(coords[i]>=wl) { 191 | coords[i] = 0; 192 | if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl; 193 | } else { 194 | if(i==l->innerCbDir) coords[i] = (coords[i] + l->innerCb * ib)%wl; 195 | break; 196 | } 197 | } 198 | } 199 | for(int i=0; irank!=li2.rank ||li->index!=li2.index) { 204 | printf("error: bad coord:\n"); 205 | printf(" %i,%i -> %i %i %i %i -> %i,%i\n", li->rank, li->index, 206 | coords[0],coords[1],coords[2],coords[3], li2.rank, li2.index); 207 | exit(-1); 208 | } 209 | } 210 | } 211 | 212 | void 213 | layoutShift(Layout *l, LayoutIndex *li, LayoutIndex *li2, int *disp) 214 | { 215 | int nd = l->nDim; 216 | int x[nd]; 217 | layoutCoord(l, x, li2); 218 | for(int i=0; iphysGeom[i])%l->physGeom[i]; 220 | } 221 | layoutIndex(l, li, x); 222 | } 223 | -------------------------------------------------------------------------------- /demo3/qexLite/layout/qlayout.h: -------------------------------------------------------------------------------- 1 | #include "qmp.h" 2 | 3 | typedef struct llist { 4 | void *value; 5 | struct llist *next; 6 | } llist; 7 | 8 | // k = sum_i sum_j ((x[i]/d[i][j])%m[i][j])*f[i][j] 9 | // x[i] = sum_j ((k/f[i][j])%m[i][j])*d[i][j] 10 | // parity? 
11 | 12 | typedef struct { 13 | int nDim; 14 | int *physGeom; 15 | int *rankGeom; 16 | int *innerGeom; //wrap 17 | int *outerGeom; //wls 18 | int *localGeom; 19 | int physVol; 20 | int nEven; 21 | int nOdd; 22 | int nSites; 23 | int nEvenOuter; 24 | int nOddOuter; 25 | int nSitesOuter; 26 | int nSitesInner; 27 | int innerCb; 28 | int innerCbDir; 29 | llist *shifts; 30 | int nranks; 31 | int myrank; 32 | } Layout; 33 | 34 | typedef struct { 35 | int rank; 36 | int index; 37 | } LayoutIndex; 38 | 39 | typedef struct { 40 | int begin; 41 | int end; 42 | int beginOuter; 43 | int endOuter; 44 | } Subset; 45 | 46 | void layoutSetup(Layout *l); 47 | void layoutIndex(Layout *l, LayoutIndex *li, int coords[]); 48 | void layoutCoord(Layout *l, int coords[], LayoutIndex *li); 49 | void layoutShift(Layout *l, LayoutIndex *li, LayoutIndex *li2, int disp[]); 50 | void layoutSubset(Subset *s, Layout *l, char *sub); 51 | 52 | typedef struct { 53 | int myRank; 54 | int nIndices; 55 | int *srcRanks; 56 | int *srcIndices; 57 | int nRecvDests; 58 | int nSendIndices; 59 | int *sendSrcIndices; 60 | int *sendDestRanks; 61 | int *sendDestIndices; 62 | } GatherDescription; 63 | 64 | typedef struct { 65 | GatherDescription *gd; 66 | int myRank; 67 | int nIndices; 68 | int *srcIndices; 69 | int nRecvRanks; 70 | int *recvRanks; 71 | int *recvRankSizes; 72 | int *recvRankOffsets; 73 | int recvSize; 74 | int nRecvDests; 75 | int *recvDestIndices; 76 | int *recvBufIndices; 77 | int nSendRanks; 78 | int *sendRanks; 79 | int *sendRankSizes; 80 | int *sendRankOffsets; 81 | int sendSize; // same as nSendIndices 82 | int nSendIndices; 83 | int *sendIndices; 84 | } GatherIndices; 85 | 86 | // per gather: 87 | // pidx 88 | // recv 89 | // combined: 90 | // send* 91 | 92 | typedef struct { 93 | GatherIndices *gi; 94 | int *disp; 95 | int *sidx; 96 | int *pidx; 97 | int nRecvRanks; 98 | int *recvRanks; 99 | int *recvRankSizes; 100 | int *recvRankSizes1; 101 | int *recvRankOffsets; 102 | int 
*recvRankOffsets1; 103 | int nRecvSites; 104 | int nRecvSites1; 105 | int nRecvDests; 106 | int *recvDests; 107 | int *recvLocalSrcs; 108 | int *recvRemoteSrcs; 109 | int nSendRanks; 110 | int *sendRanks; 111 | int *sendRankSizes; 112 | int *sendRankSizes1; 113 | int *sendRankOffsets; 114 | int *sendRankOffsets1; 115 | int nSendSites; 116 | int nSendSites1; 117 | int *sendSites; 118 | int vv; 119 | //int offr, lenr, nthreads; 120 | int perm; 121 | int pack; 122 | int blend; 123 | //QMP_msgmem_t sqmpmem; 124 | //QMP_msghandle_t smsg; 125 | //QMP_msgmem_t rqmpmem; 126 | //QMP_msghandle_t rmsg; 127 | //QMP_msghandle_t pairmsg; 128 | } ShiftIndices; 129 | 130 | typedef struct { 131 | QMP_msgmem_t sqmpmem; 132 | QMP_msghandle_t smsg; 133 | QMP_msgmem_t rqmpmem; 134 | QMP_msghandle_t rmsg; 135 | QMP_msghandle_t pairmsg; 136 | char *sbuf; 137 | char *rbuf; 138 | int sbufSize; 139 | int rbufSize; 140 | int first; 141 | int *offr; 142 | int *lenr; 143 | int *nthreads; 144 | } ShiftBuf; 145 | 146 | typedef void GatherMap(int *srcRank, int *srcIdx, int dstRank, int *dstIdx, void *args); 147 | -------------------------------------------------------------------------------- /demo3/qexLite/metaUtils.nim: -------------------------------------------------------------------------------- 1 | import macros 2 | import strUtils 3 | 4 | proc symToIdent*(x: NimNode): NimNode = 5 | case x.kind: 6 | of nnkCharLit..nnkUInt64Lit: 7 | result = newNimNode(x.kind) 8 | result.intVal = x.intVal 9 | of nnkFloatLit..nnkFloat64Lit: 10 | result = newNimNode(x.kind) 11 | result.floatVal = x.floatVal 12 | of nnkStrLit..nnkTripleStrLit: 13 | result = newNimNode(x.kind) 14 | result.strVal = x.strVal 15 | of nnkIdent, nnkSym: 16 | result = newIdentNode($x) 17 | of nnkOpenSymChoice: 18 | result = newIdentNode($x[0]) 19 | else: 20 | result = newNimNode(x.kind) 21 | for c in x: 22 | result.add symToIdent(c) 23 | 24 | macro getConst*(x: static[int]): auto = 25 | return newLit(x) 26 | #macro getConst*(x: 
typed): auto = 27 | #echo x.treerepr 28 | #result = newLit(3) 29 | #result = newLit(x.intVal) 30 | 31 | macro delayExpansion*(x:untyped):auto = result = x 32 | 33 | macro `$`*(t:typedesc):auto = 34 | result = newLit(t.getType[1].repr) 35 | 36 | macro echoType*(x:typed):auto = 37 | result = newEmptyNode() 38 | let t1 = x.getType 39 | echo t1.treeRepr 40 | echo t1.getType.treeRepr 41 | macro echoType*(x:typedesc):auto = 42 | result = newEmptyNode() 43 | let t1 = x.getType 44 | echo t1.treeRepr 45 | echo t1[1].getType.treeRepr 46 | 47 | macro treerep*(x:typed):auto = 48 | echo x.treeRepr 49 | newEmptyNode() 50 | 51 | macro echoAst*(x:untyped):untyped = 52 | echo x.treeRepr 53 | x 54 | 55 | #template dump*(x:untyped):untyped = 56 | # echo $(x) 57 | # echo astToStr(x) 58 | # echo repr(x) 59 | macro dump*(x:untyped):untyped = 60 | let s = x[0].strVal 61 | #echo s 62 | let v = parseExpr(s) 63 | #echo v.treeRepr 64 | #echo v.toStrLit.treeRepr 65 | result = quote do: 66 | echo `x`, ": ", `v` 67 | 68 | macro toId*(s:static[string]):untyped = 69 | echo s 70 | newIdentNode(!s) 71 | 72 | macro toId*(s:typed):untyped = 73 | echo s.treeRepr 74 | #newIdentNode(!s) 75 | 76 | macro toString*(id:untyped):untyped = 77 | #echo id.repr 78 | echo id.treeRepr 79 | if id.kind==nnkSym: 80 | result = newLit($id) 81 | else: 82 | result = newLit($id[0]) 83 | 84 | macro catId*(x:varargs[untyped]):auto = 85 | #echo x.repr 86 | var s = "" 87 | for i in 0..0: 22 | let b = ident("Simd" & $P & $m) 23 | let l = n div m 24 | result.add getAst(tryArray(t,newLit(l),b)) 25 | m = m div 2 26 | #echo result.repr 27 | 28 | makeArray(D, 16) 29 | makeArray(D, 8) 30 | makeArray(D, 4) 31 | 32 | #when defined(SSE): 33 | #proc toDoubleA*(x:SimdS4):array[2,SimdD2] {.inline,noInit.} = 34 | # result[0] = mm_cvtps_pd(x) 35 | # var y{.noInit.}:SimdS4 36 | # perm2(y, x) 37 | # result[1] = mm_cvtps_pd(y) 38 | 39 | when defined(AVX): 40 | when not defined(AVX512): 41 | proc toDouble*(x:SimdS8):SimdD8 {.inline,noInit.} = 
42 | #result = SimdD8(toDoubleA(x)) 43 | result := toDoubleA(x) 44 | 45 | #when declared(SimdS4): 46 | # proc toDouble*(x:SimdS4):SimdD4 {.inline,noInit.} = 47 | # result = SimdD4(toDoubleA(x)) 48 | # proc inorm2*(r:var SimdD4; x:SimdS4) {.inline.} = 49 | # let y = toDouble(x) 50 | # inorm2(r, y) 51 | 52 | when declared(SimdS8): 53 | #proc toDouble*(x:SimdS8):SimdD8 {.inline,noInit.} = 54 | # result = SimdD8(toDoubleA(x)) 55 | proc inorm2*(r:var SimdD8; x:SimdS8) {.inline.} = 56 | var xx{.noInit.} = toDouble(x) 57 | inorm2(r, xx) 58 | proc imadd*(r:var SimdD8; x,y:SimdS8) {.inline.} = 59 | var xx{.noInit.} = toDouble(x) 60 | var yy{.noInit.} = toDouble(y) 61 | imadd(r, xx, yy) 62 | proc imsub*(r:var SimdD8; x,y:SimdS8) {.inline.} = 63 | let xd = toDouble(x) 64 | let yd = toDouble(y) 65 | imsub(r, xd, yd) 66 | 67 | when declared(SimdS16): 68 | proc toDouble*(x:SimdS16):SimdD16 {.inline,noInit.} = 69 | #for i in 0..15: result[i] = float64(x[i]) 70 | result = SimdD16(v: toDoubleA(x)) 71 | proc inorm2*(r:var SimdD16; x:SimdS16) {.inline.} = inorm2(r, toDouble(x)) 72 | proc imadd*(r:var SimdD16; x,y:SimdS16) {.inline.} = 73 | var xx{.noInit.} = toDouble(x) 74 | var yy{.noInit.} = toDouble(y) 75 | imadd(r, xx, yy) 76 | proc imsub*(r:var SimdD16; x,y:SimdS16) {.inline.} = 77 | let xd = toDouble(x) 78 | let yd = toDouble(y) 79 | imsub(r, xd, yd) 80 | 81 | when declared(SimdD4): 82 | template toDouble*(x:SimdD4):untyped = x 83 | when declared(SimdD8): 84 | template toDouble*(x:SimdD8):untyped = x 85 | when declared(SimdD16): 86 | template toDouble*(x:SimdD16):untyped = x 87 | 88 | when isMainModule: 89 | var s8:SimdS8 90 | assign(s8, [0,1,2,3,4,5,6,7]) 91 | var d8 = toDouble(s8) 92 | echo d8 93 | inorm2(d8, s8) 94 | echo d8 95 | -------------------------------------------------------------------------------- /demo3/qexLite/simd/simdX86Ops.nim: -------------------------------------------------------------------------------- 1 | {. 
deadCodeElim: on .} 2 | 3 | import simdX86Types 4 | import simdSse 5 | import simdAvx 6 | import simdAvx512 7 | #import ../basicOps 8 | # import base 9 | # import math 10 | import macros 11 | 12 | template binaryMixed(T,op1,op2:untyped):untyped = 13 | template op1*(x:SomeNumber; y:T):T = op2(x.to(T),y) 14 | template op1*(x:T; y:SomeNumber):T = op2(x,y.to(T)) 15 | template unaryMixedVar(T,op1,op2:untyped):untyped = 16 | template op1*(r:var T; x:SomeNumber) = op2(r,x.to(T)) 17 | template binaryMixedVar(T,op1,op2:untyped):untyped = 18 | template op1*(r:var T; x:SomeNumber; y:T) = op2(r,x.to(T),y) 19 | template op1*(r:var T; x:T; y:SomeNumber) = op2(r,x,y.to(T)) 20 | template trinaryMixedVar(T,op1,op2:untyped):untyped = 21 | template op1*(r:var T; x:SomeNumber; y:T; z:T) = op2(r,x.to(T),y,z) 22 | template op1*(r:var T; x:T; y:SomeNumber; z:T) = op2(r,x,y.to(T),z) 23 | template op1*(r:var T; x:T; y:T; z:SomeNumber) = op2(r,x,y,z.to(T)) 24 | template map1(T,N,op:untyped):untyped {.dirty.} = 25 | proc op*(x:T):T {.inline,noInit.} = 26 | let t = x.toArray 27 | var r{.noInit.}:type(t) 28 | for i in 0.. 
$#"%[s, $o, $p.n] 63 | template CLIset*(p:typed, n:untyped, prefix = "") = 64 | p.CLIset n, prefix: 65 | discard 66 | 67 | template `$&`*(x: untyped): string = 68 | toHex(unsafeAddrInt(x)) 69 | 70 | proc `|`*(s: string, d: tuple[w:int,c:char]): string = 71 | let p = abs(d.w) - len(s) 72 | let pad = if p>0: repeat(d.c, p) else: "" 73 | if d.w >= 0: 74 | result = pad & s 75 | else: 76 | result = s & pad 77 | proc `|`*(s: string, d: int): string = 78 | s | (d,' ') 79 | proc `|`*(x: int, d: int): string = 80 | ($x) | d 81 | proc `|`*(f: float, d: tuple[w,p: int]): string = 82 | if d.p<0: 83 | formatFloat(f, ffDecimal, -d.p) | d.w 84 | else: 85 | formatFloat(f, ffDefault, d.p) | d.w 86 | proc `|`*(f: float, d: int): string = 87 | f | (d,d) 88 | template `|-`*(x:SomeNumber, y: int): untyped = 89 | x | -y 90 | 91 | proc indexOf*[T](x: openArray[T], y: any): int = 92 | let n = x.len 93 | while result1: 190 | var k = 2 191 | if (x and 1) != 0: 192 | k = 3 193 | while (x mod k) != 0: k += 2 194 | result.add k 195 | x = x div k 196 | 197 | 198 | when isMainModule: 199 | #[ 200 | proc test(n:int) = 201 | declareVla(x, float, n) 202 | let n2 = n div 2 203 | block: 204 | declareVla(y, float, n2) 205 | #{.emit:"""printf("%p\n", &x[0]);""".} 206 | x[0] = 1 207 | echo x[0] 208 | echo x.len 209 | echo y.len 210 | test(10) 211 | test(20) 212 | ]# 213 | 214 | template testFactor(n: int) = 215 | echo "factor(", n, ") = ", factor(n) 216 | for i in -2..20: 217 | testFactor(i) 218 | -------------------------------------------------------------------------------- /demo3/qexLite/threading.nim: -------------------------------------------------------------------------------- 1 | import times 2 | import strUtils 3 | import stdUtils 4 | import macros 5 | import omp 6 | import metaUtils 7 | 8 | type 9 | ThreadShare* = object 10 | p*:pointer 11 | counter*:int 12 | ThreadObj* = object 13 | threadNum*:int 14 | numThreads*:int 15 | share*:ptr cArray[ThreadShare] 16 | 17 | var 
threadNum*{.threadvar.}:int 18 | var numThreads*{.threadvar.}:int 19 | var threadLocals*{.threadvar.}:ThreadObj 20 | var inited = false 21 | 22 | template initThreadLocals*(ts:seq[ThreadShare]):untyped = 23 | threadLocals.threadNum = threadNum 24 | threadLocals.numThreads = numThreads 25 | threadLocals.share = cast[ptr cArray[ThreadShare]](ts[0].addr) 26 | threadLocals.share[threadNum].p = nil 27 | threadLocals.share[threadNum].counter = 0 28 | proc init = 29 | inited = true 30 | threadNum = 0 31 | numThreads = 1 32 | var ts = newSeq[ThreadShare](numThreads) 33 | initThreadLocals(ts) 34 | template threadsInit* = 35 | if not inited: 36 | init() 37 | template checkInit* = 38 | threadsInit() 39 | #if not inited: 40 | #let ii = instantiationInfo() 41 | #let ln = ii.line 42 | #let fn = ii.filename[0 .. ^5] 43 | #echo format("error: $#($#): threads not initialized",fn,ln) 44 | #quit(-1) 45 | 46 | template threads*(body:untyped):untyped = 47 | checkInit() 48 | let tidOld = threadNum 49 | let nidOld = numThreads 50 | let tlOld = threadLocals 51 | #proc tproc2{.genSym,inline.} = 52 | # body 53 | proc tproc{.genSym.} = 54 | var ts:seq[ThreadShare] 55 | ompParallel: 56 | threadNum = ompGetThreadNum() 57 | numThreads = ompGetNumThreads() 58 | if threadNum==0: ts.newSeq(numThreads) 59 | threadBarrierO() 60 | initThreadLocals(ts) 61 | #echoAll threadNum, " s: ", ptrInt(threadLocals.share) 62 | body 63 | #tproc2() 64 | threadBarrierO() 65 | tproc() 66 | threadNum = tidOld 67 | numThreads = nidOld 68 | threadLocals = tlOld 69 | template threads*(x0:untyped;body:untyped):untyped = 70 | checkInit() 71 | let tidOld = threadNum 72 | let nidOld = numThreads 73 | let tlOld = threadLocals 74 | proc tproc(xx:var type(x0)) {.genSym.} = 75 | var ts:seq[ThreadShare] 76 | ompParallel: 77 | threadNum = ompGetThreadNum() 78 | numThreads = ompGetNumThreads() 79 | if threadNum==0: ts.newSeq(numThreads) 80 | threadBarrierO() 81 | initThreadLocals(ts) 82 | #echoAll threadNum, " s: ", 
ptrInt(threadLocals.share) 83 | subst(x0,xx): 84 | body 85 | tproc(x0) 86 | threadNum = tidOld 87 | numThreads = nidOld 88 | threadLocals = tlOld 89 | 90 | template getMaxThreads*() = ompGetMaxThreads() 91 | template threadBarrierO* = ompBarrier 92 | template threadMaster*(x:untyped) = ompMaster(x) 93 | template threadSingle*(x:untyped) = ompSingle(x) 94 | template threadCritical*(x:untyped) = ompCritical(x) 95 | 96 | template threadDivideLow*(x,y: untyped): untyped = 97 | x + (threadNum*(y-x)) div numThreads 98 | template threadDivideHigh*(x,y: untyped): untyped = 99 | x + ((threadNum+1)*(y-x)) div numThreads 100 | 101 | 102 | proc tForX*(index,i0,i1,body:NimNode):NimNode = 103 | return quote do: 104 | let d = 1+`i1` - `i0` 105 | let ti0 = `i0` + (threadNum*d) div numThreads 106 | let ti1 = `i0` + ((threadNum+1)*d) div numThreads 107 | for `index` in ti0 ..< ti1: 108 | `body` 109 | macro tFor*(index,i0,i1: untyped; body: untyped): untyped = 110 | result = tForX(index, i0, i1, body) 111 | macro tFor*(index: untyped; slice: Slice; body: untyped): untyped = 112 | #echo index.treeRepr 113 | #echo treeRepr(slice) 114 | var i0,i1: NimNode 115 | #echo slice.kind 116 | if slice.kind == nnkStmtListExpr: 117 | i0 = slice[1][1] 118 | i1 = slice[1][2] 119 | else: 120 | i0 = slice[1] 121 | i1 = slice[2] 122 | result = tForX(index, i0, i1, body) 123 | 124 | discard """ 125 | iterator `.|`*[S, T](a: S, b: T): T {.inline.} = 126 | mixin threadNum 127 | var d = b - T(a) 128 | var res = T(a) + (threadNum*d) div numThreads 129 | var bb = T(a) + ((threadNum+1)*d) div numThreads 130 | while res <= bb: 131 | yield res 132 | inc(res) 133 | """ 134 | 135 | template t0wait* = threadBarrier() 136 | template t0waitX* = 137 | if threadNum==0: 138 | inc threadLocals.share[0].counter 139 | let tbar0 = threadLocals.share[0].counter 140 | for b in 1..= tbar0: break 144 | else: 145 | inc threadLocals.share[threadNum].counter 146 | #fence() 147 | 148 | template twait0* = threadBarrier() 149 | 
template twait0X* = 150 | if threadNum==0: 151 | inc threadLocals.share[0].counter 152 | #fence() 153 | else: 154 | inc threadLocals.share[threadNum].counter 155 | let tbar0 = threadLocals.share[threadNum].counter 156 | let p{.volatile.} = threadLocals.share[0].counter.addr 157 | while true: 158 | if p[] >= tbar0: break 159 | 160 | template threadBarrier* = 161 | #t0wait 162 | #twait0 163 | ompBarrier 164 | 165 | macro threadSum*(a:varargs[untyped]):auto = 166 | #echo a.treeRepr 167 | result = newNimNode(nnkStmtList) 168 | var sum = newNimNode(nnkStmtList) 169 | let tid = ident("threadNum") 170 | let nid = ident("numThreads") 171 | let p = newLit(1) 172 | for i in 0..=0: 41 | let ii = newLit(i) 42 | return newCall(a,ii) 43 | of nnkCallKinds: r0 = 1 44 | of nnkDotExpr: r1 = 0 45 | of {nnkVarSection,nnkLetSection}: 46 | result = it.cpNimNode 47 | for c in it: 48 | result.add c.cpNimNode 49 | for i in 0..(c.len-3): 50 | ignore.add c[i] 51 | result[^1].add c[i].cpNimNode 52 | result[^1].add c[^2].cpNimNode 53 | result[^1].add recurse(c[^1], vars, a) 54 | return 55 | else: discard 56 | #echo it.treerepr 57 | result = it.cpNimNode 58 | for i in 0..>(a,n) 35 | 36 | timesTwo<<(blocksPerGrid,threadsPerBlock)>>(b,n) 37 | -------------------------------------------------------------------------------- /opts.c2nim: -------------------------------------------------------------------------------- 1 | #def __global__ 2 | -------------------------------------------------------------------------------- /runc2nim: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | exit 3 | 4 | f="vectorAdd.cu" 5 | g="vectorAdd.cup" 6 | 7 | cat $f |sed 's/\([a-zA-Z0-9_]*\)<<>>(/,/' >$g 8 | c2nim opts.c2nim $g 9 | -------------------------------------------------------------------------------- /test/config.nims: -------------------------------------------------------------------------------- 1 | --path:".." 
2 | -------------------------------------------------------------------------------- /test/test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf nimcache 4 | 5 | if (($#>0));then 6 | declare -ar Ts=( "$@" ) 7 | else 8 | declare -ar Ts=( $(echo t*.nim) ) 9 | fi 10 | logfile(){ printf "out/%s.log" "${Ts[$1]}"; } 11 | [[ -d out ]] || mkdir out 12 | declare -i j ret 13 | declare -ai F 14 | for ((j=0;j<${#Ts[@]};++j));do 15 | printf "Testing: % 6d / %d" $j "${#Ts[@]}" 16 | nim c -r "${Ts[j]}" > "$(logfile $j)" 2>&1 17 | ret=$? 18 | if ((ret!=0));then 19 | printf '\rFail: %s\n' "${Ts[j]}" 20 | F+=( $j ) 21 | else 22 | rm "$(logfile $j)" 23 | printf '\r%60s\r' ' ' 24 | fi 25 | rm -f "${Ts[j]%.nim}" 26 | done 27 | echo 28 | echo "Total: ${#Ts[@]}" 29 | echo "Passed: $((${#Ts[@]}-${#F[@]}))" 30 | if ((${#F[@]}>0));then 31 | echo "Failed: ${#F[@]}" 32 | echo 33 | echo "Check log file(s):" 34 | for j in ${F[@]};do 35 | echo "$(logfile $j)" 36 | done 37 | else 38 | rm -rf out 39 | fi 40 | 41 | rm -rf nimcache 42 | -------------------------------------------------------------------------------- /test/tinline000.nim: -------------------------------------------------------------------------------- 1 | import inline 2 | 3 | proc f1(r: var any; x: any) = r = 2*x 4 | proc f2(x: any): auto = 2*x 5 | 6 | proc a1(x: float) = 7 | inlineProcs: 8 | var r: float 9 | var s: type(r) 10 | f1(r, x) 11 | proc a2(x: float) = 12 | inlineProcs: 13 | var r = f2(x) 14 | 15 | echo "* Basics" 16 | a1(1.0) 17 | a2(1.0) 18 | -------------------------------------------------------------------------------- /test/tinline001.nim: -------------------------------------------------------------------------------- 1 | import inline 2 | 3 | echo "* multiple iterators" 4 | type T = array[3,float] 5 | proc loop(x:var T, y:T) = 6 | echo "loop" 7 | let n = 3.0 8 | for k in 0..0: s &= " , " 9 | s &= $x[i] 10 | echo "x = [ ",s," ] has size ",N*sizeof(T) 11 
| block: 12 | inlineProcs: 13 | var v = [0,1,2,3] 14 | g v 15 | -------------------------------------------------------------------------------- /test/tinline006.nim: -------------------------------------------------------------------------------- 1 | import inline 2 | 3 | echo "* object construction" 4 | proc oc(x:int):auto = 5 | type A = object 6 | x:int 7 | return A(x:x) 8 | block: 9 | inlineProcs: 10 | var x = 3 11 | echo oc(x).x 12 | -------------------------------------------------------------------------------- /test/tinline007.nim: -------------------------------------------------------------------------------- 1 | import inline 2 | 3 | proc g[T;N:static[int]](x:array[N,T]) = 4 | var s = "" 5 | for i in 0..0: s &= " , " 7 | s &= $x[i] 8 | echo "x = [ ",s," ] has size ",N*sizeof(T) 9 | echo "* Types with generic parameters" 10 | proc gt[T] = 11 | type 12 | M[N:static[int]] = object 13 | d:array[N,T] 14 | var A:M[3] 15 | proc g[N:static[int]](x:M[N]) = x.d.g 16 | proc `[]`[N:static[int]](x:M[N],i:int):T = x.d[i] 17 | proc `[]=`[N:static[int]](x:var M[N],i:int,y:T) = x.d[i] = y 18 | inlineProcs: 19 | for i in 0..