├── sgc ├── .rvmrc ├── .gitignore ├── lib │ ├── rubycuda.rb │ ├── rubycu.rb │ ├── cuda │ │ ├── driver │ │ │ ├── extconf.rb │ │ │ ├── rubycu.o │ │ │ ├── rubycu.bundle │ │ │ ├── mkmf.log │ │ │ ├── Makefile │ │ │ └── rubycu.cpp │ │ ├── runtime │ │ │ ├── rubycuda.rb │ │ │ ├── version.rb │ │ │ ├── error.rb │ │ │ ├── thread.rb │ │ │ ├── cuda.rb │ │ │ ├── memory.rb │ │ │ ├── stream.rb │ │ │ ├── device.rb │ │ │ ├── event.rb │ │ │ ├── function.rb │ │ │ └── ffi-cuda.rb │ │ └── ruby │ │ │ └── cu.rb │ ├── madison │ │ ├── kernel │ │ │ ├── kernel.h │ │ │ ├── libkernel.so │ │ │ ├── libkernel.10.so │ │ │ ├── kernel.cu │ │ │ └── test.cu │ │ ├── matrix.rb │ │ └── comparable.rb │ ├── ffi │ │ └── prettystruct.rb │ └── memory │ │ ├── interface │ │ ├── ipointer.rb │ │ └── ibuffer.rb │ │ ├── pointer.rb │ │ └── buffer.rb ├── visualize.sh └── visualize.gp └── .gitignore /sgc/.rvmrc: -------------------------------------------------------------------------------- 1 | rvm 1.9.2 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | */libkernel.so 2 | -------------------------------------------------------------------------------- /sgc/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.png 3 | a.out 4 | -------------------------------------------------------------------------------- /sgc/lib/rubycuda.rb: -------------------------------------------------------------------------------- 1 | require 'cuda/runtime/rubycuda' 2 | -------------------------------------------------------------------------------- /sgc/lib/rubycu.rb: -------------------------------------------------------------------------------- 1 | require 'cuda/driver/rubycu' 2 | require 'cuda/ruby/cu' 3 | -------------------------------------------------------------------------------- /sgc/lib/cuda/driver/extconf.rb: 
-------------------------------------------------------------------------------- 1 | require 'mkmf' 2 | have_library("cuda") 3 | create_makefile("rubycu") 4 | -------------------------------------------------------------------------------- /sgc/lib/cuda/driver/rubycu.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/cuda/driver/rubycu.o -------------------------------------------------------------------------------- /sgc/lib/madison/kernel/kernel.h: -------------------------------------------------------------------------------- 1 | #define DIMENSIONS 10 2 | #define BLOCK_SIZE 16 3 | #define CLUSTER_SIZE 16 4 | -------------------------------------------------------------------------------- /sgc/lib/cuda/driver/rubycu.bundle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/cuda/driver/rubycu.bundle -------------------------------------------------------------------------------- /sgc/lib/madison/kernel/libkernel.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/madison/kernel/libkernel.so -------------------------------------------------------------------------------- /sgc/lib/madison/kernel/libkernel.10.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/madison/kernel/libkernel.10.so -------------------------------------------------------------------------------- /sgc/visualize.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # visualize.sh 3 | 4 | cat result.csv|sort -n -k1 -k2 > result_sorted.csv 5 | awk '{ print;if ((NR % 512) == 0) printf("\n");}' result_sorted.csv > result_sorted_final.csv 6 | gnuplot visualize.gp ; open heatmaps.png -------------------------------------------------------------------------------- /sgc/lib/ffi/prettystruct.rb: -------------------------------------------------------------------------------- 1 | require 'ffi' 2 | 3 | 4 | module FFI 5 | 6 | # This class is obtained from ffi-tk (https://github.com/Tass/ffi-tk). 7 | class PrettyStruct < FFI::Struct 8 | ACCESSOR_CODE = <<-CODE 9 | def {name}; self[{sym}]; end 10 | def {name}=(value) self[{sym}] = value; end 11 | CODE 12 | 13 | def self.layout(*kvs) 14 | kvs.each_slice(2) do |key, value| 15 | eval ACCESSOR_CODE.gsub(/\{(.*?)\}/, '{name}' => key, '{sym}' => ":#{key}") 16 | end 17 | 18 | super 19 | end 20 | 21 | def inspect 22 | kvs = members.zip(values) 23 | kvs.map!{|key, value| "%s=%s" % [key, value.inspect] } 24 | "<%s %s>" % [self.class, kvs.join(' ')] 25 | end 26 | end 27 | 28 | end # module 29 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/rubycuda.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 
15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/cuda' 27 | require 'cuda/runtime/error' 28 | require 'cuda/runtime/version' 29 | require 'cuda/runtime/device' 30 | require 'cuda/runtime/thread' 31 | require 'cuda/runtime/memory' 32 | require 'cuda/runtime/function' 33 | require 'cuda/runtime/stream' 34 | require 'cuda/runtime/event' 35 | -------------------------------------------------------------------------------- /sgc/lib/memory/interface/ipointer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | module SGC 26 | module Memory 27 | 28 | module IMemoryPointer 29 | 30 | def initialize(value = nil); end 31 | 32 | def ptr; raise NotImplementedError; end 33 | def ptr=(value); raise NotImplementedError; end 34 | def offset(index); raise NotImplementedError; end 35 | def ref; raise NotImplementedError; end 36 | 37 | end 38 | 39 | end # module 40 | end # module 41 | -------------------------------------------------------------------------------- /sgc/visualize.gp: -------------------------------------------------------------------------------- 1 | set terminal png transparent nocrop enhanced font arial 8 size 1000, 1000 2 | set output 'heatmaps.png' 3 | unset key 4 | set view map 5 | set style data linespoints 6 | set xtics border in scale 0,0 mirror norotate offset character 0, 0, 0 7 | set ytics border in scale 0,0 mirror norotate offset character 0, 0, 0 8 | set ztics border in scale 0,0 nomirror norotate offset character 0, 0, 0 9 | set nocbtics 10 | set title "Heat Map generated by 'plot' from a stream of XYZ values\nNB: Rows must be separated by blank lines!" 
11 | set rrange [ * : * ] noreverse nowriteback # (currently [8.98847e+307:-8.98847e+307] ) 12 | set trange [ * : * ] noreverse nowriteback # (currently [-5.00000:5.00000] ) 13 | set urange [ * : * ] noreverse nowriteback # (currently [-5.00000:5.00000] ) 14 | set vrange [ * : * ] noreverse nowriteback # (currently [-5.00000:5.00000] ) 15 | set xrange [ -0.5 : * ] noreverse nowriteback 16 | set x2range [ * : * ] noreverse nowriteback # (currently [-0.500000:4.50000] ) 17 | set yrange [ -0.5 : * ] noreverse nowriteback 18 | set y2range [ * : * ] noreverse nowriteback # (currently [-0.500000:4.50000] ) 19 | set zrange [ 0.0 : 1.0 ] noreverse nowriteback # (currently [0.00000:5.00000] ) 20 | set cblabel "Score" 21 | set cbrange [ 0.00000 : * ] noreverse nowriteback 22 | set palette rgbformulae -7, 2, -7 23 | plot 'result_sorted_final.csv' using 2:1:3 with image 24 | -------------------------------------------------------------------------------- /sgc/lib/madison/kernel/kernel.cu: -------------------------------------------------------------------------------- 1 | extern "C" { 2 | #include "kernel.h" 3 | 4 | __global__ void MatPopulate(float *A, int count) 5 | { 6 | int row = blockIdx.x; 7 | int col = threadIdx.x; 8 | A[row * DIMENSIONS + col] = (float)(row * DIMENSIONS + col)/(DIMENSIONS*count); 9 | } 10 | 11 | float score(float *A, float *B){ 12 | float score = 0.0; 13 | for(int i=0; i. 
23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | 27 | 28 | module SGC 29 | module Cuda 30 | 31 | def driver_version 32 | p = FFI::MemoryPointer.new(:int) 33 | status = API::cudaDriverGetVersion(p) 34 | Pvt::handle_error(status) 35 | p.read_int 36 | end 37 | module_function :driver_version 38 | 39 | 40 | def runtime_version 41 | p = FFI::MemoryPointer.new(:int) 42 | status = API::cudaRuntimeGetVersion(p) 43 | Pvt::handle_error(status) 44 | p.read_int 45 | end 46 | module_function :runtime_version 47 | 48 | end # module 49 | end # module 50 | -------------------------------------------------------------------------------- /sgc/lib/memory/interface/ibuffer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'memory/interface/ipointer' 26 | 27 | 28 | module SGC 29 | module Memory 30 | 31 | module IBuffer 32 | 33 | include IMemoryPointer 34 | 35 | def initialize(type, size); end 36 | 37 | def [](index); raise NotImplementedError; end 38 | def []=(index, value); raise NotImplementedError; end 39 | def size; raise NotImplementedError; end 40 | def element_size; raise NotImplementedError; end 41 | 42 | module ClassMethods 43 | def element_size(type); raise NotImplementedError; end 44 | end 45 | 46 | def self.included(base) 47 | base.extend(ClassMethods) 48 | end 49 | 50 | end 51 | 52 | end # module 53 | end # module 54 | -------------------------------------------------------------------------------- /sgc/lib/memory/pointer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'ffi' 26 | require 'memory/interface/ipointer' 27 | 28 | 29 | module SGC 30 | module Memory 31 | 32 | class MemoryPointer 33 | 34 | include IMemoryPointer 35 | 36 | 37 | def initialize(v = nil) 38 | @p = FFI::MemoryPointer.new(:pointer) 39 | @p.write_pointer(v) 40 | end 41 | 42 | 43 | def ptr 44 | @p.read_pointer 45 | end 46 | 47 | 48 | def ptr=(v) 49 | @p.write_pointer(v) 50 | v 51 | end 52 | 53 | 54 | def offset(i) 55 | MemoryPointer.new(@p.read_pointer.to_i + i) 56 | end 57 | 58 | 59 | def ref 60 | @p 61 | end 62 | 63 | end 64 | 65 | end # module 66 | end # module 67 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | 27 | 28 | module SGC 29 | module Cuda 30 | 31 | def get_error_string(e) 32 | API::cudaGetErrorString(e) 33 | end 34 | module_function :get_error_string 35 | 36 | 37 | def get_last_error 38 | API::cudaGetLastError 39 | end 40 | module_function :get_last_error 41 | 42 | 43 | def peek_at_last_error 44 | API::cudaPeekAtLastError 45 | end 46 | module_function :peek_at_last_error 47 | 48 | module Pvt 49 | 50 | CUDA_SUCCESS = API::CudaError[:cudaSuccess] 51 | CUDA_ERROR_NOT_READY = API::CudaError[:cudaErrorNotReady] 52 | 53 | def self.handle_error(status) 54 | status == CUDA_SUCCESS or raise API::cudaGetErrorString(status) 55 | nil 56 | end 57 | 58 | end 59 | 60 | end # module 61 | end # module 62 | -------------------------------------------------------------------------------- /sgc/lib/madison/matrix.rb: -------------------------------------------------------------------------------- 1 | module Madison 2 | require 'rubycuda' 3 | require 'madison/comparable' 4 | 5 | 6 | class Dimension 7 | # A vectors dimension key => value 8 | attr_accessor :i, :j 9 | 10 | def initialize matrix, i, j 11 | @matrix = matrix 12 | @i = i 13 | @j = j 14 | end 15 | 16 | def value= value 17 | @matrix.values[@i*@matrix.vectors_dimension + @j] = value 18 | end 19 | 20 | def key= value 21 | @matrix.keys[@i*@matrix.vectors_dimension + @j] = value 22 | end 23 | 24 | def value 25 | @matrix.values[@i*@matrix.vectors_dimension + @j] 26 | end 27 | 28 | def key 29 | @matrix.keys[@i*@matrix.vectors_dimension + @j] 30 | end 31 | 32 | def inspect 33 | "# #{value}>" 34 | end 35 | end 36 | 37 | class Matrix 38 | include SGC::Cuda 39 | include Madison::Comparable 40 | 41 | attr_reader :vectors_dimension 42 | attr_reader :count 43 | attr_reader :size 44 | attr_accessor :keys, :values 45 | 46 | def initialize type, vectors_count, vectors_dimension 47 | @last_id = 0 48 | @count = vectors_count 49 | @vectors_dimension = vectors_dimension 50 | @size = 
vectors_count * vectors_dimension 51 | @type = type 52 | @type_size = Buffer.element_size(type) 53 | @dimensions = Hash.new{|h, k| h[k] = {}} 54 | 55 | # the matrix used to store the vector dimensions values 56 | @values = Buffer.new(type, @size) 57 | 58 | # the matrix used to store the vector dimensions keys 59 | @keys = Buffer.new(:int, @size) 60 | end 61 | 62 | def inspect 63 | "#" 64 | end 65 | 66 | def dimensions i, j 67 | @dimensions[i][j] ||= Dimension.new self, i, j 68 | end 69 | 70 | def << vector 71 | raise "Already full" unless @last_id <= @count 72 | (0...[vector.size, @vectors_dimension].min).each do |k| 73 | dimensions(@last_id, k).value = vector.values[k] 74 | dimensions(@last_id, k).key = vector.keys[k].hash 75 | end 76 | @last_id += 1 77 | self 78 | end 79 | end 80 | end -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/thread.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/cuda' 27 | require 'cuda/runtime/error' 28 | 29 | 30 | module SGC 31 | module Cuda 32 | 33 | class CudaThread 34 | 35 | def self.exit 36 | status = API::cudaThreadExit 37 | Pvt::handle_error(status) 38 | self 39 | end 40 | 41 | 42 | def self.cache_config 43 | p = FFI::MemoryPointer.new(:int) 44 | status = API::cudaThreadGetCacheConfig(p) 45 | Pvt::handle_error(status) 46 | CudaFuncCache[p.read_int] 47 | end 48 | 49 | 50 | def self.cache_config=(config) 51 | status = API::cudaThreadSetCacheConfig(config) 52 | Pvt::handle_error(status) 53 | config 54 | end 55 | 56 | 57 | def self.limit(limit) 58 | p = FFI::MemoryPointer.new(:size_t) 59 | status = API::cudaThreadGetLimit(p, limit) 60 | Pvt::handle_error(status) 61 | p.read_long 62 | end 63 | 64 | 65 | def self.limit=(*limit_value_pair) 66 | limit, value = limit_value_pair.flatten 67 | status = API::cudaThreadSetLimit(limit, value) 68 | Pvt::handle_error(status) 69 | limit_value_pair 70 | end 71 | 72 | 73 | def self.synchronize 74 | status = API::cudaThreadSynchronize 75 | Pvt::handle_error(status) 76 | self 77 | end 78 | 79 | end 80 | 81 | end # module 82 | end # module 83 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/cuda.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 
15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'memory/buffer' 27 | 28 | 29 | module SGC 30 | module Cuda 31 | 32 | CudaError_t = CudaError = API::CudaError 33 | CudaDeviceFlags = API::CudaDeviceFlags 34 | CudaEventFlags = API::CudaEventFlags 35 | CudaHostAllocFlags = API::CudaHostAllocFlags 36 | CudaArrayFlags = API::CudaArrayFlags 37 | CudaMemcpyKind = API::CudaMemcpyKind 38 | CudaChannelFormatKind = API::CudaChannelFormatKind 39 | CudaFuncCache = API::CudaFuncCache 40 | CudaLimit = API::CudaLimit 41 | CudaComputeMode = API::CudaComputeMode 42 | CudaSurfaceBoundaryMode = API::CudaSurfaceBoundaryMode 43 | CudaSurfaceFormatMode = API::CudaSurfaceFormatMode 44 | CudaTextureAddressMode = API::CudaTextureAddressMode 45 | CudaTextureFilterMode = API::CudaTextureFilterMode 46 | CudaTextureReadMode = API::CudaTextureReadMode 47 | 48 | Dim3 = API::Dim3 49 | CudaDeviceProp = API::CudaDeviceProp 50 | CudaFuncAttributes = API::CudaFuncAttributes 51 | CudaChannelFormatDesc = API::CudaChannelFormatDesc 52 | CudaPitchedPtr = API::CudaPitchedPtr 53 | CudaPos = API::CudaPos 54 | CudaExtent = API::CudaExtent 55 | CudaMemcpy3DParms = API::CudaMemcpy3DParms 56 | TextureReference = API::TextureReference 57 | SurfaceReference = API::SurfaceReference 58 | 59 | Buffer = SGC::Memory::Buffer 60 | 61 | end # module 62 | end # module 63 | -------------------------------------------------------------------------------- /sgc/lib/cuda/driver/mkmf.log: -------------------------------------------------------------------------------- 1 | have_library: checking for main() in -lcuda... 
-------------------- no 2 | 3 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long -fno-common -pipe conftest.c -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib -lruby.1.9.1-static -lpthread -ldl -lobjc " 4 | checked program was: 5 | /* begin */ 6 | 1: #include "ruby.h" 7 | 2: 8 | 3: int main() {return 0;} 9 | /* end */ 10 | 11 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long -fno-common -pipe conftest.c -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib -lruby.1.9.1-static -lcuda -lpthread -ldl -lobjc " 12 | ld: library not found for -lcuda 13 | collect2: ld returned 1 exit status 14 | checked program was: 15 | /* begin */ 16 | 1: #include "ruby.h" 17 | 2: 18 | 3: /*top*/ 19 | 4: int main() {return 0;} 20 | 5: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; } 21 | /* end */ 22 | 23 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. 
-D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long -fno-common -pipe conftest.c -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib -lruby.1.9.1-static -lcuda -lpthread -ldl -lobjc " 24 | ld: library not found for -lcuda 25 | collect2: ld returned 1 exit status 26 | checked program was: 27 | /* begin */ 28 | 1: #include "ruby.h" 29 | 2: 30 | 3: /*top*/ 31 | 4: int main() {return 0;} 32 | 5: int t() { main(); return 0; } 33 | /* end */ 34 | 35 | -------------------- 36 | 37 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/memory.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/error' 27 | require 'memory/pointer' 28 | 29 | 30 | module SGC 31 | module Cuda 32 | 33 | class CudaDeviceMemory 34 | 35 | def self.malloc(nbytes) 36 | p = SGC::Memory::MemoryPointer.new 37 | status = API::cudaMalloc(p.ref, nbytes) 38 | Pvt::handle_error(status) 39 | p 40 | end 41 | 42 | 43 | def self.free(devptr) 44 | status = API::cudaFree(devptr.ptr) 45 | Pvt::handle_error(status) 46 | nil 47 | end 48 | 49 | end 50 | 51 | 52 | module CudaMemory 53 | 54 | def memcpy(dst_ptr, src_ptr, nbytes, memcpy_kind) 55 | status = API::cudaMemcpy(dst_ptr.ptr, src_ptr.ptr, nbytes, memcpy_kind) 56 | Pvt::handle_error(status) 57 | end 58 | module_function :memcpy 59 | 60 | def memcpy_htoh(dst_ptr, src_ptr, nbytes) 61 | memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyHostToHost) 62 | end 63 | module_function :memcpy_htoh 64 | 65 | def memcpy_htod(dst_ptr, src_ptr, nbytes) 66 | memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyHostToDevice) 67 | end 68 | module_function :memcpy_htod 69 | 70 | def memcpy_dtoh(dst_ptr, src_ptr, nbytes) 71 | memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyDeviceToHost) 72 | end 73 | module_function :memcpy_dtoh 74 | 75 | def memcpy_dtod(dst_ptr, src_ptr, nbytes) 76 | memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyDeviceToDevice) 77 | end 78 | module_function :memcpy_dtod 79 | 80 | end 81 | 82 | end # module 83 | end # module 84 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/stream.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 
10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/error' 27 | 28 | 29 | module SGC 30 | module Cuda 31 | 32 | class CudaStream 33 | 34 | def initialize 35 | @p = FFI::MemoryPointer.new(:pointer) 36 | end 37 | 38 | 39 | def create 40 | status = API::cudaStreamCreate(@p) 41 | Pvt::handle_error(status) 42 | self 43 | end 44 | 45 | 46 | def destroy 47 | status = API::cudaStreamDestroy(@p.read_pointer) 48 | Pvt::handle_error(status) 49 | @p.write_pointer(0) 50 | nil 51 | end 52 | 53 | 54 | def query 55 | status = API::cudaStreamQuery(@p.read_pointer) 56 | if status == Pvt::CUDA_SUCCESS 57 | return true 58 | elsif status == Pvt::CUDA_ERROR_NOT_READY 59 | return false 60 | end 61 | Pvt::hanld_error(status) 62 | self 63 | end 64 | 65 | 66 | def synchronize 67 | status = API::cudaStreamSynchronize(@p.read_pointer) 68 | Pvt::handle_error(status) 69 | self 70 | end 71 | 72 | 73 | def wait_event(event, flags = 0) 74 | status = API::cudaStreamWaitEvent(@p.read_pointer, event, flags) 75 | Pvt::handle_error(status) 76 | self 77 | end 78 | 79 | 80 | def self.wait_event(event, flags = 0) 81 | p = FFI::MemoryPointer.new(:pointer) 82 | p.write_pointer(0) 83 | status = API::cudaStreamWaitEvent(p.read_pointer, event, flags) 84 | Pvt::handle_error(status) 85 | self 86 | end 87 | 88 | def to_ptr 89 | 
@p.read_pointer 90 | end 91 | 92 | end 93 | 94 | end # module 95 | end # module 96 | -------------------------------------------------------------------------------- /sgc/lib/memory/buffer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'ffi' 26 | 27 | require 'memory/interface/ibuffer' 28 | require 'memory/pointer' 29 | 30 | 31 | module SGC 32 | module Memory 33 | 34 | class Buffer 35 | 36 | include IBuffer 37 | 38 | 39 | def initialize(type, size) 40 | @@reads[type] && @@writes[type] or raise "Invalid buffer element type." 
41 | 42 | @reader = @@reads[type] 43 | @writer = @@writes[type] 44 | @ptr = FFI::MemoryPointer.new(type, size) 45 | @size = size 46 | end 47 | 48 | 49 | def [](i) 50 | assert_index(i) 51 | @ptr[i].send(@reader) 52 | end 53 | 54 | 55 | def []=(i, v) 56 | assert_index(i) 57 | @ptr[i].send(@writer, v) 58 | v 59 | end 60 | 61 | 62 | def size 63 | @size 64 | end 65 | 66 | 67 | def element_size 68 | @ptr.type_size 69 | end 70 | 71 | 72 | def ptr 73 | @ptr 74 | end 75 | 76 | 77 | def offset(i) 78 | assert_index(i) 79 | MemoryPointer.new(@ptr[i]) 80 | end 81 | 82 | 83 | def self.element_size(type) 84 | @@sizes[type] 85 | end 86 | 87 | protected 88 | 89 | def assert_index(i) 90 | i >= 0 && i < @size or raise IndexError, "Invalid index to buffer: index = #{i}. Expect index in 0..#{@size-1}" 91 | end 92 | 93 | @@reads = { int: :read_int, long: :read_long, float: :read_float } 94 | @@writes = { int: :write_int, long: :write_long, float: :write_float } 95 | @@sizes = { int: 4, long: FFI::TypeDefs[:long].size, float: 4 } 96 | 97 | end 98 | 99 | end # module 100 | end # module 101 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/device.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 
15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/error' 27 | 28 | 29 | module SGC 30 | module Cuda 31 | 32 | class CudaDevice 33 | 34 | def self.count 35 | p = FFI::MemoryPointer.new(:int) 36 | status = API::cudaGetDeviceCount(p) 37 | Pvt::handle_error(status) 38 | p.read_int 39 | end 40 | 41 | 42 | def self.get 43 | p = FFI::MemoryPointer.new(:int) 44 | status = API::cudaGetDevice(p) 45 | Pvt::handle_error(status) 46 | p.read_int 47 | end 48 | class << self; alias_method :current, :get; end 49 | 50 | 51 | def self.set(devid) 52 | status = API::cudaSetDevice(devid) 53 | Pvt::handle_error(status) 54 | self 55 | end 56 | class << self; alias_method :current=, :set; end 57 | 58 | 59 | def self.choose(prop) 60 | pdev = FFI::MemoryPointer.new(:int) 61 | status = API::cudaChooseDevice(pdev, prop.to_ptr) 62 | Pvt::handle_error(status) 63 | pdev.read_int 64 | end 65 | 66 | 67 | def self.properties(devid = self.get) 68 | prop = API::CudaDeviceProp.new 69 | status = API::cudaGetDeviceProperties(prop.to_ptr, devid) 70 | Pvt::handle_error(status) 71 | prop 72 | end 73 | 74 | 75 | def self.flags=(flags) 76 | if flags.is_a?(Symbol) 77 | flags = CudaDeviceFlags[flags] 78 | end 79 | 80 | status = API::cudaSetDeviceFlags(flags) 81 | Pvt::handle_error(status) 82 | flags 83 | end 84 | 85 | 86 | def self.valid_devices=(devs) 87 | p = FFI::MemoryPointer.new(:int, devs.count) 88 | devs.each_with_index do |devid, i| 89 | p[i].write_int(devid) 90 | end 91 | status = API::cudaSetValidDevices(p, devs.count) 92 | Pvt::handle_error(status) 93 | devs 94 | end 95 | 96 | end 97 | 
98 | end # module 99 | end # module 100 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/event.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
#

require 'cuda/runtime/ffi-cuda'
require 'cuda/runtime/error'


module SGC
module Cuda

    # Wraps a CUDA runtime event (cudaEvent_t) held in an FFI pointer cell.
    class CudaEvent

        def initialize
            @p = FFI::MemoryPointer.new(:pointer)
        end


        # Create the underlying CUDA event.
        # @param flags [Integer, Symbol] a CudaEventFlags value or symbol.
        # @return [CudaEvent] self.
        def create(flags = CUDA_EVENT_DEFAULT)
            if flags == CUDA_EVENT_DEFAULT
                status = API::cudaEventCreate(@p)
            else
                flags = CudaEventFlags[flags] if flags.is_a?(Symbol)
                status = API::cudaEventCreateWithFlags(@p, flags)
            end
            Pvt::handle_error(status)
            self
        end


        # Destroy the underlying CUDA event and null out the handle.
        # @return [nil]
        def destroy
            status = API::cudaEventDestroy(@p.read_pointer)
            Pvt::handle_error(status)
            @p.write_pointer(0)
            nil
        end


        # @return [Boolean] true if the event has completed, false if it is
        #     not ready yet. Any other status raises via Pvt::handle_error.
        def query
            status = API::cudaEventQuery(@p.read_pointer)
            if status == Pvt::CUDA_SUCCESS
                return true
            # BUG FIX: the constant was misspelled CUDA_ERROR_NOT_READ, so a
            # pending event raised NameError instead of returning false.
            elsif status == Pvt::CUDA_ERROR_NOT_READY
                return false
            end
            Pvt::handle_error(status)
            self
        end


        # Record this event in _stream_ (0 selects the default/null stream).
        # @return [CudaEvent] self.
        def record(stream = 0)
            if stream == 0
                p = FFI::MemoryPointer.new(:pointer)
                p.write_pointer(0)
                stream = p.read_pointer
            else
                stream = stream.to_ptr
            end
            status = API::cudaEventRecord(@p.read_pointer, stream)
            Pvt::handle_error(status)
            self
        end


        # Block until the event has completed.
        # @return [CudaEvent] self.
        def synchronize
            status = API::cudaEventSynchronize(@p.read_pointer)
            Pvt::handle_error(status)
            self
        end


        # @return [FFI::Pointer] the raw cudaEvent_t handle.
        def to_ptr
            @p.read_pointer
        end


        # Elapsed time between two recorded events, in milliseconds.
        def self.elapsed_time(event_start, event_end)
            t = FFI::MemoryPointer.new(:float)
            status = API::cudaEventElapsedTime(t, event_start.to_ptr, event_end.to_ptr)
            # BUG FIX: the status was previously discarded, silently returning
            # an uninitialized float when the call failed (e.g. events not yet
            # recorded) instead of raising like every other wrapper here.
            Pvt::handle_error(status)
            t.read_float
        end

    protected

        CUDA_EVENT_DEFAULT = CudaEventFlags[:cudaEventDefault]

    end

end # module
end # module
# Copyright (c) 2010 Chung Shin Yee 2 | # 3 | # shinyee@speedgocomputing.com 4 | # http://www.speedgocomputing.com 5 | # http://github.com/xman/sgc-ruby-cuda 6 | # http://rubyforge.org/projects/rubycuda 7 | # 8 | # This file is part of SGC-Ruby-CUDA. 9 | # 10 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 11 | # it under the terms of the GNU General Public License as published by 12 | # the Free Software Foundation, either version 3 of the License, or 13 | # (at your option) any later version. 14 | # 15 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | # 20 | # You should have received a copy of the GNU General Public License 21 | # along with SGC-Ruby-CUDA. If not, see . 22 | 23 | 24 | module SGC 25 | module CU 26 | 27 | 28 | class CUDevice 29 | 30 | # See CUDevice::get_count. 31 | def self.count 32 | self.get_count 33 | end 34 | 35 | # See CUDevice#get_name. 36 | def name 37 | get_name 38 | end 39 | 40 | # See CUDevice#get_attribute. 41 | def attribute(attr) 42 | get_attribute(attr) 43 | end 44 | 45 | # See CUDevice#get_properties. 46 | def properties 47 | get_properties 48 | end 49 | 50 | end 51 | 52 | 53 | class CUContext 54 | 55 | # See CUContext::get_device. 56 | def self.device 57 | self.get_device 58 | end 59 | 60 | # See CUContext::get_limit. 61 | def self.limit(lim) 62 | get_limit(lim) 63 | end 64 | 65 | # See CUContext::get_cache_config. 66 | def self.cache_config 67 | get_cache_config 68 | end 69 | 70 | # See CUContext#get_api_version. 71 | def api_version 72 | get_api_version 73 | end 74 | 75 | end 76 | 77 | 78 | class CUModule 79 | 80 | # See CUModule#get_function. 81 | def function(name_str) 82 | get_function(name_str) 83 | end 84 | 85 | # See CUModule#get_global. 
86 | def global(name_str) 87 | get_global(name_str) 88 | end 89 | 90 | # See CUModule#get_texref. 91 | def texref(name_str) 92 | get_texref(name_str) 93 | end 94 | 95 | end 96 | 97 | 98 | class CUFunction 99 | 100 | # See CUFunction#get_attribute. 101 | def attribute(attr) 102 | get_attribute(attr) 103 | end 104 | 105 | end 106 | 107 | 108 | class CUTexRef 109 | 110 | # See CUTexRef#get_address. 111 | def address 112 | get_address 113 | end 114 | 115 | # See CUTexRef#get_address_mode. 116 | def address_mode(dim) 117 | get_address_mode(dim) 118 | end 119 | 120 | # See CUTexRef#get_filter_mode. 121 | def filter_mode 122 | get_filter_mode 123 | end 124 | 125 | # See CUTexRef#get_flags. 126 | def flags 127 | get_flags 128 | end 129 | 130 | end 131 | 132 | 133 | # See ::driver_get_version. 134 | def driver_version 135 | driver_get_version 136 | end 137 | module_function :driver_version 138 | 139 | 140 | end # module 141 | end # module 142 | -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/function.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 
20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | # 24 | 25 | require 'cuda/runtime/ffi-cuda' 26 | require 'cuda/runtime/cuda' 27 | require 'cuda/runtime/error' 28 | require 'memory/pointer' 29 | require 'dl' 30 | 31 | 32 | module SGC 33 | module Cuda 34 | 35 | class CudaFunction 36 | 37 | attr_reader :name 38 | 39 | 40 | def initialize(name) 41 | @name = name 42 | end 43 | 44 | 45 | def attributes 46 | a = CudaFuncAttributes.new 47 | status = API::cudaFuncGetAttributes(a.to_ptr, @name) 48 | Pvt::handle_error(status) 49 | a 50 | end 51 | 52 | 53 | def cache_config=(config) 54 | status = API::cudaFuncSetCacheConfig(@name, config) 55 | Pvt::handle_error(status) 56 | config 57 | end 58 | 59 | 60 | def launch 61 | status = API::cudaLaunch(@name) 62 | Pvt::handle_error(status) 63 | self 64 | end 65 | 66 | 67 | def self.configure(grid_dim, block_dim, shared_mem_size = 0, stream = 0) 68 | status = API::cudaConfigureCall(grid_dim, block_dim, shared_mem_size, stream) 69 | Pvt::handle_error(status) 70 | self 71 | end 72 | 73 | 74 | def self.setup(*args) 75 | offset = 0 76 | args.each do |x| 77 | case x 78 | when Fixnum 79 | p = FFI::MemoryPointer.new(:int) 80 | p.write_int(x) 81 | size = 4 82 | when Float 83 | p = FFI::MemoryPointer.new(:float) 84 | p.write_float(x) 85 | size = 4 86 | when SGC::Memory::MemoryPointer 87 | p = x.ref 88 | size = FFI::MemoryPointer.size 89 | else 90 | raise TypeError, "Invalid type of argument #{x.to_s}." 
91 | end 92 | offset = align_up(offset, size) 93 | status = API::cudaSetupArgument(p, size, offset) 94 | Pvt::handle_error(status) 95 | offset += size 96 | end 97 | end 98 | 99 | 100 | def self.load_lib(name) 101 | raise NotImplementedError 102 | end 103 | 104 | 105 | def self.load_lib_file(name) 106 | @@libs << DL::dlopen(name) 107 | # API::ffi_lib(name) 108 | self 109 | end 110 | 111 | 112 | def self.unload_all_libs 113 | @@libs.each do |h| 114 | h.close 115 | end 116 | @@libs = [] 117 | self 118 | end 119 | 120 | protected 121 | 122 | def self.align_up(offset, alignment) 123 | (offset + alignment - 1) & ~(alignment - 1) 124 | end 125 | 126 | @@libs = [] 127 | 128 | end 129 | 130 | end # module 131 | end # module 132 | -------------------------------------------------------------------------------- /sgc/lib/madison/kernel/test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | //#include 3 | #include 4 | #include 5 | //#include 6 | 7 | #define DIMENSIONS 5 8 | #define BLOCK_SIZE 16 9 | 10 | 11 | // Kernel definition 12 | //__global__ void MatAdd(float A[N][N], float B[N][N], 13 | // float C[N][N]) 14 | //{ 15 | // int i = threadIdx.x; 16 | // int j = threadIdx.y; 17 | // C[i][j] = A[i][j] + B[i][j]; 18 | //} 19 | 20 | // Matrices are stored in row-major order: 21 | // M(row, col) = *(M.elements + row * M.width + col) 22 | 23 | __global__ void MatPopulate(float *A, int count) 24 | { 25 | int row = blockIdx.x; 26 | int col = threadIdx.x; 27 | A[row * DIMENSIONS + col] = (float)(row * DIMENSIONS + col)/(DIMENSIONS*count); 28 | } 29 | 30 | float score(float *A, float *B){ 31 | float score = 0.0; 32 | for(int i=0; i>>(d_elements, count); 73 | cudaMemcpy(elements, d_elements, size, 74 | cudaMemcpyDeviceToHost); 75 | for(int i=0; i 2 && !strcmp(argv[2], "raw")){ 89 | printf("\nraw\n"); 90 | float _score; 91 | for(int i=0; i>>(d_elements, d_scores, count); 107 | cudaMemcpy(scores, d_scores, size2, 108 | 
cudaMemcpyDeviceToHost); 109 | } 110 | float sum = 0.0; 111 | for (int i=0;i Cluster ##{c}" 49 | CudaMemory.memcpy_htod(@values_dev_1, self.values.offset(c*offset_increment), offset_increment * @type_size) 50 | CudaMemory.memcpy_htod(@keys_dev_1, self.keys.offset(c*offset_increment), offset_increment * INTEGER_SIZE) 51 | 52 | (0...other_clusters_count).each do |cc| 53 | self.class.log ">> with Cluster ##{cc}" 54 | compare_cluster_with(matrix, c, cc, CLUSTER_SIZE, CLUSTER_SIZE) 55 | end 56 | # We have to handle the leftovers => if we have 66 blocks and CLUSTER_SIZE == 64, we have to handle 2 blocks separately 57 | if other_leftovers_count > 0 58 | self.class.log ">> with the leftovers" 59 | compare_cluster_with(matrix, c, other_clusters_count, CLUSTER_SIZE, other_leftovers_count) 60 | end 61 | end 62 | if self_leftovers_count > 0 63 | self.class.log "\n> The leftovers" 64 | c = self_clusters_count 65 | CudaMemory.memcpy_htod(@values_dev_1, self.values.offset(c*offset_increment), self_leftovers_count * BLOCK_SIZE * self.vectors_dimension * @type_size) 66 | CudaMemory.memcpy_htod(@keys_dev_1, self.keys.offset(c*offset_increment), self_leftovers_count * BLOCK_SIZE * self.vectors_dimension * INTEGER_SIZE) 67 | 68 | (0...other_clusters_count).each do |cc| 69 | self.class.log ">> with Cluster ##{cc}" 70 | compare_cluster_with(matrix, self_clusters_count, cc, self_leftovers_count, CLUSTER_SIZE) 71 | end 72 | # We have to handle the leftovers => if we have 66 blocks and CLUSTER_SIZE == 64, we have to handle 2 blocks separately 73 | if other_leftovers_count > 0 74 | self.class.log ">> with the leftovers" 75 | compare_cluster_with(matrix, self_clusters_count, other_clusters_count, self_leftovers_count, other_leftovers_count) 76 | end 77 | end 78 | end 79 | 80 | def compare_cluster_with(matrix, cluster, offset, current_cluster_size, size) 81 | puts [matrix.inspect, cluster, offset, current_cluster_size, size] 82 | puts matrix.inspect 83 | puts size * BLOCK_SIZE * 
self.vectors_dimension * @type_size 84 | CudaMemory.memcpy_htod(@values_dev_2, matrix.values.offset(offset * CLUSTER_SIZE * BLOCK_SIZE * self.vectors_dimension), size * BLOCK_SIZE * self.vectors_dimension * @type_size) 85 | CudaMemory.memcpy_htod(@keys_dev_2, matrix.keys.offset(offset * CLUSTER_SIZE * BLOCK_SIZE * self.vectors_dimension), size * BLOCK_SIZE * self.vectors_dimension * INTEGER_SIZE) 86 | 87 | CudaFunction.configure(Dim3.new(current_cluster_size, 1, 1), Dim3.new(BLOCK_SIZE, 1, 1)) 88 | CudaFunction.setup(@values_dev_1, @values_dev_2, @keys_dev_1, @keys_dev_2, @scores_dev, size * BLOCK_SIZE) 89 | f = CudaFunction.new("ParallelScore") 90 | f.launch 91 | CudaMemory.memcpy_dtoh(@scores, @scores_dev, @scores_size * @type_size) 92 | # @scores.each do |s| puts s end 93 | 94 | $stderr.puts "#{cluster * CLUSTER_SIZE * BLOCK_SIZE} .. #{(cluster) * BLOCK_SIZE * CLUSTER_SIZE + current_cluster_size * BLOCK_SIZE - 1} x #{offset * CLUSTER_SIZE * BLOCK_SIZE} .. #{offset * CLUSTER_SIZE * BLOCK_SIZE + size * BLOCK_SIZE - 1}" 95 | self.class.output_scores(current_cluster_size * BLOCK_SIZE, size * BLOCK_SIZE, cluster * CLUSTER_SIZE * BLOCK_SIZE, offset * CLUSTER_SIZE * BLOCK_SIZE, @scores) 96 | end 97 | 98 | def prepare_kernel_lib 99 | kernel_dir = "#{File.dirname(__FILE__)}/kernel" 100 | File.open("#{kernel_dir}/kernel.h", 'w') do |f| 101 | f.write "#define DIMENSIONS #{self.vectors_dimension}\n" 102 | f.write "#define BLOCK_SIZE #{BLOCK_SIZE}\n" 103 | f.write "#define CLUSTER_SIZE #{CLUSTER_SIZE}\n" 104 | end 105 | system "cd #{kernel_dir}; rm libkernel.*.so;nvcc -shared -Xcompiler -fPIC kernel.cu -o libkernel.#{self.vectors_dimension}.so" 106 | "#{kernel_dir}/libkernel.#{self.vectors_dimension}.so" 107 | end 108 | end 109 | 110 | module ClassMethods 111 | 112 | def log message 113 | $stderr.puts message 114 | end 115 | 116 | def output_scores rows, cols, offset_x, offset_y, score 117 | (0...rows).each do |i| 118 | (0...cols).each do |j| 119 | real_i = offset_x + i 120 
| real_j = offset_y + j 121 | puts "#{real_i}\t #{real_j}\t %.3f\n" % (score.is_a?(SGC::Memory::Buffer) ? score[i * cols + j] : score) 122 | end 123 | end 124 | end 125 | end 126 | end 127 | end -------------------------------------------------------------------------------- /sgc/lib/cuda/runtime/ffi-cuda.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 
23 | # 24 | 25 | require 'ffi' 26 | require 'ffi/prettystruct' 27 | 28 | 29 | module SGC 30 | module Cuda 31 | module API 32 | 33 | extend FFI::Library 34 | ffi_lib "cudart" 35 | 36 | CudaError = enum( 37 | :cudaSuccess, 0, 38 | :cudaErrorMissingConfiguration, 1, 39 | :cudaErrorMemoryAllocation, 2, 40 | :cudaErrorInitializationError, 3, 41 | :cudaErrorLaunchFailure, 4, 42 | :cudaErrorPriorLaunchFailure, 5, 43 | :cudaErrorLaunchTimeout, 6, 44 | :cudaErrorLaunchOutOfResources, 7, 45 | :cudaErrorInvalidDeviceFunction, 8, 46 | :cudaErrorInvalidConfiguration, 9, 47 | :cudaErrorInvalidDevice, 10, 48 | :cudaErrorInvalidValue, 11, 49 | :cudaErrorInvalidPitchValue, 12, 50 | :cudaErrorInvalidSymbol, 13, 51 | :cudaErrorMapBufferObjectFailed, 14, 52 | :cudaErrorUnmapBufferObjectFailed, 15, 53 | :cudaErrorInvalidHostPointer, 16, 54 | :cudaErrorInvalidDevicePointer, 17, 55 | :cudaErrorInvalidTexture, 18, 56 | :cudaErrorInvalidTextureBinding, 19, 57 | :cudaErrorInvalidChannelDescriptor, 20, 58 | :cudaErrorInvalidMemcpyDirection, 21, 59 | :cudaErrorAddressOfConstant, 22, 60 | :cudaErrorTextureFetchFailed, 23, 61 | :cudaErrorTextureNotBound, 24, 62 | :cudaErrorSynchronizationError, 25, 63 | :cudaErrorInvalidFilterSetting, 26, 64 | :cudaErrorInvalidNormSetting, 27, 65 | :cudaErrorMixedDeviceExecution, 28, 66 | :cudaErrorCudartUnloading, 29, 67 | :cudaErrorUnknown, 30, 68 | :cudaErrorNotYetImplemented, 31, 69 | :cudaErrorMemoryValueTooLarge, 32, 70 | :cudaErrorInvalidResourceHandle, 33, 71 | :cudaErrorNotReady, 34, 72 | :cudaErrorInsufficientDriver, 35, 73 | :cudaErrorSetOnActiveProcess, 36, 74 | :cudaErrorInvalidSurface, 37, 75 | :cudaErrorNoDevice, 38, 76 | :cudaErrorECCUncorrectable, 39, 77 | :cudaErrorSharedObjectSymbolNotFound, 40, 78 | :cudaErrorSharedObjectInitFailed, 41, 79 | :cudaErrorUnsupportedLimit, 42, 80 | :cudaErrorDuplicateVariableName, 43, 81 | :cudaErrorDuplicateTextureName, 44, 82 | :cudaErrorDuplicateSurfaceName, 45, 83 | :cudaErrorDevicesUnavailable, 46, 84 | 
:cudaErrorInvalidKernelImage, 47, 85 | :cudaErrorNoKernelImageForDevice, 48, 86 | :cudaErrorIncompatibleDriverContext, 49, 87 | :cudaErrorStartupFailure, 0x7F, 88 | :cudaErrorApiFailureBase, 10000, 89 | ) 90 | CudaError_t = CudaError 91 | 92 | CudaDeviceFlags = enum( 93 | :cudaDeviceScheduleAuto, 0, 94 | :cudaDeviceScheduleSpin, 1, 95 | :cudaDeviceScheduleYield, 2, 96 | :cudaDeviceBlockingSync, 4, 97 | :cudaDeviceMapHost, 8, 98 | :cudaDeviceLmemResizeToMax, 16, 99 | ) 100 | 101 | CudaEventFlags = enum( 102 | :cudaEventDefault, 0, 103 | :cudaEventBlockingSync, 1, 104 | :cudaEventDisableTiming, 2, 105 | ) 106 | 107 | CudaHostAllocFlags = enum( 108 | :cudaHostAllocDefault, 0, 109 | :cudaHostAllocPortable, 1, 110 | :cudaHostAllocMapped, 2, 111 | :cudaHostAllocWriteCombined, 4, 112 | ) 113 | 114 | CudaArrayFlags = enum( 115 | :cudaArrayDefault, 0x00, 116 | :cudaArraySurfaceLoadStore, 0x02, 117 | ) 118 | 119 | CudaMemcpyKind = enum( 120 | :cudaMemcpyHostToHost, 0, 121 | :cudaMemcpyHostToDevice, 1, 122 | :cudaMemcpyDeviceToHost, 2, 123 | :cudaMemcpyDeviceToDevice, 3, 124 | ) 125 | 126 | CudaChannelFormatKind = enum( 127 | :cudaChannelFormatKindSigned, 0, 128 | :cudaChannelFormatKindUnsigned, 1, 129 | :cudaChannelFormatKindFloat, 2, 130 | :cudaChannelFormatKindNone,3, 131 | ) 132 | 133 | CudaFuncCache = enum( 134 | :cudaFuncCachePreferNone, 0, 135 | :cudaFuncCachePreferShared, 1, 136 | :cudaFuncCachePreferL1, 2, 137 | ) 138 | 139 | CudaLimit = enum( 140 | :cudaLimitStackSize, 0x00, 141 | :cudaLimitPrintfFifoSize, 0x01, 142 | :cudaLimitMallocHeapSize, 0x02, 143 | ) 144 | 145 | CudaComputeMode = enum( 146 | :cudaComputeModeDefault, 0, 147 | :cudaComputeModeExclusive, 1, 148 | :cudaComputeModeProhibited, 2, 149 | ) 150 | 151 | CudaSurfaceBoundaryMode = enum( 152 | :cudaBoundaryModeZero, 0, 153 | :cudaBoundaryModeClamp, 1, 154 | :cudaBoundaryModeTrap, 2, 155 | ) 156 | 157 | CudaSurfaceFormatMode = enum( 158 | :cudaFormatModeForced, 0, 159 | :cudaFormatModeAuto, 1, 160 | ) 161 
| 162 | CudaTextureAddressMode = enum( 163 | :cudaAddressModeWrap, 0, 164 | :cudaAddressModeClamp, 1, 165 | :cudaAddressModeMirror, 2, 166 | :cudaAddressModeBorder, 3, 167 | ) 168 | 169 | CudaTextureFilterMode = enum( 170 | :cudaFilterModePoint, 0, 171 | :cudaFilterModeLinear, 1, 172 | ) 173 | 174 | CudaTextureReadMode = enum( 175 | :cudaReadModeElementType, 0, 176 | :cudaReadModeNormalizedFloat, 1, 177 | ) 178 | 179 | typedef :pointer, :CudaStream 180 | typedef :pointer, :CudaEvent 181 | 182 | typedef :CudaStream, :CudaStream_t 183 | typedef :CudaEvent, :CudaEvent_t 184 | 185 | 186 | class Dim3 < FFI::Struct 187 | layout( 188 | :array, [:uint, 3], 189 | ) 190 | 191 | alias :init :initialize 192 | alias :get :[] 193 | alias :set :[]= 194 | private :init, :get, :set 195 | 196 | def initialize(x, y, z) 197 | init 198 | @array = get(:array) 199 | @array[0], @array[1], @array[2] = x, y, z 200 | end 201 | 202 | def [](index); @array[index]; end 203 | def []=(index, value); @array[index] = value; end 204 | 205 | def x; @array[0]; end 206 | def y; @array[1]; end 207 | def z; @array[2]; end 208 | 209 | def x=(value); @array[0] = value; end 210 | def y=(value); @array[1] = value; end 211 | def z=(value); @array[2] = value; end 212 | 213 | end 214 | 215 | class CudaDeviceProp < FFI::PrettyStruct 216 | layout( 217 | :name, [:char, 256], 218 | :totalGlobalMem, :size_t, 219 | :sharedMemPerBlock, :size_t, 220 | :regsPerBlock, :int, 221 | :warpSize, :int, 222 | :memPitch, :size_t, 223 | :maxThreadsPerBlock, :int, 224 | :maxThreadsDim, [:int, 3], 225 | :maxGridSize, [:int, 3], 226 | :clockRate, :int, 227 | :totalConstMem, :size_t, 228 | :major, :int, 229 | :minor, :int, 230 | :textureAlignment, :size_t, 231 | :deviceOverlap, :int, 232 | :multiProcessorCount, :int, 233 | :kernelExecTimeoutEnabled, :int, 234 | :integrated, :int, 235 | :canMapHostMemory, :int, 236 | :computeMode, :int, 237 | :maxTexture1D, :int, 238 | :maxTexture2D, [:int, 2], 239 | :maxTexture3D, [:int, 3], 240 | 
:maxTexture2DArray, [:int, 3], 241 | :surfaceAlignment, :size_t, 242 | :concurrentKernels, :int, 243 | :ECCEnabled, :int, 244 | :pciBusID, :int, 245 | :__cudaReserved, [:int, 21], 246 | ) 247 | end 248 | 249 | class CudaFuncAttributes < FFI::PrettyStruct 250 | layout( 251 | :sharedSizeBytes, :size_t, 252 | :constSizeBytes, :size_t, 253 | :localSizeBytes, :size_t, 254 | :maxThreadsPerBlock, :int, 255 | :numRegs, :int, 256 | :ptxVersion, :int, 257 | :binaryVersion, :int, 258 | :__cudaReserved, [:int, 6], 259 | ) 260 | end 261 | 262 | class CudaChannelFormatDesc < FFI::PrettyStruct 263 | layout( 264 | :x, :int, 265 | :y, :int, 266 | :z, :int, 267 | :w, :int, 268 | :f, CudaChannelFormatKind, 269 | ) 270 | end 271 | 272 | class CudaPitchedPtr < FFI::PrettyStruct 273 | layout( 274 | :ptr, :pointer, 275 | :pitch, :size_t, 276 | :xsize, :size_t, 277 | :ysize, :size_t, 278 | ) 279 | end 280 | 281 | class CudaPos < FFI::PrettyStruct 282 | layout( 283 | :x, :size_t, 284 | :y, :size_t, 285 | :z, :size_t, 286 | ) 287 | end 288 | 289 | class CudaExtent < FFI::PrettyStruct 290 | layout( 291 | :width, :size_t, 292 | :height, :size_t, 293 | :depth, :size_t, 294 | ) 295 | end 296 | 297 | class CudaMemcpy3DParms < FFI::PrettyStruct 298 | layout( 299 | :srcArray, :pointer, 300 | :srcPos, CudaPos, 301 | :srcPtr, CudaPitchedPtr, 302 | :dstArray, :pointer, 303 | :dstPos, CudaPos, 304 | :dstPtr, CudaPitchedPtr, 305 | :extent, CudaExtent, 306 | :kind, CudaMemcpyKind, 307 | ) 308 | end 309 | 310 | class TextureReference < FFI::PrettyStruct 311 | layout( 312 | :normalized, :int, 313 | :filterMode, CudaTextureFilterMode, 314 | :addressMode, [CudaTextureAddressMode, 3], 315 | :channelDesc, CudaChannelFormatDesc, 316 | :__cudaReserved, [:int, 16], 317 | ) 318 | end 319 | 320 | class SurfaceReference < FFI::PrettyStruct 321 | layout( 322 | :channelDesc, CudaChannelFormatDesc, 323 | ) 324 | end 325 | 326 | # CUDA Version Management. 
attach_function :cudaDriverGetVersion, [:pointer], :int
attach_function :cudaRuntimeGetVersion, [:pointer], :int

# CUDA Error Handling.
attach_function :cudaGetErrorString, [CudaError], :string
attach_function :cudaGetLastError, [], :int
attach_function :cudaPeekAtLastError, [], :int

# CUDA Device Management.
attach_function :cudaChooseDevice, [:pointer, :pointer], :int
attach_function :cudaGetDevice, [:pointer], :int
attach_function :cudaGetDeviceCount, [:pointer], :int
attach_function :cudaGetDeviceProperties, [:pointer, :int], :int
attach_function :cudaSetDevice, [:int], :int
attach_function :cudaSetDeviceFlags, [:uint], :int
attach_function :cudaSetValidDevices, [:pointer, :int], :int

# CUDA Thread Management.
attach_function :cudaThreadExit, [], :int
attach_function :cudaThreadGetCacheConfig, [:pointer], :int
attach_function :cudaThreadGetLimit, [:pointer, CudaLimit], :int
attach_function :cudaThreadSetCacheConfig, [CudaFuncCache], :int
attach_function :cudaThreadSetLimit, [CudaLimit, :size_t], :int
attach_function :cudaThreadSynchronize, [], :int

# CUDA Memory Management.
attach_function :cudaFree, [:pointer], :int
attach_function :cudaFreeArray, [:pointer], :int
attach_function :cudaFreeHost, [:pointer], :int
attach_function :cudaGetSymbolAddress, [:pointer, :string], :int
attach_function :cudaGetSymbolSize, [:pointer, :string], :int
attach_function :cudaHostAlloc, [:pointer, :size_t, :uint], :int
attach_function :cudaHostGetDevicePointer, [:pointer, :pointer, :uint], :int
attach_function :cudaHostGetFlags, [:pointer, :pointer], :int
attach_function :cudaMalloc, [:pointer, :size_t], :int
attach_function :cudaMalloc3D, [:pointer, CudaExtent.by_value], :int
attach_function :cudaMalloc3DArray, [:pointer, :pointer, CudaExtent.by_value, :uint], :int
attach_function :cudaMallocArray, [:pointer, :pointer, :size_t, :size_t, :uint], :int
attach_function :cudaMallocHost, [:pointer, :size_t], :int
attach_function :cudaMallocPitch, [:pointer, :pointer, :size_t, :size_t], :int
attach_function :cudaMemcpy, [:pointer, :pointer, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpy2D, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpy2DArrayToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpy2DAsync, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpy2DFromArray, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpy2DFromArrayAsync, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpy2DToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpy2DToArrayAsync, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpy3D, [:pointer], :int
attach_function :cudaMemcpy3DAsync, [:pointer, :CudaStream], :int
attach_function :cudaMemcpyArrayToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpyAsync, [:pointer, :pointer, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpyFromArray, [:pointer, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpyFromArrayAsync, [:pointer, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpyFromSymbol, [:pointer, :string, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpyFromSymbolAsync, [:pointer, :string, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpyToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpyToArrayAsync, [:pointer, :size_t, :size_t, :pointer, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemcpyToSymbol, [:string, :pointer, :size_t, :size_t, CudaMemcpyKind], :int
attach_function :cudaMemcpyToSymbolAsync, [:string, :pointer, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
attach_function :cudaMemGetInfo, [:pointer, :pointer], :int
attach_function :cudaMemset, [:pointer, :int, :size_t], :int
attach_function :cudaMemset2D, [:pointer, :size_t, :int, :size_t, :size_t], :int
attach_function :cudaMemset2DAsync, [:pointer, :size_t, :int, :size_t, :size_t, :CudaStream], :int
attach_function :cudaMemset3D, [CudaPitchedPtr.by_value, :int, CudaExtent.by_value], :int
attach_function :cudaMemset3DAsync, [CudaPitchedPtr.by_value, :int, CudaExtent.by_value, :CudaStream], :int
attach_function :cudaMemsetAsync, [:pointer, :int, :size_t, :CudaStream], :int

# The make_cudaXXX helpers are implemented in Ruby below rather than attached:
# the C helpers return structs by value, which these commented bindings would
# have required.
# attach_function :make_cudaExtent, [:size_t, :size_t, :size_t], CudaExtent
# attach_function :make_cudaPitchedPtr, [:pointer, :size_t, :size_t, :size_t], CudaPitchedPtr
# attach_function :make_cudaPos, [:size_t, :size_t, :size_t], CudaPos

# Build a CudaExtent struct with width _w_, height _h_ and depth _d_.
def make_cudaExtent(w, h, d)
  e = CudaExtent.new
  e[:width], e[:height], e[:depth] = w, h, d
  e
end

# Build a CudaPitchedPtr struct wrapping pointer _d_ with pitch _p_ and the
# logical 2D dimensions _xsz_ x _ysz_.
def make_cudaPitchedPtr(d, p, xsz, ysz)
  s = CudaPitchedPtr.new
  s[:ptr] = d
  s[:pitch] = p
  s[:xsize] = xsz
  s[:ysize] = ysz
  s
end

# Build a CudaPos struct from the coordinates _x_, _y_, _z_.
def make_cudaPos(x, y, z)
  p = CudaPos.new
  p[:x] = x
  p[:y] = y
  p[:z] = z
  p
end

# CUDA Execution Control.
# FIX: the last parameter of cudaConfigureCall is a cudaStream_t, not an
# unsigned int. Declared as :uint it would truncate the stream handle on
# 64-bit platforms; use the :CudaStream typedef like every other stream-taking
# binding in this file.
attach_function :cudaConfigureCall, [Dim3.by_value, Dim3.by_value, :size_t, :CudaStream], :int
attach_function :cudaFuncGetAttributes, [:pointer, :string], :int
attach_function :cudaFuncSetCacheConfig, [:string, CudaFuncCache], :int
attach_function :cudaLaunch, [:string], :int
attach_function :cudaSetDoubleForDevice, [:pointer], :int
attach_function :cudaSetDoubleForHost, [:pointer], :int
attach_function :cudaSetupArgument, [:pointer, :size_t, :size_t], :int

# CUDA Stream Management.
attach_function :cudaStreamCreate, [:pointer], :int
attach_function :cudaStreamDestroy, [:CudaStream], :int
attach_function :cudaStreamQuery, [:CudaStream], :int
attach_function :cudaStreamSynchronize, [:CudaStream], :int
attach_function :cudaStreamWaitEvent, [:CudaStream, :CudaEvent, :uint], :int

# CUDA Event Management.
438 | attach_function :cudaEventCreate, [:pointer], :int 439 | attach_function :cudaEventCreateWithFlags, [:pointer, :uint], :int 440 | attach_function :cudaEventDestroy, [:CudaEvent], :int 441 | attach_function :cudaEventElapsedTime, [:pointer, :CudaEvent, :CudaEvent], :int 442 | attach_function :cudaEventQuery, [:CudaEvent], :int 443 | attach_function :cudaEventRecord, [:CudaEvent, :CudaStream], :int 444 | attach_function :cudaEventSynchronize, [:CudaEvent], :int 445 | 446 | # CUDA Texture Reference Management. 447 | attach_function :cudaBindTexture, [:pointer, :pointer, :pointer, :pointer, :size_t], :int 448 | attach_function :cudaBindTexture2D, [:pointer, :pointer, :pointer, :pointer, :size_t, :size_t, :size_t], :int 449 | attach_function :cudaBindTextureToArray, [:pointer, :pointer, :pointer], :int 450 | attach_function :cudaCreateChannelDesc, [:int, :int, :int, :int, CudaChannelFormatKind], CudaChannelFormatDesc.by_value 451 | attach_function :cudaGetChannelDesc, [:pointer, :pointer], :int 452 | attach_function :cudaGetTextureAlignmentOffset, [:pointer, :pointer], :int 453 | attach_function :cudaGetTextureReference, [:pointer, :string], :int 454 | attach_function :cudaUnbindTexture, [:pointer], :int 455 | 456 | # CUDA Surface Reference Management. 457 | attach_function :cudaBindSurfaceToArray, [:pointer, :pointer, :pointer], :int 458 | attach_function :cudaGetSurfaceReference, [:pointer, :string], :int 459 | 460 | end # module 461 | end # module 462 | end # module 463 | -------------------------------------------------------------------------------- /sgc/lib/cuda/driver/rubycu.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | # Copyright (c) 2010 Chung Shin Yee 3 | # 4 | # shinyee@speedgocomputing.com 5 | # http://www.speedgocomputing.com 6 | # http://github.com/xman/sgc-ruby-cuda 7 | # http://rubyforge.org/projects/rubycuda 8 | # 9 | # This file is part of SGC-Ruby-CUDA. 
10 | # 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with SGC-Ruby-CUDA. If not, see . 23 | */ 24 | 25 | #include 26 | #include "ruby.h" 27 | #include "cuda.h" 28 | 29 | namespace SGC { 30 | namespace CU { 31 | 32 | // {{{ SGC Ruby modules. 33 | static VALUE rb_mSGC; 34 | static VALUE rb_mCU; 35 | static VALUE rb_mIBuffer; 36 | static VALUE rb_mIBufferClassMethods; 37 | // }}} 38 | 39 | // {{{ CUDA Ruby classes. 40 | static VALUE rb_cCUDevice; 41 | static VALUE rb_cCUContext; 42 | static VALUE rb_cCUContextFlags; 43 | static VALUE rb_cCULimit; 44 | static VALUE rb_cCUModule; 45 | static VALUE rb_cCUFunction; 46 | static VALUE rb_cCUFunctionAttribute; 47 | static VALUE rb_cCUFunctionCache; 48 | static VALUE rb_cCUDevicePtr; 49 | static VALUE rb_cCUDeviceAttribute; 50 | static VALUE rb_cCUComputeMode; 51 | static VALUE rb_cCUStream; 52 | static VALUE rb_cCUEvent; 53 | static VALUE rb_cCUEventFlags; 54 | static VALUE rb_cCUAddressMode; 55 | static VALUE rb_cCUFilterMode; 56 | static VALUE rb_cCUTexRefFlags; 57 | static VALUE rb_cCUTexRef; 58 | static VALUE rb_cCUResult; 59 | // }}} 60 | 61 | // {{{ SGC Ruby classes. 
62 | static VALUE rb_eCUStandardError; 63 | 64 | static VALUE rb_eCUDeviceError; 65 | static VALUE rb_eCUDeviceNotInitializedError; 66 | static VALUE rb_eCUDeviceDeinitializedError; 67 | static VALUE rb_eCUNoDeviceError; 68 | static VALUE rb_eCUInvalidDeviceError; 69 | 70 | static VALUE rb_eCUMapError; 71 | static VALUE rb_eCUMapFailedError; 72 | static VALUE rb_eCUUnMapFailedError; 73 | static VALUE rb_eCUArrayIsMappedError; 74 | static VALUE rb_eCUAlreadyMappedError; 75 | static VALUE rb_eCUNotMappedError; 76 | static VALUE rb_eCUNotMappedAsArrayError; 77 | static VALUE rb_eCUNotMappedAsPointerError; 78 | 79 | static VALUE rb_eCUContextError; 80 | static VALUE rb_eCUInvalidContextError; 81 | static VALUE rb_eCUContextAlreadyCurrentError; 82 | static VALUE rb_eCUUnsupportedLimitError; 83 | 84 | static VALUE rb_eCULaunchError; 85 | static VALUE rb_eCULaunchFailedError; 86 | static VALUE rb_eCULaunchOutOfResourcesError; 87 | static VALUE rb_eCULaunchTimeoutError; 88 | static VALUE rb_eCULaunchIncompatibleTexturingError; 89 | 90 | static VALUE rb_eCUParameterError; 91 | static VALUE rb_eCUInvalidValueError; 92 | static VALUE rb_eCUInvalidHandleError; 93 | 94 | static VALUE rb_eCUMemoryError; 95 | static VALUE rb_eCUOutOfMemoryError; 96 | 97 | static VALUE rb_eCULibraryError; 98 | static VALUE rb_eCUSharedObjectSymbolNotFoundError; 99 | static VALUE rb_eCUSharedObjectInitFailedError; 100 | 101 | static VALUE rb_eCUHardwareError; 102 | static VALUE rb_eCUECCUncorrectableError; 103 | 104 | static VALUE rb_eCUFileError; 105 | static VALUE rb_eCUNoBinaryForGPUError; 106 | static VALUE rb_eCUFileNotFoundError; 107 | static VALUE rb_eCUInvalidSourceError; 108 | static VALUE rb_eCUInvalidImageError; 109 | 110 | static VALUE rb_eCUReferenceError; 111 | static VALUE rb_eCUReferenceNotFoundError; 112 | 113 | static VALUE rb_eCUOtherError; 114 | static VALUE rb_eCUAlreadyAcquiredError; 115 | static VALUE rb_eCUNotReadyError; 116 | static VALUE rb_eCUOperatingSystemError; 117 | 
118 | static VALUE rb_eCUUnknownError; 119 | 120 | static VALUE rb_cMemoryPointer; 121 | static VALUE rb_cMemoryBuffer; 122 | static VALUE rb_cInt32Buffer; 123 | static VALUE rb_cInt64Buffer; 124 | static VALUE rb_cFloat32Buffer; 125 | static VALUE rb_cFloat64Buffer; 126 | // }}} 127 | 128 | // {{{ SGC C/C++ structures. 129 | typedef struct { 130 | char* p; 131 | } MemoryPointer; 132 | 133 | typedef struct : MemoryPointer { 134 | size_t size; 135 | bool is_page_locked; 136 | } MemoryBuffer; 137 | 138 | template 139 | struct TypedBuffer : public MemoryBuffer {}; 140 | 141 | typedef struct TypedBuffer Int32Buffer; 142 | typedef struct TypedBuffer Int64Buffer; 143 | typedef struct TypedBuffer Float32Buffer; 144 | typedef struct TypedBuffer Float64Buffer; 145 | // }}} 146 | 147 | // {{{ Function prototypes. 148 | static VALUE device_ptr_alloc(VALUE klass); 149 | static VALUE device_ptr_initialize(int argc, VALUE* argv, VALUE self); 150 | // }}} 151 | 152 | // {{{ SGC helpers. 153 | template 154 | static void generic_free(void* p) 155 | { 156 | delete static_cast(p); 157 | } 158 | 159 | template 160 | static VALUE to_rb(T v); 161 | 162 | VALUE to_rb(bool b) 163 | { 164 | if (b) { 165 | return Qtrue; 166 | } 167 | return Qfalse; 168 | } 169 | 170 | template <> 171 | VALUE to_rb(int v) 172 | { 173 | return INT2FIX(v); 174 | } 175 | 176 | template <> 177 | VALUE to_rb(long v) 178 | { 179 | return LONG2NUM(v); 180 | } 181 | 182 | template <> 183 | VALUE to_rb(float v) 184 | { 185 | return DBL2NUM(static_cast(v)); 186 | } 187 | 188 | template <> 189 | VALUE to_rb(double v) 190 | { 191 | return DBL2NUM(v); 192 | } 193 | 194 | template 195 | static T to_ctype(VALUE v); 196 | 197 | template <> 198 | bool to_ctype(VALUE b) 199 | { 200 | if (b == Qfalse || b == Qnil) { 201 | return false; 202 | } 203 | return true; 204 | } 205 | 206 | template <> 207 | int to_ctype(VALUE v) 208 | { 209 | return NUM2INT(v); 210 | } 211 | 212 | template <> 213 | unsigned int to_ctype(VALUE v) 214 | 
{ 215 | return NUM2UINT(v); 216 | } 217 | 218 | template <> 219 | long to_ctype(VALUE v) 220 | { 221 | return NUM2LONG(v); 222 | } 223 | 224 | template <> 225 | unsigned long to_ctype(VALUE v) 226 | { 227 | return NUM2ULONG(v); 228 | } 229 | 230 | template <> 231 | float to_ctype(VALUE v) 232 | { 233 | return static_cast(NUM2DBL(v)); 234 | } 235 | 236 | template <> 237 | double to_ctype(VALUE v) 238 | { 239 | return NUM2DBL(v); 240 | } 241 | 242 | // in ary[0]: Class contains class constants. 243 | // in ary[1]: Constant to match. 244 | // out ary[2]: Label matches with constant. 245 | static VALUE class_const_match(VALUE current_label, VALUE* ary) 246 | { 247 | const VALUE& rb_class_const = ary[0]; 248 | const VALUE& constant_value = ary[1]; 249 | VALUE& label = ary[2]; 250 | VALUE v = rb_const_get(rb_class_const, SYM2ID(current_label)); 251 | if (FIX2INT(v) == FIX2INT(constant_value)) { 252 | label = current_label; 253 | return Qtrue; 254 | } 255 | return Qfalse; 256 | } 257 | 258 | // Extend _klass_ with the module _mod::ClassMethods_. 259 | static VALUE module_included_classmethods_hook(VALUE mod, VALUE klass) 260 | { 261 | VALUE m = rb_cvar_get(mod, rb_intern("ClassMethods")); 262 | rb_extend_object(klass, m); 263 | return Qnil; 264 | } 265 | 266 | #define RAISE_CU_STD_ERROR_FORMATTED(status, format, ...) rb_raise(rb_hash_aref(rb_error_class_by_enum, INT2FIX(status)), "%s:%d " format, __FILE__, __LINE__, __VA_ARGS__) 267 | #define RAISE_CU_STD_ERROR(status, message) RAISE_CU_STD_ERROR_FORMATTED(status, "%s", message) 268 | // }}} 269 | 270 | // {{{ SGC Ruby data. 271 | static VALUE rb_error_class_by_enum; 272 | // }}} 273 | 274 | 275 | // {{{ CUdevice 276 | 277 | /* call-seq: CUDevice.get_count -> Fixnum 278 | * 279 | * Return the number of CUDA devices. 
280 | */ 281 | static VALUE device_get_count(VALUE klass) 282 | { 283 | int count; 284 | CUresult status = cuDeviceGetCount(&count); 285 | if (status != CUDA_SUCCESS) { 286 | RAISE_CU_STD_ERROR(status, "Failed to get device count."); 287 | } 288 | return INT2FIX(count); 289 | } 290 | 291 | /* call-seq: CUDevice.get(index) -> CUDevice 292 | * 293 | * Return a CUDevice instance corresponding to CUDA device _index_ (0..CUDevice.get_count-1). 294 | */ 295 | static VALUE device_get(VALUE klass, VALUE num) 296 | { 297 | CUdevice* pdev; 298 | VALUE rb_pdev = rb_class_new_instance(0, NULL, rb_cCUDevice); 299 | Data_Get_Struct(rb_pdev, CUdevice, pdev); 300 | int i = FIX2INT(num); 301 | CUresult status = cuDeviceGet(pdev, i); 302 | if (status != CUDA_SUCCESS) { 303 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get device %d.", i); 304 | } 305 | return rb_pdev; 306 | } 307 | 308 | static VALUE device_alloc(VALUE klass) 309 | { 310 | CUdevice* p = new CUdevice; 311 | return Data_Wrap_Struct(klass, 0, generic_free, p); 312 | } 313 | 314 | static VALUE device_initialize(int argc, VALUE* argv, VALUE self) 315 | { 316 | return self; 317 | } 318 | 319 | /* call-seq: dev.get_name -> String 320 | * 321 | * Return the name of _self_ with a maximum of 255 characters. 322 | */ 323 | static VALUE device_get_name(VALUE self) 324 | { 325 | CUdevice* p; 326 | Data_Get_Struct(self, CUdevice, p); 327 | char name[256]; 328 | CUresult status = cuDeviceGetName(name, 256, *p); 329 | if (status != CUDA_SUCCESS) { 330 | RAISE_CU_STD_ERROR(status, "Failed to get device name."); 331 | } 332 | return rb_str_new2(name); 333 | } 334 | 335 | /* call-seq: dev.compute_capability -> Hash { major:, minor: } 336 | * 337 | * Return the compute capability of _self_. 
338 | * 339 | * # For a device with compute capability 1.3: 340 | * dev.compute_capability #=> { major: 1, minor: 3 } 341 | */ 342 | static VALUE device_compute_capability(VALUE self) 343 | { 344 | CUdevice* p; 345 | Data_Get_Struct(self, CUdevice, p); 346 | int major; 347 | int minor; 348 | CUresult status = cuDeviceComputeCapability(&major, &minor, *p); 349 | if (status != CUDA_SUCCESS) { 350 | RAISE_CU_STD_ERROR(status, "Failed to query device compute capability."); 351 | } 352 | VALUE h = rb_hash_new(); 353 | rb_hash_aset(h, ID2SYM(rb_intern("major")), INT2FIX(major)); 354 | rb_hash_aset(h, ID2SYM(rb_intern("minor")), INT2FIX(minor)); 355 | return h; 356 | } 357 | 358 | /* call-seq: dev.get_attribute(attribute) -> Fixnum 359 | * 360 | * Return _attribute_ (CUDeviceAttribute) of _self_. 361 | * 362 | * dev.get_attribute(CUDeviceAttribute::MAX_THREADS_PER_BLOCK) #=> 512 363 | * dev.get_attribute(CUDeviceAttribute::MULTIPROCESSOR_COUNT) #=> 30 364 | * dev.get_attribute(CUDeviceAttribute::MAX_SHARED_MEMORY_PER_BLOCK) #=> 16384 365 | */ 366 | static VALUE device_get_attribute(VALUE self, VALUE attribute) 367 | { 368 | CUdevice* p; 369 | Data_Get_Struct(self, CUdevice, p); 370 | int v; 371 | CUresult status = cuDeviceGetAttribute(&v, static_cast(FIX2INT(attribute)), *p); 372 | if (status != CUDA_SUCCESS) { 373 | VALUE attributes = rb_funcall(rb_cCUDeviceAttribute, rb_intern("constants"), 0); 374 | VALUE ary[3] = { rb_cCUDeviceAttribute, attribute, Qnil }; 375 | rb_block_call(attributes, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 376 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to query device attribute: %s.", rb_id2name(SYM2ID(ary[2]))); 377 | } 378 | return INT2FIX(v); 379 | } 380 | 381 | /* call-seq: dev.get_properties -> Hash 382 | * 383 | * Return the properties of _self_ in a hash with the following keys: 384 | * * :clock_rate 385 | * * :max_grid_size 386 | * * :max_threads_dim 387 | * * :max_threads_per_block 388 | * * 
:mem_pitch 389 | * * :regs_per_block 390 | * * :shared_mem_per_block 391 | * * :simd_width 392 | * * :texture_align 393 | * * :total_constant_memory 394 | */ 395 | static VALUE device_get_properties(VALUE self) 396 | { 397 | CUdevice* pdevice; 398 | Data_Get_Struct(self, CUdevice, pdevice); 399 | CUdevprop prop; 400 | CUresult status = cuDeviceGetProperties(&prop, *pdevice); 401 | if (status != CUDA_SUCCESS) { 402 | RAISE_CU_STD_ERROR(status, "Failed to get device properties."); 403 | } 404 | 405 | VALUE max_grid_size = rb_ary_new3(3, INT2FIX(prop.maxGridSize[0]), INT2FIX(prop.maxGridSize[1]), INT2FIX(prop.maxGridSize[2])); 406 | VALUE max_threads_dim = rb_ary_new3(3, INT2FIX(prop.maxThreadsDim[0]), INT2FIX(prop.maxThreadsDim[1]), INT2FIX(prop.maxThreadsDim[2])); 407 | 408 | VALUE h = rb_hash_new(); 409 | rb_hash_aset(h, ID2SYM(rb_intern("clock_rate")), INT2FIX(prop.clockRate)); 410 | rb_hash_aset(h, ID2SYM(rb_intern("max_grid_size")), max_grid_size); 411 | rb_hash_aset(h, ID2SYM(rb_intern("max_threads_dim")), max_threads_dim); 412 | rb_hash_aset(h, ID2SYM(rb_intern("max_threads_per_block")), INT2FIX(prop.maxThreadsPerBlock)); 413 | rb_hash_aset(h, ID2SYM(rb_intern("mem_pitch")), INT2FIX(prop.memPitch)); 414 | rb_hash_aset(h, ID2SYM(rb_intern("regs_per_block")), INT2FIX(prop.regsPerBlock)); 415 | rb_hash_aset(h, ID2SYM(rb_intern("shared_mem_per_block")), INT2FIX(prop.sharedMemPerBlock)); 416 | rb_hash_aset(h, ID2SYM(rb_intern("simd_width")), INT2FIX(prop.SIMDWidth)); 417 | rb_hash_aset(h, ID2SYM(rb_intern("texture_align")), INT2FIX(prop.textureAlign)); 418 | rb_hash_aset(h, ID2SYM(rb_intern("total_constant_memory")), INT2FIX(prop.totalConstantMemory)); 419 | return h; 420 | } 421 | 422 | /* call-seq: dev.total_mem -> Numeric 423 | * 424 | * Return the total amount of device memory in bytes. 
425 | */ 426 | static VALUE device_total_mem(VALUE self) 427 | { 428 | CUdevice* p; 429 | Data_Get_Struct(self, CUdevice, p); 430 | size_t nbytes; 431 | CUresult status = cuDeviceTotalMem(&nbytes, *p); 432 | if (status != CUDA_SUCCESS) { 433 | RAISE_CU_STD_ERROR(status, "Failed to get device total amount of memory available."); 434 | } 435 | return SIZET2NUM(nbytes); 436 | } 437 | 438 | // }}} 439 | 440 | 441 | // {{{ CUcontext 442 | 443 | static VALUE context_alloc(VALUE klass) 444 | { 445 | CUcontext* p = new CUcontext; 446 | return Data_Wrap_Struct(klass, 0, generic_free, p); 447 | } 448 | 449 | static VALUE context_initialize(int argc, VALUE* argv, VALUE self) 450 | { 451 | return self; 452 | } 453 | 454 | /* call-seq: ctx.create(device) -> self 455 | * ctx.create(flags, device) -> self 456 | * 457 | * Create a new CUDA context with _flags_ (CUContextFlags) and _device_ (CUDevice), 458 | * then associate it with the calling thread, and return the context. 459 | * Setting flags to 0 or ommitting flags uses SCHED_AUTO. 
460 | * 461 | * dev = CUDevice.get(0) 462 | * ctx = CUContext.new 463 | * ctx.create(dev) #=> ctx 464 | * ctx.create(0, dev) #=> ctx 465 | * ctx.create(CUContextFlags::SCHED_SPIN | CUContextFlags::BLOCKING_SYNC, dev) #=> ctx 466 | */ 467 | static VALUE context_create(int argc, VALUE* argv, VALUE self) 468 | { 469 | if (argc <= 0 || argc > 2) { 470 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc); 471 | } 472 | 473 | CUcontext* pcontext; 474 | CUdevice* pdevice; 475 | unsigned int flags = 0; 476 | Data_Get_Struct(self, CUcontext, pcontext); 477 | if (argc == 2) { 478 | flags = FIX2UINT(argv[0]); 479 | Data_Get_Struct(argv[1], CUdevice, pdevice); 480 | } else { // argc == 1 481 | Data_Get_Struct(argv[0], CUdevice, pdevice); 482 | } 483 | CUresult status = cuCtxCreate(pcontext, flags, *pdevice); 484 | if (status != CUDA_SUCCESS) { 485 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create context: flags = 0x%x.", flags); 486 | } 487 | return self; 488 | } 489 | 490 | /* call-seq: ctx.destroy -> nil 491 | * 492 | * Destroy the CUDA context _self_. 493 | */ 494 | static VALUE context_destroy(VALUE self) 495 | { 496 | CUcontext* p; 497 | Data_Get_Struct(self, CUcontext, p); 498 | CUresult status = cuCtxDestroy(*p); 499 | if (status != CUDA_SUCCESS) { 500 | RAISE_CU_STD_ERROR(status, "Failed to destroy context."); 501 | } 502 | return Qnil; 503 | } 504 | 505 | /* call-seq: ctx.attach -> self 506 | * ctx.attach(flags) -> self 507 | * 508 | * Increment the reference count on _self_. 509 | * Currently, _flags_ must be set to 0. 
510 | */ 511 | static VALUE context_attach(int argc, VALUE* argv, VALUE self) 512 | { 513 | CUcontext* p; 514 | unsigned int flags = 0; 515 | Data_Get_Struct(self, CUcontext, p); 516 | if (argc == 1) { 517 | flags = FIX2UINT(argv[0]); 518 | } 519 | CUresult status = cuCtxAttach(p, flags); 520 | if (status != CUDA_SUCCESS) { 521 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to attach context: flags = 0x%x.", flags); 522 | } 523 | return self; 524 | } 525 | 526 | 527 | /* call-seq: ctx.detach -> nil 528 | * 529 | * Decrement the reference count on _self_. 530 | */ 531 | static VALUE context_detach(VALUE self) 532 | { 533 | CUcontext* p; 534 | Data_Get_Struct(self, CUcontext, p); 535 | CUresult status = cuCtxDetach(*p); 536 | if (status != CUDA_SUCCESS) { 537 | RAISE_CU_STD_ERROR(status, "Failed to detach context."); 538 | } 539 | return Qnil; 540 | } 541 | 542 | /* call-seq: ctx.push_current -> self 543 | * 544 | * Push _self_ onto the context stack, which becomes currently active context. 545 | */ 546 | static VALUE context_push_current(VALUE self) 547 | { 548 | CUcontext* p; 549 | Data_Get_Struct(self, CUcontext, p); 550 | CUresult status = cuCtxPushCurrent(*p); 551 | if (status != CUDA_SUCCESS) { 552 | RAISE_CU_STD_ERROR(status, "Failed to push this context."); 553 | } 554 | return self; 555 | } 556 | 557 | /* call-seq: ctx.get_api_version -> Numeric 558 | * 559 | * Return the API version used to create _self_. 560 | */ 561 | static VALUE context_get_api_version(VALUE self) 562 | { 563 | CUcontext* p; 564 | Data_Get_Struct(self, CUcontext, p); 565 | unsigned int version; 566 | CUresult status = cuCtxGetApiVersion(*p, &version); 567 | if (status != CUDA_SUCCESS) { 568 | RAISE_CU_STD_ERROR(status, "Failed to get the API version of this context."); 569 | } 570 | return UINT2NUM(version); 571 | } 572 | 573 | /* call-seq: CUContext.get_api_version -> Numeric 574 | * 575 | * Return the API version used to create current context. 
576 | */ 577 | static VALUE context_get_api_version_singleton(VALUE klass) 578 | { 579 | unsigned int version; 580 | CUresult status = cuCtxGetApiVersion(NULL, &version); 581 | if (status != CUDA_SUCCESS) { 582 | RAISE_CU_STD_ERROR(status, "Failed to get the API version of current context."); 583 | } 584 | return UINT2NUM(version); 585 | } 586 | 587 | /* call-seq: CUContext.get_device -> CUDevice 588 | * 589 | * Return the device associated to the current CUDA context. 590 | */ 591 | static VALUE context_get_device(VALUE klass) 592 | { 593 | VALUE device = rb_class_new_instance(0, NULL, rb_cCUDevice); 594 | CUdevice* pdevice; 595 | Data_Get_Struct(device, CUdevice, pdevice); 596 | CUresult status = cuCtxGetDevice(pdevice); 597 | if (status != CUDA_SUCCESS) { 598 | RAISE_CU_STD_ERROR(status, "Failed to get current context's device."); 599 | } 600 | return device; 601 | } 602 | 603 | /* call-seq: CUContext.get_limit(limit) -> Numeric 604 | * 605 | * Return the _limit_ (CULimit) of the current CUDA context. 606 | * 607 | * CUContext.get_limit(CULimit::STACK_SIZE) #=> 8192 608 | */ 609 | static VALUE context_get_limit(VALUE klass, VALUE limit) 610 | { 611 | CUlimit l = static_cast(FIX2UINT(limit)); 612 | size_t v = 0; 613 | CUresult status = cuCtxGetLimit(&v, l); 614 | if (status != CUDA_SUCCESS) { 615 | VALUE limits = rb_funcall(rb_cCULimit, rb_intern("constants"), 0); 616 | VALUE ary[3] = { rb_cCULimit, limit, Qnil }; 617 | rb_block_call(limits, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 618 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get context limit: %s.", rb_id2name(SYM2ID(ary[2]))); 619 | } 620 | return SIZET2NUM(v); 621 | } 622 | 623 | /* call-seq: CUContext.set_limit(limit, value) -> nil 624 | * 625 | * Set the _limit_ (CULimit) of the current CUDA context. 
626 | * 627 | * CUContext.set_limit(CULimit::STACK_SIZE, 8192) #=> nil 628 | */ 629 | static VALUE context_set_limit(VALUE klass, VALUE limit, VALUE value) 630 | { 631 | CUlimit l = static_cast(FIX2UINT(limit)); 632 | CUresult status = cuCtxSetLimit(l, NUM2SIZET(value)); 633 | if (status != CUDA_SUCCESS) { 634 | VALUE limits = rb_funcall(rb_cCULimit, rb_intern("constants"), 0); 635 | VALUE ary[3] = { rb_cCULimit, limit, Qnil }; 636 | rb_block_call(limits, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 637 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set context limit: %s to %lu.", rb_id2name(SYM2ID(ary[2])), NUM2SIZET(value)); 638 | } 639 | return Qnil; 640 | } 641 | 642 | /* call-seq: CUContext.get_cache_config -> CUFunctionCache 643 | * 644 | * Return the cache config of the current CUDA context. 645 | * 646 | * CUContext.get_cache_config #=> 1 647 | */ 648 | static VALUE context_get_cache_config(VALUE klass) 649 | { 650 | CUfunc_cache config; 651 | CUresult status = cuCtxGetCacheConfig(&config); 652 | if (status != CUDA_SUCCESS) { 653 | RAISE_CU_STD_ERROR(status, "Failed to get context cache config."); 654 | } 655 | return UINT2NUM(static_cast(config)); 656 | } 657 | 658 | /* call-seq: CUContext.set_cache_config(config) -> nil 659 | * 660 | * Set the cache with _config_ (CUFunctionCache) for the current CUDA context. 
661 | * 662 | * CUContext.set_cache_config(CUFunctionCache::PREFER_SHARED) #=> nil 663 | */ 664 | static VALUE context_set_cache_config(VALUE klass, VALUE config) 665 | { 666 | CUresult status = cuCtxSetCacheConfig(static_cast(FIX2UINT(config))); 667 | if (status != CUDA_SUCCESS) { 668 | VALUE configs = rb_funcall(rb_cCUFunctionCache, rb_intern("constants"), 0); 669 | VALUE ary[3] = { rb_cCUFunctionCache, config, Qnil }; 670 | rb_block_call(configs, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 671 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set context cache config: %s.", rb_id2name(SYM2ID(ary[2]))); 672 | } 673 | return Qnil; 674 | } 675 | 676 | /* call-seq: CUContext.pop_current -> CUContext 677 | * 678 | * Pop the current CUDA context from the context stack, which becomes inactive. 679 | */ 680 | static VALUE context_pop_current(VALUE klass) 681 | { 682 | VALUE context = rb_class_new_instance(0, NULL, rb_cCUContext); 683 | CUcontext* pcontext; 684 | Data_Get_Struct(context, CUcontext, pcontext); 685 | CUresult status = cuCtxPopCurrent(pcontext); 686 | if (status != CUDA_SUCCESS) { 687 | RAISE_CU_STD_ERROR(status, "Failed to pop current context."); 688 | } 689 | return context; 690 | } 691 | 692 | /* call-seq: CUContext.synchronize -> nil 693 | * 694 | * Block until all the tasks of the current CUDA context complete. 
695 | */ 696 | static VALUE context_synchronize(VALUE klass) 697 | { 698 | CUresult status = cuCtxSynchronize(); 699 | if (status != CUDA_SUCCESS) { 700 | RAISE_CU_STD_ERROR(status, "Failed to synchronize this context."); 701 | } 702 | return Qnil; 703 | } 704 | 705 | // }}} 706 | 707 | 708 | // {{{ CUmodule 709 | 710 | static VALUE module_alloc(VALUE klass) 711 | { 712 | CUmodule* p = new CUmodule; 713 | return Data_Wrap_Struct(klass, 0, generic_free, p); 714 | } 715 | 716 | static VALUE module_initialize(int argc, VALUE* argv, VALUE self) 717 | { 718 | return self; 719 | } 720 | 721 | /* call-seq: mod.load(path) -> self 722 | * 723 | * Load a compute module from the file at _path_ into the current CUDA context. 724 | * The file should be a cubin file or a PTX file. 725 | * 726 | * A PTX file may be obtained by compiling the .cu file using nvcc with -ptx option. 727 | * $ nvcc -ptx vadd.cu 728 | */ 729 | static VALUE module_load(VALUE self, VALUE str) 730 | { 731 | CUmodule* p; 732 | Data_Get_Struct(self, CUmodule, p); 733 | CUresult status = cuModuleLoad(p, StringValuePtr(str)); 734 | if (status != CUDA_SUCCESS) { 735 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to load module: %s.", StringValuePtr(str)); 736 | } 737 | return self; 738 | } 739 | 740 | /* call-seq: mod.load_data(image_str) -> self 741 | * 742 | * Load a compute module from the String _image_str_ which contains a cubin or a PTX data 743 | * into the current CUDA context. 744 | * 745 | *
See also CUModule#load. 746 | */ 747 | static VALUE module_load_data(VALUE self, VALUE image) 748 | { 749 | CUmodule* p; 750 | Data_Get_Struct(self, CUmodule, p); 751 | CUresult status = cuModuleLoadData(p, StringValuePtr(image)); 752 | if (status != CUDA_SUCCESS) { 753 | RAISE_CU_STD_ERROR(status, "Failed to load module data."); 754 | } 755 | return self; 756 | } 757 | 758 | /* call-seq: mod.unload -> self 759 | * 760 | * Unload _self_ from the current CUDA context. 761 | */ 762 | static VALUE module_unload(VALUE self) 763 | { 764 | CUmodule* p; 765 | Data_Get_Struct(self, CUmodule, p); 766 | CUresult status = cuModuleUnload(*p); 767 | if (status != CUDA_SUCCESS) { 768 | RAISE_CU_STD_ERROR(status, "Failed to unload module."); 769 | } 770 | return self; 771 | } 772 | 773 | /* call-seq: mod.get_function(name_str) -> CUFunction 774 | * 775 | * Return a CUFunction instance corresponding to the function name _name_str_ in the loaded compute module. 776 | * A compute module was loaded with CUModule#load and alike methods. 777 | */ 778 | static VALUE module_get_function(VALUE self, VALUE str) 779 | { 780 | CUmodule* p; 781 | Data_Get_Struct(self, CUmodule, p); 782 | CUfunction* pfunc = new CUfunction; 783 | CUresult status = cuModuleGetFunction(pfunc, *p, StringValuePtr(str)); 784 | if (status != CUDA_SUCCESS) { 785 | delete pfunc; 786 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module function: %s.", StringValuePtr(str)); 787 | } 788 | return Data_Wrap_Struct(rb_cCUFunction, 0, generic_free, pfunc); 789 | } 790 | 791 | /* call-seq: mod.get_global(name_str) -> [CUDevicePtr, Numeric] 792 | * 793 | * Return the CUDevicePtr corresponding to the global variable in the loaded compute module and its size in bytes. 
794 | */ 795 | static VALUE module_get_global(VALUE self, VALUE str) 796 | { 797 | CUmodule* p; 798 | Data_Get_Struct(self, CUmodule, p); 799 | VALUE rb_devptr = device_ptr_alloc(rb_cCUDevicePtr); 800 | device_ptr_initialize(0, NULL, rb_devptr); 801 | CUdeviceptr* pdevptr; 802 | Data_Get_Struct(rb_devptr, CUdeviceptr, pdevptr); 803 | size_t nbytes; 804 | CUresult status = cuModuleGetGlobal(pdevptr, &nbytes, *p, StringValuePtr(str)); 805 | if (status != CUDA_SUCCESS) { 806 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module global: %s.", StringValuePtr(str)); 807 | } 808 | return rb_ary_new3(2, rb_devptr, SIZET2NUM(nbytes)); 809 | } 810 | 811 | /* call-seq: mod.get_texref(name_str) -> CUTexRef 812 | * 813 | * Return a CUTexRef instance corresponding to the texture name _name_str_ in the loaded compute module. 814 | */ 815 | static VALUE module_get_texref(VALUE self, VALUE str) 816 | { 817 | CUmodule* pmodule; 818 | CUtexref* ptexref; 819 | Data_Get_Struct(self, CUmodule, pmodule); 820 | VALUE rb_texref = rb_class_new_instance(0, NULL, rb_cCUTexRef); 821 | Data_Get_Struct(rb_texref, CUtexref, ptexref); 822 | CUresult status = cuModuleGetTexRef(ptexref, *pmodule, StringValuePtr(str)); 823 | if (status != CUDA_SUCCESS) { 824 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module texture reference: %s.", StringValuePtr(str)); 825 | } 826 | return rb_texref; 827 | } 828 | 829 | // }}} 830 | 831 | 832 | // {{{ CUdeviceptr 833 | 834 | static VALUE device_ptr_alloc(VALUE klass) 835 | { 836 | CUdeviceptr* p = new CUdeviceptr; 837 | return Data_Wrap_Struct(klass, 0, generic_free, p); 838 | } 839 | 840 | static VALUE device_ptr_initialize(int argc, VALUE* argv, VALUE self) 841 | { 842 | CUdeviceptr* p; 843 | Data_Get_Struct(self, CUdeviceptr, p); 844 | *p = static_cast(0); 845 | return self; 846 | } 847 | 848 | /* call-seq: devptr.offset(offset) -> CUDevicePtr 849 | * 850 | * Return a CUDevicePtr instance pointing to the memory location _offset_ (bytes) from 
_self_. 851 | */ 852 | static VALUE device_ptr_offset(VALUE self, VALUE offset) 853 | { 854 | CUdeviceptr* pdevptr; 855 | CUdeviceptr* pdevptr_offset; 856 | Data_Get_Struct(self, CUdeviceptr, pdevptr); 857 | VALUE rb_pdevptr_offset = rb_class_new_instance(0, NULL, rb_cCUDevicePtr); 858 | Data_Get_Struct(rb_pdevptr_offset, CUdeviceptr, pdevptr_offset); 859 | *pdevptr_offset = *pdevptr + NUM2UINT(offset); 860 | return rb_pdevptr_offset; 861 | } 862 | 863 | /* call-seq: devptr.mem_alloc(nbytes) -> self 864 | * 865 | * Allocate _nbytes_ device memory and let _self_ points to this allocated memory. 866 | */ 867 | static VALUE device_ptr_mem_alloc(VALUE self, VALUE nbytes) 868 | { 869 | CUdeviceptr* p; 870 | Data_Get_Struct(self, CUdeviceptr, p); 871 | CUresult status = cuMemAlloc(p, NUM2UINT(nbytes)); 872 | if (status != CUDA_SUCCESS) { 873 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to allocate memory: size = %u.", NUM2UINT(nbytes)); 874 | } 875 | return self; 876 | } 877 | 878 | /* call-seq: devptr.mem_free -> self 879 | * 880 | * Free the allocated device memory _self_ pointing to. 881 | */ 882 | static VALUE device_ptr_mem_free(VALUE self) 883 | { 884 | CUdeviceptr* p; 885 | Data_Get_Struct(self, CUdeviceptr, p); 886 | CUresult status = cuMemFree(*p); 887 | if (status != CUDA_SUCCESS) { 888 | RAISE_CU_STD_ERROR(status, "Failed to free memory."); 889 | } 890 | return self; 891 | } 892 | 893 | // }}} 894 | 895 | 896 | // {{{ CUfunction 897 | 898 | static VALUE function_alloc(VALUE klass) 899 | { 900 | CUfunction* p = new CUfunction; 901 | return Data_Wrap_Struct(klass, 0, generic_free, p); 902 | } 903 | 904 | static VALUE function_initialize(int argc, VALUE* argv, VALUE self) 905 | { 906 | return self; 907 | } 908 | 909 | /* call-seq: func.set_param(arg1, arg2, *other_args) -> self 910 | * 911 | * Set the argument list of _self_ to _arg1_, _arg2_, *other_args. 
912 | */ 913 | static VALUE function_set_param(int argc, VALUE* argv, VALUE self) 914 | { 915 | #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1) 916 | 917 | int offset = 0; 918 | CUfunction* pfunc; 919 | Data_Get_Struct(self, CUfunction, pfunc); 920 | 921 | CUresult status = CUDA_ERROR_UNKNOWN; 922 | for (int i = 0; i < argc; ++i) { 923 | if (CLASS_OF(argv[i]) == rb_cCUDevicePtr) { 924 | CUdeviceptr* p; 925 | Data_Get_Struct(argv[i], CUdeviceptr, p); 926 | ALIGN_UP(offset, __alignof(*p)); 927 | status = cuParamSetv(*pfunc, offset, p, sizeof(*p)); 928 | if (status != CUDA_SUCCESS) break; 929 | offset += sizeof(*p); 930 | } else if (CLASS_OF(argv[i]) == rb_cFixnum) { 931 | int num = FIX2INT(argv[i]); 932 | ALIGN_UP(offset, __alignof(num)); 933 | status = cuParamSeti(*pfunc, offset, num); 934 | if (status != CUDA_SUCCESS) break; 935 | offset += sizeof(int); 936 | } else if (CLASS_OF(argv[i]) == rb_cFloat) { 937 | float num = static_cast(NUM2DBL(argv[i])); 938 | ALIGN_UP(offset, __alignof(num)); 939 | status = cuParamSetf(*pfunc, offset, num); 940 | if (status != CUDA_SUCCESS) break; 941 | offset += sizeof(float); 942 | } else { 943 | rb_raise(rb_eArgError, "Invalid type of argument %d.", i+1); 944 | } 945 | } 946 | if (argc > 0 && status != CUDA_SUCCESS) { 947 | RAISE_CU_STD_ERROR(status, "Failed to set function parameters."); 948 | } 949 | 950 | status = cuParamSetSize(*pfunc, offset); 951 | if (status != CUDA_SUCCESS) { 952 | RAISE_CU_STD_ERROR(status, "Failed to set function parameter size."); 953 | } 954 | return self; 955 | } 956 | 957 | /* call-seq: func.set_texref(texref) -> self 958 | * 959 | * Add the _texref_ to the argument list of _self_. 960 | * 961 | * Note: This method is *deprecated*. This is no longer necessary. 
962 | */ 963 | static VALUE function_set_texref(VALUE self, VALUE texref) 964 | { 965 | rb_warn("CUFunction#set_texref is deprecated."); 966 | CUfunction* pfunc; 967 | CUtexref* ptexref; 968 | Data_Get_Struct(self, CUfunction, pfunc); 969 | Data_Get_Struct(texref, CUtexref, ptexref); 970 | CUresult status = cuParamSetTexRef(*pfunc, CU_PARAM_TR_DEFAULT, *ptexref); 971 | if (status != CUDA_SUCCESS) { 972 | RAISE_CU_STD_ERROR(status, "Failed to set function texture reference."); 973 | } 974 | return self; 975 | } 976 | 977 | /* call-seq: func.set_block_shape(xdim) -> self 978 | * func.set_block_shape(xdim, ydim) -> self 979 | * func.set_block_shape(xdim, ydim, zdim) -> self 980 | * 981 | * Set the block dimensions to use for next launch. _ydim_ and _zdim_ which may be omitted are default to 1. 982 | */ 983 | static VALUE function_set_block_shape(int argc, VALUE* argv, VALUE self) 984 | { 985 | if (argc <= 0 || argc > 3) { 986 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 to 3 integers).", argc); 987 | } 988 | 989 | CUfunction* pfunc; 990 | Data_Get_Struct(self, CUfunction, pfunc); 991 | 992 | int xdim = FIX2INT(argv[0]); 993 | int ydim = 1; 994 | int zdim = 1; 995 | 996 | if (argc >= 2) { 997 | ydim = FIX2INT(argv[1]); 998 | } 999 | if (argc >= 3) { 1000 | zdim = FIX2INT(argv[2]); 1001 | } 1002 | 1003 | CUresult status = cuFuncSetBlockShape(*pfunc, xdim, ydim, zdim); 1004 | if (status != CUDA_SUCCESS) { 1005 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function block shape: (x,y,z) = (%d,%d,%d).", xdim, ydim, zdim); 1006 | } 1007 | return self; 1008 | } 1009 | 1010 | /* call-seq: func.set_shared_size(nbytes) -> self 1011 | * 1012 | * Set the dynamic shared-memory size to use for next launch. 
1013 | */ 1014 | static VALUE function_set_shared_size(VALUE self, VALUE nbytes) 1015 | { 1016 | CUfunction* p; 1017 | Data_Get_Struct(self, CUfunction, p); 1018 | CUresult status = cuFuncSetSharedSize(*p, NUM2UINT(nbytes)); 1019 | if (status != CUDA_SUCCESS) { 1020 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function shared memory size: %u.", NUM2UINT(nbytes)); 1021 | } 1022 | return self; 1023 | } 1024 | 1025 | /* call-seq: func.launch -> self 1026 | * 1027 | * Launch _self_ to execute on a CUDA device. 1028 | */ 1029 | static VALUE function_launch(VALUE self) 1030 | { 1031 | CUfunction* p; 1032 | Data_Get_Struct(self, CUfunction, p); 1033 | CUresult status = cuLaunch(*p); 1034 | if (status != CUDA_SUCCESS) { 1035 | RAISE_CU_STD_ERROR(status, "Failed to launch kernel function on 1x1x1 grid of blocks."); 1036 | } 1037 | return self; 1038 | } 1039 | 1040 | /* call-seq: func.launch_grid(xdim) -> self 1041 | * func.launch_grid(xdim, ydim) -> self 1042 | * 1043 | * Launch _self_ with grid dimensions (xdim, ydim) to execute on a CUDA device. 1044 | * _ydim_ which may be omitted is default to 1. 
1045 | */ 1046 | static VALUE function_launch_grid(int argc, VALUE* argv, VALUE self) 1047 | { 1048 | if (argc <= 0 || argc > 2) { 1049 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2 integers).", argc); 1050 | } 1051 | 1052 | CUfunction* pfunc; 1053 | Data_Get_Struct(self, CUfunction, pfunc); 1054 | 1055 | int xdim = FIX2INT(argv[0]); 1056 | int ydim = 1; 1057 | 1058 | if (argc >= 2) { 1059 | ydim = FIX2INT(argv[1]); 1060 | } 1061 | 1062 | CUresult status = cuLaunchGrid(*pfunc, xdim, ydim); 1063 | if (status != CUDA_SUCCESS) { 1064 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to launch kernel function on %dx%d grid of blocks.", xdim, ydim); 1065 | } 1066 | return self; 1067 | } 1068 | 1069 | /* call-seq: func.launch_grid_async(xdim, stream) -> self 1070 | * func.launch_grid_async(xdim, ydim, stream) -> self 1071 | * 1072 | * Launch _self_ with grid dimensions (xdim, ydim) on _stream_ asynchronously to execute on a CUDA device. 1073 | * _ydim_ which may be omitted is default to 1. Setting _stream_ to anything other than an instance of CUStream 1074 | * will execute on the default stream 0. 
1075 | */ 1076 | static VALUE function_launch_grid_async(int argc, VALUE* argv, VALUE self) 1077 | { 1078 | if (argc < 2 || argc > 3) { 1079 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 2 or 3).", argc); 1080 | } 1081 | 1082 | CUfunction* pfunc; 1083 | CUstream *pstream = NULL; 1084 | CUstream stream0 = 0; 1085 | Data_Get_Struct(self, CUfunction, pfunc); 1086 | 1087 | int xdim = FIX2INT(argv[0]); 1088 | int ydim = 1; 1089 | 1090 | if (argc == 2) { 1091 | if (CLASS_OF(argv[1]) == rb_cCUStream) { 1092 | Data_Get_Struct(argv[1], CUstream, pstream); 1093 | } else { 1094 | pstream = &stream0; 1095 | } 1096 | } else if (argc == 3) { 1097 | ydim = FIX2INT(argv[1]); 1098 | if (CLASS_OF(argv[2]) == rb_cCUStream) { 1099 | Data_Get_Struct(argv[2], CUstream, pstream); 1100 | } else { 1101 | pstream = &stream0; 1102 | } 1103 | } 1104 | 1105 | CUresult status = cuLaunchGridAsync(*pfunc, xdim, ydim, *pstream); 1106 | if (status != CUDA_SUCCESS) { 1107 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to launch kernel function asynchronously on %dx%d grid of blocks.", xdim, ydim); 1108 | } 1109 | return self; 1110 | } 1111 | 1112 | /* call-seq: func.get_attribute(attribute) -> Fixnum 1113 | * 1114 | * Return _attribute_ (CUFunctionAttribute) of _self_. 
1115 | * 1116 | * func.get_attribute(CUFunctionAttribute::MAX_THREADS_PER_BLOCK) #=> 512 1117 | * func.get_attribute(CUFunctionAttribute::SHARED_SIZE_BYTES) #=> 44 1118 | * func.get_attribute(CUFunctionAttribute::NUM_REGS) #=> 3 1119 | */ 1120 | static VALUE function_get_attribute(VALUE self, VALUE attribute) 1121 | { 1122 | CUfunction* p; 1123 | Data_Get_Struct(self, CUfunction, p); 1124 | int v; 1125 | CUresult status = cuFuncGetAttribute(&v, static_cast(FIX2INT(attribute)), *p); 1126 | if (status != CUDA_SUCCESS) { 1127 | VALUE attributes = rb_funcall(rb_cCUFunctionAttribute, rb_intern("constants"), 0); 1128 | VALUE ary[3] = { rb_cCUFunctionAttribute, attribute, Qnil }; 1129 | rb_block_call(attributes, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 1130 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to query function attribute: %s.", rb_id2name(SYM2ID(ary[2]))); 1131 | } 1132 | return INT2FIX(v); 1133 | } 1134 | 1135 | /* call-seq: func.set_cache_config(config) -> self 1136 | * 1137 | * Set the preferred cache configuration (CUFunctionCache) to use for next launch. 
1138 | */ 1139 | static VALUE function_set_cache_config(VALUE self, VALUE config) 1140 | { 1141 | CUfunction* p; 1142 | Data_Get_Struct(self, CUfunction, p); 1143 | CUresult status = cuFuncSetCacheConfig(*p, static_cast(FIX2UINT(config))); 1144 | if (status != CUDA_SUCCESS) { 1145 | VALUE configs = rb_funcall(rb_cCUFunctionCache, rb_intern("constants"), 0); 1146 | VALUE ary[3] = { rb_cCUFunctionCache, config, Qnil }; 1147 | rb_block_call(configs, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary); 1148 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function cache config: %s.", rb_id2name(SYM2ID(ary[2]))); 1149 | } 1150 | return self; 1151 | } 1152 | 1153 | // }}} 1154 | 1155 | 1156 | // {{{ CUstream 1157 | 1158 | static VALUE stream_alloc(VALUE klass) 1159 | { 1160 | CUstream* p = new CUstream; 1161 | return Data_Wrap_Struct(klass, 0, generic_free, p); 1162 | } 1163 | 1164 | static VALUE stream_initialize(VALUE self) 1165 | { 1166 | return self; 1167 | } 1168 | 1169 | /* call-seq: stream.create -> self 1170 | * stream.create(flags) -> self 1171 | * 1172 | * Create a stream and set _self_ to this stream. Currently, _flags_ must be set to 0. 1173 | */ 1174 | static VALUE stream_create(int argc, VALUE* argv, VALUE self) 1175 | { 1176 | if (argc < 0 || argc > 1) { 1177 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 0 or 1).", argc); 1178 | } 1179 | 1180 | CUstream* p; 1181 | unsigned int flags = 0; 1182 | Data_Get_Struct(self, CUstream, p); 1183 | if (argc == 1) { 1184 | flags = FIX2UINT(argv[0]); 1185 | } 1186 | CUresult status = cuStreamCreate(p, flags); 1187 | if (status != CUDA_SUCCESS) { 1188 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create stream: flags = 0x%x", flags); 1189 | } 1190 | return self; 1191 | } 1192 | 1193 | /* call-seq: stream.destroy -> nil 1194 | * 1195 | * Destroy the stream _self_. 
1196 | */ 1197 | static VALUE stream_destroy(VALUE self) 1198 | { 1199 | CUstream* p; 1200 | Data_Get_Struct(self, CUstream, p); 1201 | CUresult status = cuStreamDestroy(*p); 1202 | if (status != CUDA_SUCCESS) { 1203 | RAISE_CU_STD_ERROR(status, "Failed to destroy stream."); 1204 | } 1205 | return Qnil; 1206 | } 1207 | 1208 | /* call-seq: stream.query -> true or false 1209 | * 1210 | * Return true if all operations in _self_ have completed. Otherwise, return false. 1211 | */ 1212 | static VALUE stream_query(VALUE self) 1213 | { 1214 | CUstream* p; 1215 | Data_Get_Struct(self, CUstream, p); 1216 | CUresult status = cuStreamQuery(*p); 1217 | if (status == CUDA_SUCCESS) { 1218 | return Qtrue; 1219 | } else if (status == CUDA_ERROR_NOT_READY) { 1220 | return Qfalse; 1221 | } else { 1222 | RAISE_CU_STD_ERROR(status, "Failed to query stream."); 1223 | } 1224 | } 1225 | 1226 | /* call-seq: stream.synchronize -> self 1227 | * 1228 | * Block until all operations in _self_ complete. 1229 | */ 1230 | static VALUE stream_synchronize(VALUE self) 1231 | { 1232 | CUstream* p; 1233 | Data_Get_Struct(self, CUstream, p); 1234 | CUresult status = cuStreamSynchronize(*p); 1235 | if (status != CUDA_SUCCESS) { 1236 | RAISE_CU_STD_ERROR(status, "Failed to synchronize stream."); 1237 | } 1238 | return self; 1239 | } 1240 | 1241 | /* call-seq: stream.wait_event(event) -> self 1242 | * stream.wait_event(event, flags) -> self 1243 | * 1244 | * Let all future operations submitted to _self_ wait until _event_ (CUEvent) complete before beginning execution. 1245 | * Currently, _flags_ must be 0. 
1246 | */ 1247 | static VALUE stream_wait_event(int argc, VALUE* argv, VALUE self) 1248 | { 1249 | if (argc <= 0 || argc > 2) { 1250 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc); 1251 | } 1252 | 1253 | CUstream* pstream; 1254 | CUevent* pevent; 1255 | unsigned int flags = 0; 1256 | Data_Get_Struct(self, CUstream, pstream); 1257 | Data_Get_Struct(argv[0], CUevent, pevent); 1258 | if (argc == 2) { 1259 | flags = FIX2UINT(argv[1]); 1260 | } 1261 | CUresult status = cuStreamWaitEvent(*pstream, *pevent, flags); 1262 | if (status != CUDA_SUCCESS) { 1263 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to make stream's future operations to wait event: flags = 0x%x", flags); 1264 | } 1265 | return self; 1266 | } 1267 | 1268 | /* call-seq: CUStream.wait_event(event) -> nil 1269 | * CUStream.wait_event(event, flags) -> nil 1270 | * 1271 | * Let all future operations submitted to stream 0 (NULL stream) wait until _event_ (CUEvent) complete before beginning execution. 1272 | * Currently, _flags_ must be 0. 
1273 | */ 1274 | static VALUE stream_wait_event_singleton(int argc, VALUE* argv, VALUE klass) 1275 | { 1276 | if (argc <= 0 || argc > 2) { 1277 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc); 1278 | } 1279 | 1280 | CUevent* pevent; 1281 | unsigned int flags = 0; 1282 | Data_Get_Struct(argv[0], CUevent, pevent); 1283 | if (argc == 2) { 1284 | flags = FIX2UINT(argv[1]); 1285 | } 1286 | CUresult status = cuStreamWaitEvent(0, *pevent, flags); 1287 | if (status != CUDA_SUCCESS) { 1288 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to make current stream's future operations to wait event: flags = 0x%x", flags); 1289 | } 1290 | return Qnil; 1291 | } 1292 | 1293 | // }}} 1294 | 1295 | 1296 | // {{{ CUevent 1297 | 1298 | static VALUE event_alloc(VALUE klass) 1299 | { 1300 | CUevent* p = new CUevent; 1301 | return Data_Wrap_Struct(klass, 0, generic_free, p); 1302 | } 1303 | 1304 | static VALUE event_initialize(VALUE self) 1305 | { 1306 | return self; 1307 | } 1308 | 1309 | /* call-seq: event.create -> self 1310 | * event.create(flags) -> self 1311 | * 1312 | * Create an event with _flags_ (CUEventFlags) and set _self_ to this event. 1313 | * The _flags_ is default to CUEventFlags::DEFAULT. 
1314 | * 1315 | * event.create #=> self 1316 | * event.create(CUEventFlags::DEFAULT) #=> self 1317 | * event.create(CUEventFlags::BLOCKING_SYNC) #=> self 1318 | */ 1319 | static VALUE event_create(int argc, VALUE* argv, VALUE self) 1320 | { 1321 | if (argc < 0 || argc > 1) { 1322 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 0 or 1).", argc); 1323 | } 1324 | 1325 | CUevent* p; 1326 | unsigned int flags = CU_EVENT_DEFAULT; 1327 | Data_Get_Struct(self, CUevent, p); 1328 | if (argc == 1) { 1329 | flags = FIX2UINT(argv[0]); 1330 | } 1331 | CUresult status = cuEventCreate(p, flags); 1332 | if (status != CUDA_SUCCESS) { 1333 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create event: flags = 0x%x.", flags); 1334 | } 1335 | return self; 1336 | } 1337 | 1338 | /* call-seq: event.destroy -> nil 1339 | * 1340 | * Destroy the event _self_. 1341 | */ 1342 | static VALUE event_destroy(VALUE self) 1343 | { 1344 | CUevent* p; 1345 | Data_Get_Struct(self, CUevent, p); 1346 | CUresult status = cuEventDestroy(*p); 1347 | if (status != CUDA_SUCCESS) { 1348 | RAISE_CU_STD_ERROR(status, "Failed to destroy event."); 1349 | } 1350 | return Qnil; 1351 | } 1352 | 1353 | /* call-seq: event.query -> true or false 1354 | * 1355 | * Return true if _self_ has been recorded. Otherwise, return false. 1356 | */ 1357 | static VALUE event_query(VALUE self) 1358 | { 1359 | CUevent* p; 1360 | Data_Get_Struct(self, CUevent, p); 1361 | CUresult status = cuEventQuery(*p); 1362 | if (status == CUDA_SUCCESS) { 1363 | return Qtrue; 1364 | } else if (status == CUDA_ERROR_NOT_READY) { 1365 | return Qfalse; 1366 | } else if (status == CUDA_ERROR_INVALID_VALUE) { 1367 | RAISE_CU_STD_ERROR(status, "Failed to query event: cuEventRecord() has not been called on this event."); 1368 | } else { 1369 | RAISE_CU_STD_ERROR(status, "Failed to query event."); 1370 | } 1371 | } 1372 | 1373 | /* call-seq: event.record(stream) -> self 1374 | * 1375 | * Record event _self_ asynchronously in _stream_. 
1376 | * Setting _stream_ to anything other than an instance of CUStream will record on the default stream 0. 1377 | */ 1378 | static VALUE event_record(VALUE self, VALUE rb_stream) 1379 | { 1380 | CUevent* pevent = NULL; 1381 | CUstream* pstream = NULL; 1382 | CUresult status; 1383 | Data_Get_Struct(self, CUevent, pevent); 1384 | if (CLASS_OF(rb_stream) == rb_cCUStream) { 1385 | Data_Get_Struct(rb_stream, CUstream, pstream); 1386 | status = cuEventRecord(*pevent, *pstream); 1387 | } else { 1388 | status = cuEventRecord(*pevent, 0); 1389 | } 1390 | if (status == CUDA_ERROR_INVALID_VALUE) { 1391 | RAISE_CU_STD_ERROR(status, "Failed to record event: cuEventRecord() has been called and has not been recorded yet."); 1392 | } else if (status != CUDA_SUCCESS) { 1393 | RAISE_CU_STD_ERROR(status, "Failed to record event."); 1394 | } 1395 | return self; 1396 | } 1397 | 1398 | /* call-seq: event.synchronize -> self 1399 | * 1400 | * Block until _self_ has been recorded. 1401 | */ 1402 | static VALUE event_synchronize(VALUE self) 1403 | { 1404 | CUevent* p; 1405 | Data_Get_Struct(self, CUevent, p); 1406 | CUresult status = cuEventSynchronize(*p); 1407 | // TODO: Handle status == CUDA_ERROR_INVALID_VALUE 1408 | if (status != CUDA_SUCCESS) { 1409 | RAISE_CU_STD_ERROR(status, "Failed to synchronize event."); 1410 | } 1411 | return self; 1412 | } 1413 | 1414 | /* call-seq: event.elapsed_time(event_start, event_end) -> Numeric 1415 | * 1416 | * Return the elapsed time (ms) from _event_start_ (CUEvent) to _event_end_ (CUEvent). 
1417 | */ 1418 | static VALUE event_elapsed_time(VALUE klass, VALUE event_start, VALUE event_end) 1419 | { 1420 | CUevent* pevent_start; 1421 | CUevent* pevent_end; 1422 | Data_Get_Struct(event_start, CUevent, pevent_start); 1423 | Data_Get_Struct(event_end, CUevent, pevent_end); 1424 | float etime; 1425 | CUresult status = cuEventElapsedTime(&etime, *pevent_start, *pevent_end); 1426 | if (status == CUDA_ERROR_NOT_READY) { 1427 | RAISE_CU_STD_ERROR(status, "Failed to get elapsed time of events: either event has not been recorded yet."); 1428 | } else if (status != CUDA_SUCCESS) { 1429 | RAISE_CU_STD_ERROR(status, "Failed to get elapsed time of events."); 1430 | } 1431 | return DBL2NUM(etime); 1432 | } 1433 | 1434 | // }}} 1435 | 1436 | 1437 | // {{{ CUtexref 1438 | 1439 | static VALUE texref_alloc(VALUE klass) 1440 | { 1441 | CUtexref* p = new CUtexref; 1442 | return Data_Wrap_Struct(klass, 0, generic_free, p); 1443 | } 1444 | 1445 | static VALUE texref_initialize(VALUE self) 1446 | { 1447 | return self; 1448 | } 1449 | 1450 | /* call-seq: texref.create -> self 1451 | * 1452 | * Create a texture reference and set _self_ to this texture reference. 1453 | * 1454 | * Note: This method is *deprecated*. 1455 | */ 1456 | static VALUE texref_create(VALUE self) 1457 | { 1458 | rb_warn("CUTexRef#create is deprecated."); 1459 | CUtexref* p; 1460 | Data_Get_Struct(self, CUtexref, p); 1461 | CUresult status = cuTexRefCreate(p); 1462 | if (status != CUDA_SUCCESS) { 1463 | RAISE_CU_STD_ERROR(status, "Failed to create texture."); 1464 | } 1465 | return self; 1466 | } 1467 | 1468 | /* call-seq: texref.destroy -> nil 1469 | * 1470 | * Destroy the texture reference _self_. 1471 | * 1472 | * Note: This method is *deprecated*. 
1473 | */ 1474 | static VALUE texref_destroy(VALUE self) 1475 | { 1476 | rb_warn("CUTexRef#destroy is deprecated."); 1477 | CUtexref* p; 1478 | Data_Get_Struct(self, CUtexref, p); 1479 | CUresult status = cuTexRefDestroy(*p); 1480 | if (status != CUDA_SUCCESS) { 1481 | RAISE_CU_STD_ERROR(status, "Failed to destroy texture."); 1482 | } 1483 | return Qnil; 1484 | } 1485 | 1486 | /* call-seq: texref.get_address -> CUDevicePtr 1487 | * 1488 | * Return a CUDevicePtr instance bound to the texture reference. 1489 | */ 1490 | static VALUE texref_get_address(VALUE self) 1491 | { 1492 | CUtexref* ptexref; 1493 | CUdeviceptr* pdevptr; 1494 | Data_Get_Struct(self, CUtexref, ptexref); 1495 | VALUE rb_devptr = rb_class_new_instance(0, NULL, rb_cCUDevicePtr); 1496 | Data_Get_Struct(rb_devptr, CUdeviceptr, pdevptr); 1497 | CUresult status = cuTexRefGetAddress(pdevptr, *ptexref); 1498 | if (status != CUDA_SUCCESS) { 1499 | RAISE_CU_STD_ERROR(status, "Failed to get texture address."); 1500 | } 1501 | return rb_devptr; 1502 | } 1503 | 1504 | /* call-seq: texref.get_address_mode(dim) -> Fixnum 1505 | * 1506 | * Return the address mode of the dimension _dim_ (0..2) of _self_. 1507 | */ 1508 | static VALUE texref_get_address_mode(VALUE self, VALUE dim) 1509 | { 1510 | CUtexref* p; 1511 | CUaddress_mode mode; 1512 | Data_Get_Struct(self, CUtexref, p); 1513 | CUresult status = cuTexRefGetAddressMode(&mode, *p, FIX2INT(dim)); 1514 | if (status != CUDA_SUCCESS) { 1515 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get texture address mode: dim = %d.", FIX2INT(dim)); 1516 | } 1517 | return INT2FIX(mode); 1518 | } 1519 | 1520 | /* call-seq: texref.get_filter_mode -> Fixnum 1521 | * 1522 | * Return the filter mode of _self_. 
1523 | */ 1524 | static VALUE texref_get_filter_mode(VALUE self) 1525 | { 1526 | CUtexref* p; 1527 | CUfilter_mode mode; 1528 | Data_Get_Struct(self, CUtexref, p); 1529 | CUresult status = cuTexRefGetFilterMode(&mode, *p); 1530 | if (status != CUDA_SUCCESS) { 1531 | RAISE_CU_STD_ERROR(status, "Failed to get texture filter mode."); 1532 | } 1533 | return INT2FIX(mode); 1534 | } 1535 | 1536 | /* call-seq: texref.get_flags -> Numeric 1537 | * 1538 | * Return the flags of _self_. 1539 | */ 1540 | static VALUE texref_get_flags(VALUE self) 1541 | { 1542 | CUtexref* p; 1543 | unsigned int flags; 1544 | Data_Get_Struct(self, CUtexref, p); 1545 | CUresult status = cuTexRefGetFlags(&flags, *p); 1546 | if (status != CUDA_SUCCESS) { 1547 | RAISE_CU_STD_ERROR(status, "Failed to get texture flags."); 1548 | } 1549 | return UINT2NUM(flags); 1550 | } 1551 | 1552 | /* call-seq: texref.set_address(devptr, nbytes) -> Numeric 1553 | * 1554 | * Bind _devptr_ (CUDevicePtr) with _nbytes_ to _self_. 1555 | */ 1556 | static VALUE texref_set_address(VALUE self, VALUE rb_device_ptr, VALUE nbytes) 1557 | { 1558 | CUtexref* ptexref; 1559 | CUdeviceptr* pdevptr; 1560 | size_t offset; 1561 | Data_Get_Struct(self, CUtexref, ptexref); 1562 | Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevptr); 1563 | CUresult status = cuTexRefSetAddress(&offset, *ptexref, *pdevptr, NUM2UINT(nbytes)); 1564 | if (status != CUDA_SUCCESS) { 1565 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture address: nbytes = %u.", NUM2UINT(nbytes)); 1566 | } 1567 | return SIZET2NUM(offset); 1568 | } 1569 | 1570 | /* call-seq: texref.set_address_mode(dim, mode) -> self 1571 | * 1572 | * Set the address mode of _self_ with _dim_ (0..2) and _mode_ (CUAddressMode). 
1573 | */ 1574 | static VALUE texref_set_address_mode(VALUE self, VALUE dim, VALUE mode) 1575 | { 1576 | CUtexref* p; 1577 | Data_Get_Struct(self, CUtexref, p); 1578 | CUresult status = cuTexRefSetAddressMode(*p, FIX2INT(dim), static_cast(FIX2INT(mode))); 1579 | if (status != CUDA_SUCCESS) { 1580 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture address mode: dim = %d, mode = %d", FIX2INT(dim), FIX2INT(mode)); 1581 | } 1582 | return self; 1583 | } 1584 | 1585 | /* call-seq: texref.set_filter_mode(mode) -> self 1586 | * 1587 | * Set the filter mode of _self_ with _mode_ (CUFilterMode). 1588 | */ 1589 | static VALUE texref_set_filter_mode(VALUE self, VALUE mode) 1590 | { 1591 | CUtexref* p; 1592 | Data_Get_Struct(self, CUtexref, p); 1593 | CUresult status = cuTexRefSetFilterMode(*p, static_cast(FIX2INT(mode))); 1594 | if (status != CUDA_SUCCESS) { 1595 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture filter mode: mode = %d.", FIX2INT(mode)); 1596 | } 1597 | return self; 1598 | } 1599 | 1600 | /* call-seq: texref.set_flags(flags) -> self 1601 | * 1602 | * Set the _flags_ (CUTexRefFlags) of _self_. 
1603 | */ 1604 | static VALUE texref_set_flags(VALUE self, VALUE flags) 1605 | { 1606 | CUtexref* p; 1607 | Data_Get_Struct(self, CUtexref, p); 1608 | CUresult status = cuTexRefSetFlags(*p, NUM2UINT(flags)); 1609 | if (status != CUDA_SUCCESS) { 1610 | RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture flags: flags = 0x%x.", NUM2UINT(flags)); 1611 | } 1612 | return self; 1613 | } 1614 | 1615 | // }}} 1616 | 1617 | 1618 | // {{{ Memory pointer 1619 | static VALUE memory_pointer_alloc(VALUE klass) 1620 | { 1621 | MemoryPointer* ppointer = new MemoryPointer; 1622 | ppointer->p = NULL; 1623 | return Data_Wrap_Struct(klass, 0, generic_free, ppointer); 1624 | } 1625 | 1626 | static VALUE memory_pointer_initialize(VALUE self) 1627 | { 1628 | return self; 1629 | } 1630 | // }}} 1631 | 1632 | 1633 | // {{{ Buffer 1634 | 1635 | /* call-seq: Buffer.new(size, options = {}) -> Buffer 1636 | * 1637 | * Create a buffer with _size_ elements. 1638 | * 1639 | * Options: 1640 | * * _page_locked_ - Allocate page-locked memory if _:page_locked_ is true. Otherwise, allocate pageable memory. 1641 | * 1642 | * Buffer.new(10) # Allocate 10 elements with pageable memory. 1643 | * Buffer.new(20, page_locked: true) # Allocate 20 elements with page-locked memory. 1644 | */ 1645 | static VALUE ibuffer_initialize(int argc, VALUE* argv, VALUE self) 1646 | { 1647 | // This function exists for documentation only. 1648 | rb_notimplement(); 1649 | return Qnil; 1650 | } 1651 | 1652 | /* call-seq: Buffer.element_size 1653 | * 1654 | * Return the size of an element of this Buffer in bytes. 1655 | */ 1656 | static VALUE ibuffer_element_size(VALUE klass) 1657 | { 1658 | rb_notimplement(); 1659 | return Qnil; 1660 | } 1661 | 1662 | /* call-seq: buffer.size -> Numeric 1663 | * 1664 | * Return the number of elements in this buffer. 1665 | */ 1666 | static VALUE ibuffer_size(VALUE self) 1667 | { 1668 | rb_notimplement(); 1669 | return Qnil; 1670 | } 1671 | 1672 | /* call-seq: buffer.page_locked? 
-> true or false 1673 | * 1674 | * Return true if this buffer is page-locked allocated. 1675 | * Otherwise, return false. 1676 | */ 1677 | static VALUE ibuffer_is_page_locked(VALUE self) 1678 | { 1679 | rb_notimplement(); 1680 | return Qnil; 1681 | } 1682 | 1683 | /* call-seq: buffer.offset(index) -> MemoryPointer 1684 | * 1685 | * Return the memory pointer of the element at _index_ (0...size) in this buffer. 1686 | */ 1687 | static VALUE ibuffer_offset(VALUE self, VALUE offset) 1688 | { 1689 | rb_notimplement(); 1690 | return Qnil; 1691 | } 1692 | 1693 | /* call-seq: buffer[index] -> Object 1694 | * 1695 | * Return the element at _index_ (0...size) in this buffer. 1696 | */ 1697 | static VALUE ibuffer_element_get(VALUE self, VALUE index) 1698 | { 1699 | rb_notimplement(); 1700 | return Qnil; 1701 | } 1702 | 1703 | /* call-seq: buffer[index] = value -> Object 1704 | * 1705 | * Set the element at _index_ (0...size) in this buffer to _value_. 1706 | * Return _value_. 1707 | */ 1708 | static VALUE ibuffer_element_set(VALUE self, VALUE index, VALUE value) 1709 | { 1710 | rb_notimplement(); 1711 | return Qnil; 1712 | } 1713 | 1714 | static void memory_buffer_free(void* p) 1715 | { 1716 | MemoryBuffer* pbuffer = static_cast(p); 1717 | if (pbuffer->is_page_locked) { 1718 | cuMemFreeHost(reinterpret_cast(pbuffer->p)); 1719 | } else { 1720 | delete[] pbuffer->p; 1721 | } 1722 | delete pbuffer; 1723 | } 1724 | 1725 | static VALUE memory_buffer_alloc(VALUE klass) 1726 | { 1727 | MemoryBuffer* pbuffer = new MemoryBuffer; 1728 | pbuffer->size = 0; 1729 | pbuffer->is_page_locked = false; 1730 | pbuffer->p = NULL; 1731 | return Data_Wrap_Struct(klass, 0, memory_buffer_free, pbuffer); 1732 | } 1733 | 1734 | static VALUE memory_buffer_element_size(VALUE klass) 1735 | { 1736 | return INT2FIX(1); 1737 | } 1738 | 1739 | static VALUE memory_buffer_initialize(int argc, VALUE* argv, VALUE self) 1740 | { 1741 | if (argc < 1 || argc > 2) { 1742 | rb_raise(rb_eArgError, "wrong number of 
arguments (%d for 1 or 2).", argc); 1743 | } 1744 | 1745 | bool use_page_locked = false; 1746 | size_t nbytes = NUM2SIZET(argv[0]); 1747 | if (argc == 2 && CLASS_OF(argv[1]) == rb_cHash) { 1748 | if (rb_hash_aref(argv[1], ID2SYM(rb_intern("page_locked"))) == Qtrue) { 1749 | use_page_locked = true; 1750 | } 1751 | } 1752 | 1753 | MemoryBuffer* pbuffer; 1754 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1755 | pbuffer->size = nbytes; 1756 | if (use_page_locked) { 1757 | CUresult status = cuMemAllocHost(reinterpret_cast(&pbuffer->p), nbytes); 1758 | if (status != CUDA_SUCCESS) { 1759 | RAISE_CU_STD_ERROR(status, "Failed to allocate page-locked host memory."); 1760 | } 1761 | pbuffer->is_page_locked = true; 1762 | } else { 1763 | pbuffer->p = new char[nbytes]; 1764 | pbuffer->is_page_locked = false; 1765 | } 1766 | std::memset(static_cast(pbuffer->p), 0, pbuffer->size); 1767 | return self; 1768 | } 1769 | 1770 | static VALUE memory_buffer_size(VALUE self) 1771 | { 1772 | MemoryBuffer* pbuffer; 1773 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1774 | return SIZET2NUM(pbuffer->size); 1775 | } 1776 | 1777 | static VALUE memory_buffer_is_page_locked(VALUE self) 1778 | { 1779 | MemoryBuffer* pbuffer; 1780 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1781 | return to_rb(pbuffer->is_page_locked); 1782 | } 1783 | 1784 | static VALUE memory_buffer_offset(VALUE self, VALUE offset) 1785 | { 1786 | MemoryBuffer* pbuffer; 1787 | MemoryPointer* ppointer_offset; 1788 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1789 | VALUE rb_ppointer_offset = rb_class_new_instance(0, NULL, rb_cMemoryPointer); 1790 | Data_Get_Struct(rb_ppointer_offset, MemoryPointer, ppointer_offset); 1791 | ppointer_offset->p = pbuffer->p + NUM2SIZET(offset); 1792 | return rb_ppointer_offset; 1793 | } 1794 | 1795 | static VALUE memory_buffer_element_get(VALUE self, VALUE index) 1796 | { 1797 | size_t i = NUM2SIZET(index); 1798 | MemoryBuffer* pbuffer; 1799 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 
1800 | int element = static_cast(pbuffer->p[i]); 1801 | return to_rb(element); 1802 | } 1803 | 1804 | static VALUE memory_buffer_element_set(VALUE self, VALUE index, VALUE value) 1805 | { 1806 | size_t i = NUM2SIZET(index); 1807 | MemoryBuffer* pbuffer; 1808 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1809 | pbuffer->p[i] = static_cast(FIX2INT(value)); 1810 | return value; 1811 | } 1812 | 1813 | template 1814 | static void buffer_free(void* p) 1815 | { 1816 | typedef struct TypedBuffer TBuffer; 1817 | TBuffer* pbuffer = static_cast(p); 1818 | if (pbuffer->is_page_locked) { 1819 | cuMemFreeHost(reinterpret_cast(pbuffer->p)); 1820 | } else { 1821 | delete[] pbuffer->p; 1822 | } 1823 | delete pbuffer; 1824 | } 1825 | 1826 | template 1827 | static VALUE buffer_alloc(VALUE klass) 1828 | { 1829 | typedef struct TypedBuffer TBuffer; 1830 | TBuffer* pbuffer = new TBuffer; 1831 | pbuffer->size = 0; 1832 | pbuffer->p = NULL; 1833 | return Data_Wrap_Struct(klass, 0, &buffer_free, pbuffer); 1834 | } 1835 | 1836 | template 1837 | static VALUE buffer_element_size(VALUE klass) 1838 | { 1839 | return INT2FIX(sizeof(TElement)); 1840 | } 1841 | typedef VALUE (*BufferElementSizeFunctionType)(VALUE); 1842 | 1843 | template 1844 | static VALUE buffer_initialize(int argc, VALUE* argv, VALUE self) 1845 | { 1846 | if (argc <= 0 || argc >= 3) { 1847 | rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc); 1848 | } 1849 | 1850 | bool use_page_locked = false; 1851 | VALUE n = NUM2SIZET(argv[0]); 1852 | if (argc == 2 && CLASS_OF(argv[1]) == rb_cHash) { 1853 | if (rb_hash_aref(argv[1], ID2SYM(rb_intern("page_locked"))) == Qtrue) { 1854 | use_page_locked = true; 1855 | } 1856 | } 1857 | 1858 | typedef struct TypedBuffer TBuffer; 1859 | TBuffer* pbuffer; 1860 | Data_Get_Struct(self, TBuffer, pbuffer); 1861 | pbuffer->size = n*sizeof(TElement); 1862 | if (use_page_locked) { 1863 | CUresult status = cuMemAllocHost(reinterpret_cast(&pbuffer->p), n*sizeof(TElement)); 1864 | 
if (status != CUDA_SUCCESS) { 1865 | RAISE_CU_STD_ERROR(status, "Failed to allocate page-locked host memory."); 1866 | } 1867 | pbuffer->is_page_locked = true; 1868 | } else { 1869 | pbuffer->p = reinterpret_cast(new TElement[n]); 1870 | pbuffer->is_page_locked = false; 1871 | } 1872 | std::memset(static_cast(pbuffer->p), 0, pbuffer->size); 1873 | return self; 1874 | } 1875 | typedef VALUE (*BufferInitializeFunctionType)(int, VALUE*, VALUE); 1876 | 1877 | template 1878 | static VALUE buffer_size(VALUE self) 1879 | { 1880 | MemoryBuffer* pbuffer; 1881 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1882 | return SIZET2NUM(pbuffer->size / sizeof(TElement)); 1883 | } 1884 | typedef VALUE (*BufferSizeFunctionType)(VALUE); 1885 | 1886 | template 1887 | static VALUE buffer_is_page_locked(VALUE self) 1888 | { 1889 | MemoryBuffer* pbuffer; 1890 | Data_Get_Struct(self, MemoryBuffer, pbuffer); 1891 | return to_rb(pbuffer->is_page_locked); 1892 | } 1893 | typedef VALUE (*BufferIsPageLocked)(VALUE); 1894 | 1895 | template 1896 | static VALUE buffer_offset(VALUE self, VALUE offset) 1897 | { 1898 | typedef struct TypedBuffer TBuffer; 1899 | TBuffer* pbuffer; 1900 | MemoryPointer* ppointer_offset; 1901 | Data_Get_Struct(self, TBuffer, pbuffer); 1902 | VALUE rb_ppointer_offset = rb_class_new_instance(0, NULL, rb_cMemoryPointer); 1903 | Data_Get_Struct(rb_ppointer_offset, MemoryPointer, ppointer_offset); 1904 | ppointer_offset->p = pbuffer->p + NUM2SIZET(offset)*sizeof(TElement); 1905 | return rb_ppointer_offset; 1906 | } 1907 | typedef VALUE (*BufferOffsetFunctionType)(VALUE, VALUE); 1908 | 1909 | template 1910 | static VALUE buffer_element_get(VALUE self, VALUE index) 1911 | { 1912 | typedef struct TypedBuffer TBuffer; 1913 | size_t i = NUM2SIZET(index); 1914 | TBuffer* pbuffer; 1915 | Data_Get_Struct(self, TBuffer, pbuffer); 1916 | TElement* e = reinterpret_cast(pbuffer->p); 1917 | TElement element = e[i]; 1918 | return to_rb(element); 1919 | } 1920 | typedef VALUE 
(*BufferElementGetFunctionType)(VALUE, VALUE); 1921 | 1922 | template 1923 | static VALUE buffer_element_set(VALUE self, VALUE index, VALUE value) 1924 | { 1925 | typedef struct TypedBuffer TBuffer; 1926 | size_t i = NUM2SIZET(index); 1927 | TElement v = to_ctype(value); 1928 | TBuffer* pbuffer; 1929 | Data_Get_Struct(self, TBuffer, pbuffer); 1930 | TElement* e = reinterpret_cast(pbuffer->p); 1931 | e[i] = v; 1932 | return value; 1933 | } 1934 | typedef VALUE (*BufferElementSetFunctionType)(VALUE, VALUE, VALUE); 1935 | 1936 | // }}} 1937 | 1938 | 1939 | // {{{ Memory 1940 | 1941 | /* call-seq: memcpy_htod(dst_devptr, src_mem, nbytes) -> nil 1942 | * 1943 | * Copy _nbytes_ from host memory at _src_mem_ to device memory at _dst_devptr_. 1944 | */ 1945 | static VALUE memcpy_htod(VALUE self, VALUE rb_device_ptr, VALUE rb_memory, VALUE nbytes) 1946 | { 1947 | CUdeviceptr* pdevice_ptr; 1948 | MemoryPointer* pmem; 1949 | Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr); 1950 | Data_Get_Struct(rb_memory, MemoryPointer, pmem); 1951 | CUresult status = cuMemcpyHtoD(*pdevice_ptr, static_cast(pmem->p), NUM2UINT(nbytes)); 1952 | if (status != CUDA_SUCCESS) { 1953 | RAISE_CU_STD_ERROR(status, "Failed to copy memory from host to device."); 1954 | } 1955 | return Qnil; 1956 | } 1957 | 1958 | /* call-seq: memcpy_htod_async(dst_devptr, src_mem, nbytes, stream) -> nil 1959 | * 1960 | * Copy _nbytes_ from host memory at _src_mem_ to device memory at _dst_devptr_ in _stream_ asynchronously. 1961 | * 1962 | * Note: The _src_mem_ should be *page-locked* memory. 
1963 | */ 1964 | static VALUE memcpy_htod_async(VALUE self, VALUE rb_device_ptr, VALUE rb_memory, VALUE nbytes, VALUE rb_stream) 1965 | { 1966 | CUdeviceptr* pdevice_ptr; 1967 | MemoryPointer* pmem; 1968 | CUstream* pstream; 1969 | CUstream stream0 = 0; 1970 | Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr); 1971 | Data_Get_Struct(rb_memory, MemoryPointer, pmem); 1972 | if (CLASS_OF(rb_stream) == rb_cCUStream) { 1973 | Data_Get_Struct(rb_stream, CUstream, pstream); 1974 | } else { 1975 | pstream = &stream0; 1976 | } 1977 | CUresult status = cuMemcpyHtoDAsync(*pdevice_ptr, static_cast(pmem->p), NUM2UINT(nbytes), *pstream); 1978 | if (status != CUDA_SUCCESS) { 1979 | RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from host to device."); 1980 | } 1981 | return Qnil; 1982 | } 1983 | 1984 | /* call-seq: memcpy_dtoh(dst_mem, src_devptr, nbytes) -> nil 1985 | * 1986 | * Copy _nbytes_ from device memory at _src_devptr_ to host memory at _dst_mem_. 1987 | */ 1988 | static VALUE memcpy_dtoh(VALUE self, VALUE rb_memory, VALUE rb_device_ptr, VALUE nbytes) 1989 | { 1990 | MemoryPointer* pmem; 1991 | CUdeviceptr* pdevice_ptr; 1992 | Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr); 1993 | Data_Get_Struct(rb_memory, MemoryPointer, pmem); 1994 | CUresult status = cuMemcpyDtoH(static_cast(pmem->p), *pdevice_ptr, NUM2UINT(nbytes)); 1995 | if (status != CUDA_SUCCESS) { 1996 | RAISE_CU_STD_ERROR(status, "Failed to copy memory from device to host."); 1997 | } 1998 | return Qnil; 1999 | } 2000 | 2001 | /* call-seq: memcpy_dtoh_async(dst_mem, src_devptr, nbytes, stream) -> nil 2002 | * 2003 | * Copy _nbytes_ from device memory at _src_devptr_ to host memory at _dst_mem_ in _stream_ asynchronously. 2004 | * 2005 | * Note: The _dst_mem_ should be *page-locked* memory. 
2006 | */ 2007 | static VALUE memcpy_dtoh_async(VALUE self, VALUE rb_memory, VALUE rb_device_ptr, VALUE nbytes, VALUE rb_stream) 2008 | { 2009 | MemoryPointer* pmem; 2010 | CUdeviceptr* pdevice_ptr; 2011 | CUstream* pstream; 2012 | CUstream stream0 = 0; 2013 | Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr); 2014 | Data_Get_Struct(rb_memory, MemoryPointer, pmem); 2015 | if (CLASS_OF(rb_stream) == rb_cCUStream) { 2016 | Data_Get_Struct(rb_stream, CUstream, pstream); 2017 | } else { 2018 | pstream = &stream0; 2019 | } 2020 | CUresult status = cuMemcpyDtoHAsync(static_cast(pmem->p), *pdevice_ptr, NUM2UINT(nbytes), *pstream); 2021 | if (status != CUDA_SUCCESS) { 2022 | RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from device to host."); 2023 | } 2024 | return Qnil; 2025 | } 2026 | 2027 | /* call-seq: memcpy_dtod(dst_devptr, src_devptr, nbytes) -> nil 2028 | * 2029 | * Copy _nbytes_ from device memory at _src_devptr_ to device memory at _dst_devptr_ asynchronously. 2030 | */ 2031 | static VALUE memcpy_dtod(VALUE self, VALUE rb_device_ptr_dst, VALUE rb_device_ptr_src, VALUE nbytes) 2032 | { 2033 | CUdeviceptr* dst; 2034 | CUdeviceptr* src; 2035 | Data_Get_Struct(rb_device_ptr_dst, CUdeviceptr, dst); 2036 | Data_Get_Struct(rb_device_ptr_src, CUdeviceptr, src); 2037 | CUresult status = cuMemcpyDtoD(*dst, *src, NUM2UINT(nbytes)); 2038 | if (status != CUDA_SUCCESS) { 2039 | RAISE_CU_STD_ERROR(status, "Failed to copy memory from device to device."); 2040 | } 2041 | return Qnil; 2042 | } 2043 | 2044 | /* call-seq: memcpy_dtod_async(dst_devptr, src_devptr, nbytes, stream) -> nil 2045 | * 2046 | * Copy _nbytes_ from device memory at _src_devptr_ to device memory at _dst_devptr_ in _stream_ asynchronously. 
2047 | */ 2048 | static VALUE memcpy_dtod_async(VALUE self, VALUE rb_device_ptr_dst, VALUE rb_device_ptr_src, VALUE nbytes, VALUE rb_stream) 2049 | { 2050 | CUdeviceptr* dst; 2051 | CUdeviceptr* src; 2052 | CUstream *pstream; 2053 | CUstream stream0 = 0; 2054 | Data_Get_Struct(rb_device_ptr_dst, CUdeviceptr, dst); 2055 | Data_Get_Struct(rb_device_ptr_src, CUdeviceptr, src); 2056 | if (CLASS_OF(rb_stream) == rb_cCUStream) { 2057 | Data_Get_Struct(rb_stream, CUstream, pstream); 2058 | } else { 2059 | pstream = &stream0; 2060 | } 2061 | CUresult status = cuMemcpyDtoDAsync(*dst, *src, NUM2UINT(nbytes), *pstream); 2062 | if (status != CUDA_SUCCESS) { 2063 | RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from device to device."); 2064 | } 2065 | return Qnil; 2066 | } 2067 | 2068 | /* call-seq: mem_get_info -> Hash { free:, total: } 2069 | * 2070 | * Return a hash { free:, total: } with the amount of free and total device memory in bytes. 2071 | */ 2072 | static VALUE mem_get_info(VALUE self) 2073 | { 2074 | size_t free_memory; 2075 | size_t total_memory; 2076 | CUresult status = cuMemGetInfo(&free_memory, &total_memory); 2077 | if (status != CUDA_SUCCESS) { 2078 | RAISE_CU_STD_ERROR(status, "Failed to get memory information."); 2079 | } 2080 | VALUE h = rb_hash_new(); 2081 | rb_hash_aset(h, ID2SYM(rb_intern("free")), UINT2NUM(free_memory)); 2082 | rb_hash_aset(h, ID2SYM(rb_intern("total")), UINT2NUM(total_memory)); 2083 | return h; 2084 | } 2085 | 2086 | // }}} 2087 | 2088 | 2089 | // {{{ Driver 2090 | 2091 | /* call-seq: driver_get_version -> Fixnum 2092 | * 2093 | * Return the version number of the installed CUDA driver. 2094 | */ 2095 | static VALUE driver_get_version() 2096 | { 2097 | int v; 2098 | cuDriverGetVersion(&v); 2099 | return INT2FIX(v); 2100 | } 2101 | 2102 | // }}} 2103 | 2104 | 2105 | // {{{ Doc 2106 | 2107 | /* Document-class: SGC::CU::MemoryBuffer 2108 | * See IBuffer and IBuffer::ClassMethods. 
2109 | * 2110 | * Note: ELEMENT_SIZE is *deprecated*. Use MemoryBuffer.element_size. 2111 | */ 2112 | 2113 | /* Document-class: SGC::CU::Int32Buffer 2114 | * See IBuffer and IBuffer::ClassMethods. 2115 | * 2116 | * Note: ELEMENT_SIZE is *deprecated*. Use Int32Buffer.element_size. 2117 | */ 2118 | 2119 | /* Document-class: SGC::CU::Int64Buffer 2120 | * See IBuffer and IBuffer::ClassMethods. 2121 | * 2122 | * Note: ELEMENT_SIZE is *deprecated*. Use Int64Buffer.element_size. 2123 | */ 2124 | 2125 | /* Document-class: SGC::CU::Float32Buffer 2126 | * See IBuffer and IBuffer::ClassMethods. 2127 | * 2128 | * Note: ELEMENT_SIZE is *deprecated*. Use Float32Buffer.element_size. 2129 | */ 2130 | 2131 | /* Document-class: SGC::CU::Float64Buffer 2132 | * See IBuffer and IBuffer::ClassMethods. 2133 | * 2134 | * Note: ELEMENT_SIZE is *deprecated*. Use Float64Buffer.element_size. 2135 | */ 2136 | 2137 | // }}} 2138 | 2139 | 2140 | extern "C" void Init_rubycu() 2141 | { 2142 | rb_mSGC = rb_define_module("SGC"); 2143 | rb_mCU = rb_define_module_under(rb_mSGC, "CU"); 2144 | 2145 | rb_cCUDevice = rb_define_class_under(rb_mCU, "CUDevice", rb_cObject); 2146 | rb_define_singleton_method(rb_cCUDevice, "get_count", RUBY_METHOD_FUNC(device_get_count), 0); 2147 | rb_define_singleton_method(rb_cCUDevice, "get", RUBY_METHOD_FUNC(device_get), 1); 2148 | rb_define_alloc_func(rb_cCUDevice, device_alloc); 2149 | rb_define_method(rb_cCUDevice, "initialize", RUBY_METHOD_FUNC(device_initialize), -1); 2150 | rb_define_method(rb_cCUDevice, "get_name", RUBY_METHOD_FUNC(device_get_name), 0); 2151 | rb_define_method(rb_cCUDevice, "compute_capability", RUBY_METHOD_FUNC(device_compute_capability), 0); 2152 | rb_define_method(rb_cCUDevice, "get_attribute", RUBY_METHOD_FUNC(device_get_attribute), 1); 2153 | rb_define_method(rb_cCUDevice, "get_properties", RUBY_METHOD_FUNC(device_get_properties), 0); 2154 | rb_define_method(rb_cCUDevice, "total_mem", RUBY_METHOD_FUNC(device_total_mem), 0); 2155 | 2156 | 
rb_cCUComputeMode = rb_define_class_under(rb_mCU, "CUComputeMode", rb_cObject); 2157 | rb_define_const(rb_cCUComputeMode, "DEFAULT", INT2FIX(CU_COMPUTEMODE_DEFAULT)); 2158 | rb_define_const(rb_cCUComputeMode, "EXCLUSIVE", INT2FIX(CU_COMPUTEMODE_EXCLUSIVE)); 2159 | rb_define_const(rb_cCUComputeMode, "PROHIBITED", INT2FIX(CU_COMPUTEMODE_PROHIBITED)); 2160 | 2161 | rb_cCUDeviceAttribute = rb_define_class_under(rb_mCU, "CUDeviceAttribute", rb_cObject); 2162 | rb_define_const(rb_cCUDeviceAttribute, "MAX_THREADS_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)); 2163 | rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_X", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)); 2164 | rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_Y", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)); 2165 | rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_Z", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z)); 2166 | rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_X", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)); 2167 | rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_Y", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)); 2168 | rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_Z", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z)); 2169 | rb_define_const(rb_cCUDeviceAttribute, "MAX_REGISTERS_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK)); 2170 | rb_define_const(rb_cCUDeviceAttribute, "MAX_SHARED_MEMORY_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK)); 2171 | rb_define_const(rb_cCUDeviceAttribute, "TOTAL_CONSTANT_MEMORY", INT2FIX(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY)); 2172 | rb_define_const(rb_cCUDeviceAttribute, "WARP_SIZE", INT2FIX(CU_DEVICE_ATTRIBUTE_WARP_SIZE)); 2173 | rb_define_const(rb_cCUDeviceAttribute, "MAX_PITCH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_PITCH)); 2174 | rb_define_const(rb_cCUDeviceAttribute, "CLOCK_RATE", INT2FIX(CU_DEVICE_ATTRIBUTE_CLOCK_RATE)); 2175 | rb_define_const(rb_cCUDeviceAttribute, "TEXTURE_ALIGNMENT", 
INT2FIX(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT)); 2176 | rb_define_const(rb_cCUDeviceAttribute, "GPU_OVERLAP", INT2FIX(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP)); 2177 | rb_define_const(rb_cCUDeviceAttribute, "MULTIPROCESSOR_COUNT", INT2FIX(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)); 2178 | rb_define_const(rb_cCUDeviceAttribute, "KERNEL_EXEC_TIMEOUT", INT2FIX(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT)); 2179 | rb_define_const(rb_cCUDeviceAttribute, "INTEGRATED", INT2FIX(CU_DEVICE_ATTRIBUTE_INTEGRATED)); 2180 | rb_define_const(rb_cCUDeviceAttribute, "CAN_MAP_HOST_MEMORY", INT2FIX(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY)); 2181 | rb_define_const(rb_cCUDeviceAttribute, "COMPUTE_MODE", INT2FIX(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE)); 2182 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE1D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH)); 2183 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH)); 2184 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH)); 2185 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT)); 2186 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT)); 2187 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_DEPTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH)); 2188 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH)); 2189 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT)); 2190 | rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES)); 2191 | rb_define_const(rb_cCUDeviceAttribute, "SURFACE_ALIGNMENT", 
INT2FIX(CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT)); 2192 | rb_define_const(rb_cCUDeviceAttribute, "CONCURRENT_KERNELS", INT2FIX(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS)); 2193 | rb_define_const(rb_cCUDeviceAttribute, "ECC_ENABLED", INT2FIX(CU_DEVICE_ATTRIBUTE_ECC_ENABLED)); 2194 | rb_define_const(rb_cCUDeviceAttribute, "PCI_BUS_ID", INT2FIX(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID)); 2195 | rb_define_const(rb_cCUDeviceAttribute, "PCI_DEVICE_ID", INT2FIX(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID)); 2196 | rb_define_const(rb_cCUDeviceAttribute, "TCC_DRIVER", INT2FIX(CU_DEVICE_ATTRIBUTE_TCC_DRIVER)); 2197 | 2198 | rb_cCUContext = rb_define_class_under(rb_mCU, "CUContext", rb_cObject); 2199 | rb_define_alloc_func(rb_cCUContext, context_alloc); 2200 | rb_define_method(rb_cCUContext, "initialize", RUBY_METHOD_FUNC(context_initialize), -1); 2201 | rb_define_method(rb_cCUContext, "create", RUBY_METHOD_FUNC(context_create), -1); 2202 | rb_define_method(rb_cCUContext, "destroy", RUBY_METHOD_FUNC(context_destroy), 0); 2203 | rb_define_method(rb_cCUContext, "attach", RUBY_METHOD_FUNC(context_attach), -1); 2204 | rb_define_method(rb_cCUContext, "detach", RUBY_METHOD_FUNC(context_detach), 0); 2205 | rb_define_method(rb_cCUContext, "push_current", RUBY_METHOD_FUNC(context_push_current), 0); 2206 | rb_define_method(rb_cCUContext, "get_api_version", RUBY_METHOD_FUNC(context_get_api_version), 0); 2207 | rb_define_singleton_method(rb_cCUContext, "get_device", RUBY_METHOD_FUNC(context_get_device), 0); 2208 | rb_define_singleton_method(rb_cCUContext, "get_limit", RUBY_METHOD_FUNC(context_get_limit), 1); 2209 | rb_define_singleton_method(rb_cCUContext, "set_limit", RUBY_METHOD_FUNC(context_set_limit), 2); 2210 | rb_define_singleton_method(rb_cCUContext, "get_cache_config", RUBY_METHOD_FUNC(context_get_cache_config), 0); 2211 | rb_define_singleton_method(rb_cCUContext, "set_cache_config", RUBY_METHOD_FUNC(context_set_cache_config), 1); 2212 | rb_define_singleton_method(rb_cCUContext, "get_api_version", 
RUBY_METHOD_FUNC(context_get_api_version_singleton), 0); 2213 | rb_define_singleton_method(rb_cCUContext, "pop_current", RUBY_METHOD_FUNC(context_pop_current), 0); 2214 | rb_define_singleton_method(rb_cCUContext, "synchronize", RUBY_METHOD_FUNC(context_synchronize), 0); 2215 | 2216 | rb_cCUContextFlags = rb_define_class_under(rb_mCU, "CUContextFlags", rb_cObject); 2217 | rb_define_const(rb_cCUContextFlags, "SCHED_AUTO", INT2FIX(CU_CTX_SCHED_AUTO)); 2218 | rb_define_const(rb_cCUContextFlags, "SCHED_SPIN", INT2FIX(CU_CTX_SCHED_SPIN)); 2219 | rb_define_const(rb_cCUContextFlags, "SCHED_YIELD", INT2FIX(CU_CTX_SCHED_YIELD)); 2220 | rb_define_const(rb_cCUContextFlags, "BLOCKING_SYNC", INT2FIX(CU_CTX_BLOCKING_SYNC)); 2221 | rb_define_const(rb_cCUContextFlags, "MAP_HOST", INT2FIX(CU_CTX_MAP_HOST)); 2222 | rb_define_const(rb_cCUContextFlags, "LMEM_RESIZE_TO_MAX", INT2FIX(CU_CTX_LMEM_RESIZE_TO_MAX)); 2223 | 2224 | rb_cCULimit = rb_define_class_under(rb_mCU, "CULimit", rb_cObject); 2225 | rb_define_const(rb_cCULimit, "STACK_SIZE", INT2FIX(CU_LIMIT_STACK_SIZE)); 2226 | rb_define_const(rb_cCULimit, "PRINTF_FIFO_SIZE", INT2FIX(CU_LIMIT_PRINTF_FIFO_SIZE)); 2227 | rb_define_const(rb_cCULimit, "MALLOC_HEAP_SIZE", INT2FIX(CU_LIMIT_MALLOC_HEAP_SIZE)); 2228 | 2229 | rb_cCUModule = rb_define_class_under(rb_mCU, "CUModule", rb_cObject); 2230 | rb_define_alloc_func(rb_cCUModule, module_alloc); 2231 | rb_define_method(rb_cCUModule, "initialize", RUBY_METHOD_FUNC(module_initialize), -1); 2232 | rb_define_method(rb_cCUModule, "load", RUBY_METHOD_FUNC(module_load), 1); 2233 | rb_define_method(rb_cCUModule, "load_data", RUBY_METHOD_FUNC(module_load_data), 1); 2234 | rb_define_method(rb_cCUModule, "unload", RUBY_METHOD_FUNC(module_unload), 0); 2235 | rb_define_method(rb_cCUModule, "get_function", RUBY_METHOD_FUNC(module_get_function), 1); 2236 | rb_define_method(rb_cCUModule, "get_global", RUBY_METHOD_FUNC(module_get_global), 1); 2237 | rb_define_method(rb_cCUModule, "get_texref", 
RUBY_METHOD_FUNC(module_get_texref), 1); 2238 | 2239 | rb_cCUDevicePtr = rb_define_class_under(rb_mCU, "CUDevicePtr", rb_cObject); 2240 | rb_define_alloc_func(rb_cCUDevicePtr, device_ptr_alloc); 2241 | rb_define_method(rb_cCUDevicePtr, "initialize", RUBY_METHOD_FUNC(device_ptr_initialize), -1); 2242 | rb_define_method(rb_cCUDevicePtr, "offset", RUBY_METHOD_FUNC(device_ptr_offset), 1); 2243 | rb_define_method(rb_cCUDevicePtr, "mem_alloc", RUBY_METHOD_FUNC(device_ptr_mem_alloc), 1); 2244 | rb_define_method(rb_cCUDevicePtr, "mem_free", RUBY_METHOD_FUNC(device_ptr_mem_free), 0); 2245 | 2246 | rb_cCUFunction = rb_define_class_under(rb_mCU, "CUFunction", rb_cObject); 2247 | rb_define_alloc_func(rb_cCUFunction, function_alloc); 2248 | rb_define_method(rb_cCUFunction, "initialize", RUBY_METHOD_FUNC(function_initialize), -1); 2249 | rb_define_method(rb_cCUFunction, "set_param", RUBY_METHOD_FUNC(function_set_param), -1); 2250 | rb_define_method(rb_cCUFunction, "set_texref", RUBY_METHOD_FUNC(function_set_texref), 1); 2251 | rb_define_method(rb_cCUFunction, "set_block_shape", RUBY_METHOD_FUNC(function_set_block_shape), -1); 2252 | rb_define_method(rb_cCUFunction, "set_shared_size", RUBY_METHOD_FUNC(function_set_shared_size), 1); 2253 | rb_define_method(rb_cCUFunction, "launch", RUBY_METHOD_FUNC(function_launch), 0); 2254 | rb_define_method(rb_cCUFunction, "launch_grid", RUBY_METHOD_FUNC(function_launch_grid), -1); 2255 | rb_define_method(rb_cCUFunction, "launch_grid_async", RUBY_METHOD_FUNC(function_launch_grid_async), -1); 2256 | rb_define_method(rb_cCUFunction, "get_attribute", RUBY_METHOD_FUNC(function_get_attribute), 1); 2257 | rb_define_method(rb_cCUFunction, "set_cache_config", RUBY_METHOD_FUNC(function_set_cache_config), 1); 2258 | 2259 | rb_cCUFunctionAttribute = rb_define_class_under(rb_mCU, "CUFunctionAttribute", rb_cObject); 2260 | rb_define_const(rb_cCUFunctionAttribute, "MAX_THREADS_PER_BLOCK", INT2FIX(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK)); 2261 | 
rb_define_const(rb_cCUFunctionAttribute, "SHARED_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)); 2262 | rb_define_const(rb_cCUFunctionAttribute, "CONST_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)); 2263 | rb_define_const(rb_cCUFunctionAttribute, "LOCAL_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)); 2264 | rb_define_const(rb_cCUFunctionAttribute, "NUM_REGS", INT2FIX(CU_FUNC_ATTRIBUTE_NUM_REGS)); 2265 | rb_define_const(rb_cCUFunctionAttribute, "PTX_VERSION", INT2FIX(CU_FUNC_ATTRIBUTE_PTX_VERSION)); 2266 | rb_define_const(rb_cCUFunctionAttribute, "BINARY_VERSION", INT2FIX(CU_FUNC_ATTRIBUTE_BINARY_VERSION)); 2267 | 2268 | rb_cCUFunctionCache = rb_define_class_under(rb_mCU, "CUFunctionCache", rb_cObject); 2269 | rb_define_const(rb_cCUFunctionCache, "PREFER_NONE", INT2FIX(CU_FUNC_CACHE_PREFER_NONE)); 2270 | rb_define_const(rb_cCUFunctionCache, "PREFER_SHARED", INT2FIX(CU_FUNC_CACHE_PREFER_SHARED)); 2271 | rb_define_const(rb_cCUFunctionCache, "PREFER_L1", INT2FIX(CU_FUNC_CACHE_PREFER_L1)); 2272 | 2273 | rb_cCUStream = rb_define_class_under(rb_mCU, "CUStream", rb_cObject); 2274 | rb_define_alloc_func(rb_cCUStream, stream_alloc); 2275 | rb_define_method(rb_cCUStream, "initialize", RUBY_METHOD_FUNC(stream_initialize), 0); 2276 | rb_define_method(rb_cCUStream, "create", RUBY_METHOD_FUNC(stream_create), -1); 2277 | rb_define_method(rb_cCUStream, "destroy", RUBY_METHOD_FUNC(stream_destroy), 0); 2278 | rb_define_method(rb_cCUStream, "query", RUBY_METHOD_FUNC(stream_query), 0); 2279 | rb_define_method(rb_cCUStream, "synchronize", RUBY_METHOD_FUNC(stream_synchronize), 0); 2280 | rb_define_method(rb_cCUStream, "wait_event", RUBY_METHOD_FUNC(stream_wait_event), -1); 2281 | rb_define_singleton_method(rb_cCUStream, "wait_event", RUBY_METHOD_FUNC(stream_wait_event_singleton), -1); 2282 | 2283 | rb_cCUEvent = rb_define_class_under(rb_mCU, "CUEvent", rb_cObject); 2284 | rb_define_alloc_func(rb_cCUEvent, event_alloc); 2285 | 
rb_define_method(rb_cCUEvent, "initialize", RUBY_METHOD_FUNC(event_initialize), 0); 2286 | rb_define_method(rb_cCUEvent, "create", RUBY_METHOD_FUNC(event_create), -1); 2287 | rb_define_method(rb_cCUEvent, "destroy", RUBY_METHOD_FUNC(event_destroy), 0); 2288 | rb_define_method(rb_cCUEvent, "query", RUBY_METHOD_FUNC(event_query), 0); 2289 | rb_define_method(rb_cCUEvent, "record", RUBY_METHOD_FUNC(event_record), 1); 2290 | rb_define_method(rb_cCUEvent, "synchronize", RUBY_METHOD_FUNC(event_synchronize), 0); 2291 | rb_define_singleton_method(rb_cCUEvent, "elapsed_time", RUBY_METHOD_FUNC(event_elapsed_time), 2); 2292 | 2293 | rb_cCUEventFlags = rb_define_class_under(rb_mCU, "CUEventFlags", rb_cObject); 2294 | rb_define_const(rb_cCUEventFlags, "DEFAULT", INT2FIX(CU_EVENT_DEFAULT)); 2295 | rb_define_const(rb_cCUEventFlags, "BLOCKING_SYNC", INT2FIX(CU_EVENT_BLOCKING_SYNC)); 2296 | rb_define_const(rb_cCUEventFlags, "DISABLE_TIMING", INT2FIX(CU_EVENT_DISABLE_TIMING)); 2297 | 2298 | rb_cCUAddressMode = rb_define_class_under(rb_mCU, "CUAddressMode", rb_cObject); 2299 | rb_define_const(rb_cCUAddressMode, "WRAP", INT2FIX(CU_TR_ADDRESS_MODE_WRAP)); 2300 | rb_define_const(rb_cCUAddressMode, "CLAMP", INT2FIX(CU_TR_ADDRESS_MODE_CLAMP)); 2301 | rb_define_const(rb_cCUAddressMode, "MIRROR", INT2FIX(CU_TR_ADDRESS_MODE_MIRROR)); 2302 | rb_define_const(rb_cCUAddressMode, "BORDER", INT2FIX(CU_TR_ADDRESS_MODE_BORDER)); 2303 | 2304 | rb_cCUFilterMode = rb_define_class_under(rb_mCU, "CUFilterMode", rb_cObject); 2305 | rb_define_const(rb_cCUFilterMode, "POINT", INT2FIX(CU_TR_FILTER_MODE_POINT)); 2306 | rb_define_const(rb_cCUFilterMode, "LINEAR", INT2FIX(CU_TR_FILTER_MODE_LINEAR)); 2307 | 2308 | rb_cCUTexRefFlags = rb_define_class_under(rb_mCU, "CUTexRefFlags", rb_cObject); 2309 | rb_define_const(rb_cCUTexRefFlags, "READ_AS_INTEGER", INT2FIX(CU_TRSF_READ_AS_INTEGER)); 2310 | rb_define_const(rb_cCUTexRefFlags, "NORMALIZED_COORDINATES", INT2FIX(CU_TRSF_NORMALIZED_COORDINATES)); 2311 | 2312 | 
rb_cCUTexRef = rb_define_class_under(rb_mCU, "CUTexRef", rb_cObject); 2313 | rb_define_alloc_func(rb_cCUTexRef, texref_alloc); 2314 | rb_define_method(rb_cCUTexRef, "initialize", RUBY_METHOD_FUNC(texref_initialize), 0); 2315 | rb_define_method(rb_cCUTexRef, "create", RUBY_METHOD_FUNC(texref_create), 0); 2316 | rb_define_method(rb_cCUTexRef, "destroy", RUBY_METHOD_FUNC(texref_destroy), 0); 2317 | rb_define_method(rb_cCUTexRef, "get_address", RUBY_METHOD_FUNC(texref_get_address), 0); 2318 | rb_define_method(rb_cCUTexRef, "get_address_mode", RUBY_METHOD_FUNC(texref_get_address_mode), 1); 2319 | rb_define_method(rb_cCUTexRef, "get_filter_mode", RUBY_METHOD_FUNC(texref_get_filter_mode), 0); 2320 | rb_define_method(rb_cCUTexRef, "get_flags", RUBY_METHOD_FUNC(texref_get_flags), 0); 2321 | rb_define_method(rb_cCUTexRef, "set_address", RUBY_METHOD_FUNC(texref_set_address), 2); 2322 | rb_define_method(rb_cCUTexRef, "set_address_mode", RUBY_METHOD_FUNC(texref_set_address_mode), 2); 2323 | rb_define_method(rb_cCUTexRef, "set_filter_mode", RUBY_METHOD_FUNC(texref_set_filter_mode), 1); 2324 | rb_define_method(rb_cCUTexRef, "set_flags", RUBY_METHOD_FUNC(texref_set_flags), 1); 2325 | 2326 | rb_cCUResult = rb_define_class_under(rb_mCU, "CUResult", rb_cObject); 2327 | rb_define_const(rb_cCUResult, "SUCCESS", INT2FIX(CUDA_SUCCESS)); 2328 | rb_define_const(rb_cCUResult, "ERROR_INVALID_VALUE", INT2FIX(CUDA_ERROR_INVALID_VALUE)); 2329 | rb_define_const(rb_cCUResult, "ERROR_OUT_OF_MEMORY", INT2FIX(CUDA_ERROR_OUT_OF_MEMORY)); 2330 | rb_define_const(rb_cCUResult, "ERROR_NOT_INITIALIZED", INT2FIX(CUDA_ERROR_NOT_INITIALIZED)); 2331 | rb_define_const(rb_cCUResult, "ERROR_DEINITIALIZED", INT2FIX(CUDA_ERROR_DEINITIALIZED)); 2332 | rb_define_const(rb_cCUResult, "ERROR_NO_DEVICE", INT2FIX(CUDA_ERROR_NO_DEVICE)); 2333 | rb_define_const(rb_cCUResult, "ERROR_INVALID_DEVICE", INT2FIX(CUDA_ERROR_INVALID_DEVICE)); 2334 | rb_define_const(rb_cCUResult, "ERROR_INVALID_IMAGE", 
INT2FIX(CUDA_ERROR_INVALID_IMAGE)); 2335 | rb_define_const(rb_cCUResult, "ERROR_INVALID_CONTEXT", INT2FIX(CUDA_ERROR_INVALID_CONTEXT)); 2336 | rb_define_const(rb_cCUResult, "ERROR_CONTEXT_ALREADY_CURRENT", INT2FIX(CUDA_ERROR_CONTEXT_ALREADY_CURRENT)); 2337 | rb_define_const(rb_cCUResult, "ERROR_MAP_FAILED", INT2FIX(CUDA_ERROR_MAP_FAILED)); 2338 | rb_define_const(rb_cCUResult, "ERROR_UNMAP_FAILED", INT2FIX(CUDA_ERROR_UNMAP_FAILED)); 2339 | rb_define_const(rb_cCUResult, "ERROR_ARRAY_IS_MAPPED", INT2FIX(CUDA_ERROR_ARRAY_IS_MAPPED)); 2340 | rb_define_const(rb_cCUResult, "ERROR_ALREADY_MAPPED", INT2FIX(CUDA_ERROR_ALREADY_MAPPED)); 2341 | rb_define_const(rb_cCUResult, "ERROR_NO_BINARY_FOR_GPU", INT2FIX(CUDA_ERROR_NO_BINARY_FOR_GPU)); 2342 | rb_define_const(rb_cCUResult, "ERROR_ALREADY_ACQUIRED", INT2FIX(CUDA_ERROR_ALREADY_ACQUIRED)); 2343 | rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED", INT2FIX(CUDA_ERROR_NOT_MAPPED)); 2344 | rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED_AS_ARRAY", INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)); 2345 | rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED_AS_POINTER", INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_POINTER)); 2346 | rb_define_const(rb_cCUResult, "ERROR_ECC_UNCORRECTABLE", INT2FIX(CUDA_ERROR_ECC_UNCORRECTABLE)); 2347 | rb_define_const(rb_cCUResult, "ERROR_UNSUPPORTED_LIMIT", INT2FIX(CUDA_ERROR_UNSUPPORTED_LIMIT)); 2348 | rb_define_const(rb_cCUResult, "ERROR_INVALID_SOURCE", INT2FIX(CUDA_ERROR_INVALID_SOURCE)); 2349 | rb_define_const(rb_cCUResult, "ERROR_FILE_NOT_FOUND", INT2FIX(CUDA_ERROR_FILE_NOT_FOUND)); 2350 | rb_define_const(rb_cCUResult, "ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", INT2FIX(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND)); 2351 | rb_define_const(rb_cCUResult, "ERROR_SHARED_OBJECT_INIT_FAILED", INT2FIX(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)); 2352 | rb_define_const(rb_cCUResult, "ERROR_OPERATING_SYSTEM", INT2FIX(CUDA_ERROR_OPERATING_SYSTEM)); 2353 | rb_define_const(rb_cCUResult, "ERROR_INVALID_HANDLE", 
INT2FIX(CUDA_ERROR_INVALID_HANDLE)); 2354 | rb_define_const(rb_cCUResult, "ERROR_NOT_FOUND", INT2FIX(CUDA_ERROR_NOT_FOUND)); 2355 | rb_define_const(rb_cCUResult, "ERROR_NOT_READY", INT2FIX(CUDA_ERROR_NOT_READY)); 2356 | rb_define_const(rb_cCUResult, "ERROR_LAUNCH_FAILED", INT2FIX(CUDA_ERROR_LAUNCH_FAILED)); 2357 | rb_define_const(rb_cCUResult, "ERROR_LAUNCH_OUT_OF_RESOURCES", INT2FIX(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)); 2358 | rb_define_const(rb_cCUResult, "ERROR_LAUNCH_TIMEOUT", INT2FIX(CUDA_ERROR_LAUNCH_TIMEOUT)); 2359 | rb_define_const(rb_cCUResult, "ERROR_LAUNCH_INCOMPATIBLE_TEXTURING" , INT2FIX(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING)); 2360 | rb_define_const(rb_cCUResult, "ERROR_UNKNOWN", INT2FIX(CUDA_ERROR_UNKNOWN)); 2361 | 2362 | rb_eCUStandardError = rb_define_class_under(rb_mCU, "CUStandardError", rb_eStandardError); 2363 | 2364 | rb_eCUDeviceError = rb_define_class_under(rb_mCU, "CUDeviceError", rb_eCUStandardError); 2365 | rb_eCUDeviceNotInitializedError = rb_define_class_under(rb_mCU, "CUDeviceNotInitializedError", rb_eCUDeviceError); 2366 | rb_eCUDeviceDeinitializedError = rb_define_class_under(rb_mCU, "CUDeviceDeinitializedError", rb_eCUDeviceError); 2367 | rb_eCUNoDeviceError = rb_define_class_under(rb_mCU, "CUNoDeviceError", rb_eCUDeviceError); 2368 | rb_eCUInvalidDeviceError = rb_define_class_under(rb_mCU, "CUInvalidDeviceError", rb_eCUDeviceError); 2369 | 2370 | rb_eCUMapError = rb_define_class_under(rb_mCU, "CUMapError", rb_eCUStandardError); 2371 | rb_eCUMapFailedError = rb_define_class_under(rb_mCU, "CUMapFailedError", rb_eCUMapError); 2372 | rb_eCUUnMapFailedError = rb_define_class_under(rb_mCU, "CUUnMapFailedError", rb_eCUMapError); 2373 | rb_eCUArrayIsMappedError = rb_define_class_under(rb_mCU, "CUArrayIsMappedError", rb_eCUMapError); 2374 | rb_eCUAlreadyMappedError = rb_define_class_under(rb_mCU, "CUAlreadyMappedError", rb_eCUMapError); 2375 | rb_eCUNotMappedError = rb_define_class_under(rb_mCU, "CUNotMappedError", rb_eCUMapError); 2376 
| rb_eCUNotMappedAsArrayError = rb_define_class_under(rb_mCU, "CUNotMappedAsArrayError", rb_eCUMapError); 2377 | rb_eCUNotMappedAsPointerError = rb_define_class_under(rb_mCU, "CUNotMappedAsPointerError", rb_eCUMapError); 2378 | 2379 | rb_eCUContextError = rb_define_class_under(rb_mCU, "CUContextError", rb_eCUStandardError); 2380 | rb_eCUInvalidContextError = rb_define_class_under(rb_mCU, "CUInvalidContextError", rb_eCUContextError); 2381 | rb_eCUContextAlreadyCurrentError = rb_define_class_under(rb_mCU, "CUContextAlreadyCurrentError", rb_eCUContextError); 2382 | rb_eCUUnsupportedLimitError = rb_define_class_under(rb_mCU, "CUUnsupportedLimitError", rb_eCUContextError); 2383 | 2384 | rb_eCULaunchError = rb_define_class_under(rb_mCU, "CULaunchError", rb_eCUStandardError); 2385 | rb_eCULaunchFailedError = rb_define_class_under(rb_mCU, "CULaunchFailedError", rb_eCULaunchError); 2386 | rb_eCULaunchOutOfResourcesError = rb_define_class_under(rb_mCU, "CULaunchOutOfResourcesError", rb_eCULaunchError); 2387 | rb_eCULaunchTimeoutError = rb_define_class_under(rb_mCU, "CULaunchTimeoutError", rb_eCULaunchError); 2388 | rb_eCULaunchIncompatibleTexturingError = rb_define_class_under(rb_mCU, "CULaunchIncompatibleTexturingError", rb_eCULaunchError); 2389 | 2390 | rb_eCUParameterError = rb_define_class_under(rb_mCU, "CUParameterError", rb_eCUStandardError); 2391 | rb_eCUInvalidValueError = rb_define_class_under(rb_mCU, "CUInvalidValueError", rb_eCUParameterError); 2392 | rb_eCUInvalidHandleError = rb_define_class_under(rb_mCU, "CUInvalidHandleError", rb_eCUParameterError); 2393 | 2394 | rb_eCUMemoryError = rb_define_class_under(rb_mCU, "CUMemoryError", rb_eCUStandardError); 2395 | rb_eCUOutOfMemoryError = rb_define_class_under(rb_mCU, "CUOutOfMemoryError", rb_eCUMemoryError); 2396 | 2397 | rb_eCULibraryError = rb_define_class_under(rb_mCU, "CULibraryError", rb_eCUStandardError); 2398 | rb_eCUSharedObjectSymbolNotFoundError = rb_define_class_under(rb_mCU, 
"CUSharedObjectSymbolNotFoundError", rb_eCULibraryError); 2399 | rb_eCUSharedObjectInitFailedError = rb_define_class_under(rb_mCU, "CUSharedObjectInitFailedError", rb_eCULibraryError); 2400 | 2401 | rb_eCUHardwareError = rb_define_class_under(rb_mCU, "CUHardwareError", rb_eCUStandardError); 2402 | rb_eCUECCUncorrectableError = rb_define_class_under(rb_mCU, "CUECCUncorrectableError", rb_eCUHardwareError); 2403 | 2404 | rb_eCUFileError = rb_define_class_under(rb_mCU, "CUFileError", rb_eCUStandardError); 2405 | rb_eCUNoBinaryForGPUError = rb_define_class_under(rb_mCU, "CUNoBinaryForGPUError", rb_eCUFileError); 2406 | rb_eCUFileNotFoundError = rb_define_class_under(rb_mCU, "CUFileNotFoundError", rb_eCUFileError); 2407 | rb_eCUInvalidSourceError = rb_define_class_under(rb_mCU, "CUInvalidSourceError", rb_eCUFileError); 2408 | rb_eCUInvalidImageError = rb_define_class_under(rb_mCU, "CUInvalidImageError", rb_eCUFileError); 2409 | 2410 | rb_eCUReferenceError = rb_define_class_under(rb_mCU, "CUReferenceError", rb_eCUStandardError); 2411 | rb_eCUReferenceNotFoundError = rb_define_class_under(rb_mCU, "CUReferenceNotFoundError", rb_eCUReferenceError); 2412 | 2413 | rb_eCUOtherError = rb_define_class_under(rb_mCU, "CUOtherError", rb_eCUStandardError); 2414 | rb_eCUAlreadyAcquiredError = rb_define_class_under(rb_mCU, "CUAlreadyAcquiredError", rb_eCUOtherError); 2415 | rb_eCUNotReadyError = rb_define_class_under(rb_mCU, "CUNotReadyError", rb_eCUOtherError); 2416 | rb_eCUOperatingSystemError = rb_define_class_under(rb_mCU, "CUOperatingSystemError", rb_eCUOtherError); 2417 | 2418 | rb_eCUUnknownError = rb_define_class_under(rb_mCU, "CUUnknownError", rb_eCUStandardError); 2419 | 2420 | rb_error_class_by_enum = rb_hash_new(); 2421 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_INITIALIZED), rb_eCUDeviceNotInitializedError); 2422 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_DEINITIALIZED) , rb_eCUDeviceDeinitializedError); 2423 | 
rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NO_DEVICE) , rb_eCUNoDeviceError); 2424 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_DEVICE) , rb_eCUInvalidDeviceError); 2425 | 2426 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_MAP_FAILED) , rb_eCUMapFailedError); 2427 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNMAP_FAILED) , rb_eCUUnMapFailedError); 2428 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ARRAY_IS_MAPPED) , rb_eCUArrayIsMappedError); 2429 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ALREADY_MAPPED) , rb_eCUAlreadyMappedError); 2430 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED) , rb_eCUNotMappedError); 2431 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_ARRAY) , rb_eCUNotMappedAsArrayError); 2432 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_POINTER), rb_eCUNotMappedAsPointerError); 2433 | 2434 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_CONTEXT) , rb_eCUInvalidContextError); 2435 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_CONTEXT_ALREADY_CURRENT), rb_eCUContextAlreadyCurrentError); 2436 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNSUPPORTED_LIMIT) , rb_eCUUnsupportedLimitError); 2437 | 2438 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_FAILED) , rb_eCULaunchFailedError); 2439 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) , rb_eCULaunchOutOfResourcesError); 2440 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_TIMEOUT) , rb_eCULaunchTimeoutError); 2441 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING), rb_eCULaunchIncompatibleTexturingError); 2442 | 2443 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_VALUE) , rb_eCUInvalidValueError); 2444 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_HANDLE) , 
rb_eCUInvalidHandleError); 2445 | 2446 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_OUT_OF_MEMORY), rb_eCUOutOfMemoryError); 2447 | 2448 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND), rb_eCUSharedObjectSymbolNotFoundError); 2449 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) , rb_eCUSharedObjectInitFailedError); 2450 | 2451 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ECC_UNCORRECTABLE), rb_eCUECCUncorrectableError); 2452 | 2453 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NO_BINARY_FOR_GPU), rb_eCUNoBinaryForGPUError); 2454 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_FILE_NOT_FOUND) , rb_eCUFileNotFoundError); 2455 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_SOURCE) , rb_eCUInvalidSourceError); 2456 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_IMAGE) , rb_eCUInvalidImageError); 2457 | 2458 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_FOUND), rb_eCUReferenceNotFoundError); 2459 | 2460 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ALREADY_ACQUIRED), rb_eCUAlreadyAcquiredError); 2461 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_READY) , rb_eCUNotReadyError); 2462 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_OPERATING_SYSTEM), rb_eCUOperatingSystemError); 2463 | 2464 | rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNKNOWN), rb_eCUUnknownError); 2465 | 2466 | rb_cMemoryPointer = rb_define_class_under(rb_mCU, "MemoryPointer", rb_cObject); 2467 | rb_define_alloc_func(rb_cMemoryPointer, memory_pointer_alloc); 2468 | rb_define_method(rb_cMemoryPointer, "initialize", RUBY_METHOD_FUNC(memory_pointer_initialize), 0); 2469 | 2470 | rb_mIBuffer = rb_define_module_under(rb_mCU, "IBuffer"); 2471 | rb_define_singleton_method(rb_mIBuffer, "included", RUBY_METHOD_FUNC(module_included_classmethods_hook), 1); 2472 | 
rb_define_method(rb_mIBuffer, "initialize", RUBY_METHOD_FUNC(ibuffer_initialize), -1); 2473 | rb_define_method(rb_mIBuffer, "size", RUBY_METHOD_FUNC(ibuffer_size), 0); 2474 | rb_define_method(rb_mIBuffer, "page_locked?", RUBY_METHOD_FUNC(ibuffer_is_page_locked), 0); 2475 | rb_define_method(rb_mIBuffer, "offset", RUBY_METHOD_FUNC(ibuffer_offset), 1); 2476 | rb_define_method(rb_mIBuffer, "[]", RUBY_METHOD_FUNC(ibuffer_element_get), 1); 2477 | rb_define_method(rb_mIBuffer, "[]=", RUBY_METHOD_FUNC(ibuffer_element_set), 2); 2478 | 2479 | rb_mIBufferClassMethods = rb_define_module_under(rb_mIBuffer, "ClassMethods"); 2480 | rb_define_method(rb_mIBufferClassMethods, "element_size", RUBY_METHOD_FUNC(ibuffer_element_size), 0); 2481 | 2482 | rb_cMemoryBuffer = rb_define_class_under(rb_mCU, "MemoryBuffer", rb_cMemoryPointer); 2483 | rb_include_module(rb_cMemoryBuffer, rb_mIBuffer); 2484 | module_included_classmethods_hook(rb_mIBuffer, rb_cMemoryBuffer); 2485 | rb_define_alloc_func(rb_cMemoryBuffer, memory_buffer_alloc); 2486 | rb_define_singleton_method(rb_cMemoryBuffer, "element_size", RUBY_METHOD_FUNC(memory_buffer_element_size), 0); 2487 | rb_define_method(rb_cMemoryBuffer, "initialize", RUBY_METHOD_FUNC(memory_buffer_initialize), -1); 2488 | rb_define_method(rb_cMemoryBuffer, "size", RUBY_METHOD_FUNC(memory_buffer_size), 0); 2489 | rb_define_method(rb_cMemoryBuffer, "page_locked?", RUBY_METHOD_FUNC(memory_buffer_is_page_locked), 0); 2490 | rb_define_method(rb_cMemoryBuffer, "offset", RUBY_METHOD_FUNC(memory_buffer_offset), 1); 2491 | rb_define_method(rb_cMemoryBuffer, "[]", RUBY_METHOD_FUNC(memory_buffer_element_get), 1); 2492 | rb_define_method(rb_cMemoryBuffer, "[]=", RUBY_METHOD_FUNC(memory_buffer_element_set), 2); 2493 | 2494 | rb_cInt32Buffer = rb_define_class_under(rb_mCU, "Int32Buffer", rb_cMemoryBuffer); 2495 | rb_define_alloc_func(rb_cInt32Buffer, buffer_alloc); 2496 | rb_define_const(rb_cInt32Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(int))); 2497 | 
rb_define_singleton_method(rb_cInt32Buffer, "element_size", RUBY_METHOD_FUNC(static_cast(&buffer_element_size)), 0); 2498 | rb_define_method(rb_cInt32Buffer, "initialize", RUBY_METHOD_FUNC(static_cast(&buffer_initialize)) , -1); 2499 | rb_define_method(rb_cInt32Buffer, "size", RUBY_METHOD_FUNC(static_cast(&buffer_size)), 0); 2500 | rb_define_method(rb_cInt32Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast(&buffer_is_page_locked)), 0); 2501 | rb_define_method(rb_cInt32Buffer, "offset", RUBY_METHOD_FUNC(static_cast(&buffer_offset)), 1); 2502 | rb_define_method(rb_cInt32Buffer, "[]", RUBY_METHOD_FUNC(static_cast(&buffer_element_get)), 1); 2503 | rb_define_method(rb_cInt32Buffer, "[]=", RUBY_METHOD_FUNC(static_cast(&buffer_element_set)), 2); 2504 | 2505 | rb_cInt64Buffer = rb_define_class_under(rb_mCU, "Int64Buffer", rb_cMemoryBuffer); 2506 | rb_define_alloc_func(rb_cInt64Buffer, buffer_alloc); 2507 | rb_define_const(rb_cInt64Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(long))); 2508 | rb_define_singleton_method(rb_cInt64Buffer, "element_size", RUBY_METHOD_FUNC(static_cast(&buffer_element_size)), 0); 2509 | rb_define_method(rb_cInt64Buffer, "initialize", RUBY_METHOD_FUNC(static_cast(&buffer_initialize)) , -1); 2510 | rb_define_method(rb_cInt64Buffer, "size", RUBY_METHOD_FUNC(static_cast(&buffer_size)), 0); 2511 | rb_define_method(rb_cInt64Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast(&buffer_is_page_locked)), 0); 2512 | rb_define_method(rb_cInt64Buffer, "offset", RUBY_METHOD_FUNC(static_cast(&buffer_offset)), 1); 2513 | rb_define_method(rb_cInt64Buffer, "[]", RUBY_METHOD_FUNC(static_cast(&buffer_element_get)), 1); 2514 | rb_define_method(rb_cInt64Buffer, "[]=", RUBY_METHOD_FUNC(static_cast(&buffer_element_set)), 2); 2515 | 2516 | rb_cFloat32Buffer = rb_define_class_under(rb_mCU, "Float32Buffer", rb_cMemoryBuffer); 2517 | rb_define_alloc_func(rb_cFloat32Buffer, buffer_alloc); 2518 | rb_define_const(rb_cFloat32Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(float))); 2519 | 
rb_define_singleton_method(rb_cFloat32Buffer, "element_size", RUBY_METHOD_FUNC(static_cast(&buffer_element_size)), 0); 2520 | rb_define_method(rb_cFloat32Buffer, "initialize", RUBY_METHOD_FUNC(static_cast(&buffer_initialize)) , -1); 2521 | rb_define_method(rb_cFloat32Buffer, "size", RUBY_METHOD_FUNC(static_cast(&buffer_size)), 0); 2522 | rb_define_method(rb_cFloat32Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast(&buffer_is_page_locked)), 0); 2523 | rb_define_method(rb_cFloat32Buffer, "offset", RUBY_METHOD_FUNC(static_cast(&buffer_offset)), 1); 2524 | rb_define_method(rb_cFloat32Buffer, "[]", RUBY_METHOD_FUNC(static_cast(&buffer_element_get)), 1); 2525 | rb_define_method(rb_cFloat32Buffer, "[]=", RUBY_METHOD_FUNC(static_cast(&buffer_element_set)), 2); 2526 | 2527 | rb_cFloat64Buffer = rb_define_class_under(rb_mCU, "Float64Buffer", rb_cMemoryBuffer); 2528 | rb_define_alloc_func(rb_cFloat64Buffer, buffer_alloc); 2529 | rb_define_const(rb_cFloat64Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(double))); 2530 | rb_define_method(rb_cFloat64Buffer, "initialize", RUBY_METHOD_FUNC(static_cast(&buffer_initialize)) , -1); 2531 | rb_define_singleton_method(rb_cFloat64Buffer, "element_size", RUBY_METHOD_FUNC(static_cast(&buffer_element_size)), 0); 2532 | rb_define_method(rb_cFloat64Buffer, "size", RUBY_METHOD_FUNC(static_cast(&buffer_size)), 0); 2533 | rb_define_method(rb_cFloat64Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast(&buffer_is_page_locked)), 0); 2534 | rb_define_method(rb_cFloat64Buffer, "offset", RUBY_METHOD_FUNC(static_cast(&buffer_offset)), 1); 2535 | rb_define_method(rb_cFloat64Buffer, "[]", RUBY_METHOD_FUNC(static_cast(&buffer_element_get)), 1); 2536 | rb_define_method(rb_cFloat64Buffer, "[]=", RUBY_METHOD_FUNC(static_cast(&buffer_element_set)), 2); 2537 | 2538 | rb_define_module_function(rb_mCU, "memcpy_htod", RUBY_METHOD_FUNC(memcpy_htod), 3); 2539 | rb_define_module_function(rb_mCU, "memcpy_dtoh", RUBY_METHOD_FUNC(memcpy_dtoh), 3); 2540 | 
rb_define_module_function(rb_mCU, "memcpy_dtod", RUBY_METHOD_FUNC(memcpy_dtod), 3); 2541 | rb_define_module_function(rb_mCU, "memcpy_htod_async", RUBY_METHOD_FUNC(memcpy_htod_async), 4); 2542 | rb_define_module_function(rb_mCU, "memcpy_dtoh_async", RUBY_METHOD_FUNC(memcpy_dtoh_async), 4); 2543 | rb_define_module_function(rb_mCU, "memcpy_dtod_async", RUBY_METHOD_FUNC(memcpy_dtod_async), 4); 2544 | rb_define_module_function(rb_mCU, "mem_get_info", RUBY_METHOD_FUNC(mem_get_info), 0); 2545 | 2546 | rb_define_module_function(rb_mCU, "driver_get_version", RUBY_METHOD_FUNC(driver_get_version), 0); 2547 | 2548 | CUresult status = cuInit(0); 2549 | if (status != CUDA_SUCCESS) { 2550 | RAISE_CU_STD_ERROR(status, "Failed to initialize the CUDA driver API."); 2551 | } 2552 | } 2553 | 2554 | } // namespace 2555 | } // namespace 2556 | --------------------------------------------------------------------------------