├── sgc
    ├── .rvmrc
    ├── .gitignore
    ├── lib
    │   ├── rubycuda.rb
    │   ├── rubycu.rb
    │   ├── cuda
    │   │   ├── driver
    │   │   │   ├── extconf.rb
    │   │   │   ├── rubycu.o
    │   │   │   ├── rubycu.bundle
    │   │   │   ├── mkmf.log
    │   │   │   ├── Makefile
    │   │   │   └── rubycu.cpp
    │   │   ├── runtime
    │   │   │   ├── rubycuda.rb
    │   │   │   ├── version.rb
    │   │   │   ├── error.rb
    │   │   │   ├── thread.rb
    │   │   │   ├── cuda.rb
    │   │   │   ├── memory.rb
    │   │   │   ├── stream.rb
    │   │   │   ├── device.rb
    │   │   │   ├── event.rb
    │   │   │   ├── function.rb
    │   │   │   └── ffi-cuda.rb
    │   │   └── ruby
    │   │   │   └── cu.rb
    │   ├── madison
    │   │   ├── kernel
    │   │   │   ├── kernel.h
    │   │   │   ├── libkernel.so
    │   │   │   ├── libkernel.10.so
    │   │   │   ├── kernel.cu
    │   │   │   └── test.cu
    │   │   ├── matrix.rb
    │   │   └── comparable.rb
    │   ├── ffi
    │   │   └── prettystruct.rb
    │   └── memory
    │   │   ├── interface
    │   │       ├── ipointer.rb
    │   │       └── ibuffer.rb
    │   │   ├── pointer.rb
    │   │   └── buffer.rb
    ├── visualize.sh
    └── visualize.gp
└── .gitignore


/sgc/.rvmrc:
--------------------------------------------------------------------------------
1 | rvm 1.9.2
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | */libkernel.so
2 | 


--------------------------------------------------------------------------------
/sgc/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *.png
3 | a.out
4 | 


--------------------------------------------------------------------------------
/sgc/lib/rubycuda.rb:
--------------------------------------------------------------------------------
1 | require 'cuda/runtime/rubycuda'
2 | 


--------------------------------------------------------------------------------
/sgc/lib/rubycu.rb:
--------------------------------------------------------------------------------
1 | require 'cuda/driver/rubycu'
2 | require 'cuda/ruby/cu'
3 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/extconf.rb:
--------------------------------------------------------------------------------
1 | require 'mkmf'
2 | have_library("cuda")
3 | create_makefile("rubycu")
4 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/rubycu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/cuda/driver/rubycu.o


--------------------------------------------------------------------------------
/sgc/lib/madison/kernel/kernel.h:
--------------------------------------------------------------------------------
1 | #define DIMENSIONS 10
2 | #define BLOCK_SIZE 16
3 | #define CLUSTER_SIZE 16
4 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/rubycu.bundle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/cuda/driver/rubycu.bundle


--------------------------------------------------------------------------------
/sgc/lib/madison/kernel/libkernel.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/madison/kernel/libkernel.so


--------------------------------------------------------------------------------
/sgc/lib/madison/kernel/libkernel.10.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zog/Sandbox/master/sgc/lib/madison/kernel/libkernel.10.so


--------------------------------------------------------------------------------
/sgc/visualize.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | # visualize.sh
3 | 
4 | cat result.csv|sort -n -k1 -k2 > result_sorted.csv
5 | awk '{ print;if ((NR % 512) == 0)  printf("\n");}' result_sorted.csv > result_sorted_final.csv 
6 | gnuplot visualize.gp ; open heatmaps.png


--------------------------------------------------------------------------------
/sgc/lib/ffi/prettystruct.rb:
--------------------------------------------------------------------------------
 1 | require 'ffi'
 2 | 
 3 | 
 4 | module FFI
 5 | 
 6 | # This class is obtained from ffi-tk (https://github.com/Tass/ffi-tk).
 7 | class PrettyStruct < FFI::Struct
 8 |     ACCESSOR_CODE = <<-CODE
 9 |         def {name}; self[{sym}]; end
10 |         def {name}=(value) self[{sym}] = value; end
11 |     CODE
12 | 
13 |     def self.layout(*kvs)
14 |         kvs.each_slice(2) do |key, value|
15 |             eval ACCESSOR_CODE.gsub(/\{(.*?)\}/, '{name}' => key, '{sym}' => ":#{key}")
16 |         end
17 | 
18 |         super
19 |     end
20 | 
21 |     def inspect
22 |         kvs = members.zip(values)
23 |         kvs.map!{|key, value| "%s=%s" % [key, value.inspect] }
24 |         "<%s %s>" % [self.class, kvs.join(' ')]
25 |     end
26 | end
27 | 
28 | end # module
29 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/rubycuda.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | require 'cuda/runtime/cuda'
27 | require 'cuda/runtime/error'
28 | require 'cuda/runtime/version'
29 | require 'cuda/runtime/device'
30 | require 'cuda/runtime/thread'
31 | require 'cuda/runtime/memory'
32 | require 'cuda/runtime/function'
33 | require 'cuda/runtime/stream'
34 | require 'cuda/runtime/event'
35 | 


--------------------------------------------------------------------------------
/sgc/lib/memory/interface/ipointer.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | module SGC
26 | module Memory
27 | 
28 | module IMemoryPointer
29 | 
30 |     def initialize(value = nil); end
31 | 
32 |     def ptr; raise NotImplementedError; end
33 |     def ptr=(value); raise NotImplementedError; end
34 |     def offset(index); raise NotImplementedError; end
35 |     def ref; raise NotImplementedError; end
36 | 
37 | end
38 | 
39 | end # module
40 | end # module
41 | 


--------------------------------------------------------------------------------
/sgc/visualize.gp:
--------------------------------------------------------------------------------
 1 | set terminal png transparent nocrop enhanced font arial 8 size 1000, 1000 
 2 | set output 'heatmaps.png'
 3 | unset key
 4 | set view map
 5 | set style data linespoints
 6 | set xtics border in scale 0,0 mirror norotate  offset character 0, 0, 0
 7 | set ytics border in scale 0,0 mirror norotate  offset character 0, 0, 0
 8 | set ztics border in scale 0,0 nomirror norotate  offset character 0, 0, 0
 9 | set nocbtics
10 | set title "Heat Map generated by 'plot' from a stream of XYZ values\nNB: Rows must be separated by blank lines!" 
11 | set rrange [ * : * ] noreverse nowriteback  # (currently [8.98847e+307:-8.98847e+307] )
12 | set trange [ * : * ] noreverse nowriteback  # (currently [-5.00000:5.00000] )
13 | set urange [ * : * ] noreverse nowriteback  # (currently [-5.00000:5.00000] )
14 | set vrange [ * : * ] noreverse nowriteback  # (currently [-5.00000:5.00000] )
15 | set xrange [ -0.5 : * ] noreverse nowriteback
16 | set x2range [ * : * ] noreverse nowriteback  # (currently [-0.500000:4.50000] )
17 | set yrange [ -0.5 : * ] noreverse nowriteback
18 | set y2range [ * : * ] noreverse nowriteback  # (currently [-0.500000:4.50000] )
19 | set zrange [ 0.0 : 1.0 ] noreverse nowriteback  # (currently [0.00000:5.00000] )
20 | set cblabel "Score" 
21 | set cbrange [ 0.00000 : * ] noreverse nowriteback
22 | set palette rgbformulae -7, 2, -7
23 | plot 'result_sorted_final.csv' using 2:1:3 with image
24 | 


--------------------------------------------------------------------------------
/sgc/lib/madison/kernel/kernel.cu:
--------------------------------------------------------------------------------
 1 | extern "C" {
 2 |   #include "kernel.h"
 3 | 
 4 |   __global__ void MatPopulate(float *A, int count) 
 5 |   { 
 6 |     int row = blockIdx.x;
 7 |     int col = threadIdx.x;
 8 |     A[row * DIMENSIONS + col] = (float)(row * DIMENSIONS + col)/(DIMENSIONS*count);
 9 |   } 
10 |   
11 |   float score(float *A, float *B){
12 |     float score = 0.0;
13 |     for(int i=0; i<DIMENSIONS; i++){
14 |       score += A[i] * B[i];
15 |     }
16 |     return score;
17 |   }
18 |   
19 |   __global__ void ParallelScore(float *A, float *B, int* keysA, int* keysB, float *scores, int count)
20 |   { 
21 |     float *q = &A[(blockIdx.x*blockDim.x + threadIdx.x)*DIMENSIONS];
22 |     int *keys_q = &keysA[(blockIdx.x*blockDim.x + threadIdx.x)*DIMENSIONS];
23 |     
24 |     float *p;
25 |     int *keys_p;
26 |     __syncthreads();
27 |     float _score;
28 |     for(int j=0; j<count; j++){
29 |       _score = 0.0;
30 |       p = &B[j*DIMENSIONS];
31 |       keys_p = &keysB[j*DIMENSIONS];
32 |       
33 |       __syncthreads();
34 |       for(int k=0; k<DIMENSIONS; k++){
35 |         int key = keys_q[k];
36 |         for(int l=0; l<DIMENSIONS; l++){
37 |           if(keys_p[l] == key){
38 |             _score += q[k] * p[l];
39 |           }
40 |         }
41 |         
42 |       }
43 |       __syncthreads();
44 |       scores[(blockIdx.x*blockDim.x + threadIdx.x) * count + j] = sqrt(_score);
45 |       //scores[(blockIdx.x*blockDim.x + threadIdx.x) * count + j] = DIMENSIONS+1;
46 |     }
47 |   } 
48 | }
49 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/version.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | 
27 | 
28 | module SGC
29 | module Cuda
30 | 
31 | def driver_version
32 |     p = FFI::MemoryPointer.new(:int)
33 |     status = API::cudaDriverGetVersion(p)
34 |     Pvt::handle_error(status)
35 |     p.read_int
36 | end
37 | module_function :driver_version
38 | 
39 | 
40 | def runtime_version
41 |     p = FFI::MemoryPointer.new(:int)
42 |     status = API::cudaRuntimeGetVersion(p)
43 |     Pvt::handle_error(status)
44 |     p.read_int
45 | end
46 | module_function :runtime_version
47 | 
48 | end # module
49 | end # module
50 | 


--------------------------------------------------------------------------------
/sgc/lib/memory/interface/ibuffer.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'memory/interface/ipointer'
26 | 
27 | 
28 | module SGC
29 | module Memory
30 | 
31 | module IBuffer
32 | 
33 |     include IMemoryPointer
34 | 
35 |     def initialize(type, size); end
36 | 
37 |     def [](index); raise NotImplementedError; end
38 |     def []=(index, value); raise NotImplementedError; end
39 |     def size; raise NotImplementedError; end
40 |     def element_size; raise NotImplementedError; end
41 | 
42 |     module ClassMethods
43 |         def element_size(type); raise NotImplementedError; end
44 |     end
45 | 
46 |     def self.included(base)
47 |         base.extend(ClassMethods)
48 |     end
49 | 
50 | end
51 | 
52 | end # module
53 | end # module
54 | 


--------------------------------------------------------------------------------
/sgc/lib/memory/pointer.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'ffi'
26 | require 'memory/interface/ipointer'
27 | 
28 | 
29 | module SGC
30 | module Memory
31 | 
32 | class MemoryPointer
33 | 
34 |     include IMemoryPointer
35 | 
36 | 
37 |     def initialize(v = nil)
38 |         @p = FFI::MemoryPointer.new(:pointer)
39 |         @p.write_pointer(v)
40 |     end
41 | 
42 | 
43 |     def ptr
44 |         @p.read_pointer
45 |     end
46 | 
47 | 
48 |     def ptr=(v)
49 |         @p.write_pointer(v)
50 |         v
51 |     end
52 | 
53 | 
54 |     def offset(i)
55 |         MemoryPointer.new(@p.read_pointer.to_i + i)
56 |     end
57 | 
58 | 
59 |     def ref
60 |         @p
61 |     end
62 | 
63 | end
64 | 
65 | end # module
66 | end # module
67 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/error.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | 
27 | 
28 | module SGC
29 | module Cuda
30 | 
31 |     def get_error_string(e)
32 |         API::cudaGetErrorString(e)
33 |     end
34 |     module_function :get_error_string
35 | 
36 | 
37 |     def get_last_error
38 |         API::cudaGetLastError
39 |     end
40 |     module_function :get_last_error
41 | 
42 | 
43 |     def peek_at_last_error
44 |         API::cudaPeekAtLastError
45 |     end
46 |     module_function :peek_at_last_error
47 | 
48 | module Pvt
49 | 
50 |     CUDA_SUCCESS = API::CudaError[:cudaSuccess]
51 |     CUDA_ERROR_NOT_READY = API::CudaError[:cudaErrorNotReady]
52 | 
53 |     def self.handle_error(status)
54 |         status == CUDA_SUCCESS or raise API::cudaGetErrorString(status)
55 |         nil
56 |     end
57 | 
58 | end
59 | 
60 | end # module
61 | end # module
62 | 


--------------------------------------------------------------------------------
/sgc/lib/madison/matrix.rb:
--------------------------------------------------------------------------------
 1 | module Madison
 2 |   require 'rubycuda'
 3 |   require 'madison/comparable'
 4 |   
 5 |   
 6 |   class Dimension
 7 |     # A vectors dimension key => value
 8 |     attr_accessor :i, :j
 9 |     
10 |     def initialize matrix, i, j
11 |       @matrix = matrix
12 |       @i = i
13 |       @j = j
14 |     end
15 |     
16 |     def value= value
17 |       @matrix.values[@i*@matrix.vectors_dimension + @j] = value
18 |     end
19 |     
20 |     def key= value
21 |       @matrix.keys[@i*@matrix.vectors_dimension + @j] = value
22 |     end
23 |     
24 |     def value
25 |       @matrix.values[@i*@matrix.vectors_dimension + @j]
26 |     end
27 |     
28 |     def key
29 |       @matrix.keys[@i*@matrix.vectors_dimension + @j]
30 |     end
31 |     
32 |     def inspect
33 |       "#<Madison::Dimension (#{i}, #{j}) #{key} => #{value}>"
34 |     end
35 |   end
36 |   
37 |   class Matrix
38 |     include SGC::Cuda
39 |     include Madison::Comparable
40 |     
41 |     attr_reader :vectors_dimension
42 |     attr_reader :count
43 |     attr_reader :size
44 |     attr_accessor :keys, :values
45 | 
46 |     def initialize type, vectors_count, vectors_dimension
47 |       @last_id = 0
48 |       @count = vectors_count
49 |       @vectors_dimension = vectors_dimension
50 |       @size = vectors_count * vectors_dimension
51 |       @type = type
52 |       @type_size = Buffer.element_size(type)
53 |       @dimensions = Hash.new{|h, k| h[k] = {}}
54 |       
55 |       # the matrix used to store the vector dimensions values
56 |       @values = Buffer.new(type, @size)
57 |       
58 |       # the matrix used to store the vector dimensions keys
59 |       @keys = Buffer.new(:int, @size)
60 |     end
61 |     
62 |     def inspect
63 |       "#<Madison::Matrix #{count}x#{vectors_dimension} @last_id=#{@last_id}>"
64 |     end
65 |     
66 |     def dimensions i, j
67 |       @dimensions[i][j] ||= Dimension.new self, i, j
68 |     end
69 | 
70 |     def << vector
71 |       raise "Already full" unless @last_id <= @count
72 |       (0...[vector.size, @vectors_dimension].min).each do |k| 
73 |         dimensions(@last_id, k).value = vector.values[k]
74 |         dimensions(@last_id, k).key = vector.keys[k].hash
75 |       end
76 |       @last_id += 1
77 |       self
78 |     end
79 |   end
80 | end


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/thread.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | require 'cuda/runtime/cuda'
27 | require 'cuda/runtime/error'
28 | 
29 | 
30 | module SGC
31 | module Cuda
32 | 
33 | class CudaThread
34 | 
35 |     def self.exit
36 |         status = API::cudaThreadExit
37 |         Pvt::handle_error(status)
38 |         self
39 |     end
40 | 
41 | 
42 |     def self.cache_config
43 |         p = FFI::MemoryPointer.new(:int)
44 |         status = API::cudaThreadGetCacheConfig(p)
45 |         Pvt::handle_error(status)
46 |         CudaFuncCache[p.read_int]
47 |     end
48 | 
49 | 
50 |     def self.cache_config=(config)
51 |         status = API::cudaThreadSetCacheConfig(config)
52 |         Pvt::handle_error(status)
53 |         config
54 |     end
55 | 
56 | 
57 |     def self.limit(limit)
58 |         p = FFI::MemoryPointer.new(:size_t)
59 |         status = API::cudaThreadGetLimit(p, limit)
60 |         Pvt::handle_error(status)
61 |         p.read_long
62 |     end
63 | 
64 | 
65 |     def self.limit=(*limit_value_pair)
66 |         limit, value = limit_value_pair.flatten
67 |         status = API::cudaThreadSetLimit(limit, value)
68 |         Pvt::handle_error(status)
69 |         limit_value_pair
70 |     end
71 | 
72 | 
73 |     def self.synchronize
74 |         status = API::cudaThreadSynchronize
75 |         Pvt::handle_error(status)
76 |         self
77 |     end
78 | 
79 | end
80 | 
81 | end # module
82 | end # module
83 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/cuda.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | require 'memory/buffer'
27 | 
28 | 
29 | module SGC
30 | module Cuda
31 | 
32 |     CudaError_t = CudaError = API::CudaError
33 |     CudaDeviceFlags = API::CudaDeviceFlags
34 |     CudaEventFlags = API::CudaEventFlags
35 |     CudaHostAllocFlags = API::CudaHostAllocFlags
36 |     CudaArrayFlags = API::CudaArrayFlags
37 |     CudaMemcpyKind = API::CudaMemcpyKind
38 |     CudaChannelFormatKind = API::CudaChannelFormatKind
39 |     CudaFuncCache = API::CudaFuncCache
40 |     CudaLimit = API::CudaLimit
41 |     CudaComputeMode = API::CudaComputeMode
42 |     CudaSurfaceBoundaryMode = API::CudaSurfaceBoundaryMode
43 |     CudaSurfaceFormatMode = API::CudaSurfaceFormatMode
44 |     CudaTextureAddressMode = API::CudaTextureAddressMode
45 |     CudaTextureFilterMode = API::CudaTextureFilterMode
46 |     CudaTextureReadMode = API::CudaTextureReadMode
47 | 
48 |     Dim3 = API::Dim3
49 |     CudaDeviceProp = API::CudaDeviceProp
50 |     CudaFuncAttributes = API::CudaFuncAttributes
51 |     CudaChannelFormatDesc = API::CudaChannelFormatDesc
52 |     CudaPitchedPtr = API::CudaPitchedPtr
53 |     CudaPos = API::CudaPos
54 |     CudaExtent = API::CudaExtent
55 |     CudaMemcpy3DParms = API::CudaMemcpy3DParms
56 |     TextureReference = API::TextureReference
57 |     SurfaceReference = API::SurfaceReference
58 | 
59 |     Buffer = SGC::Memory::Buffer
60 | 
61 | end # module
62 | end # module
63 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/mkmf.log:
--------------------------------------------------------------------------------
 1 | have_library: checking for main() in -lcuda... -------------------- no
 2 | 
 3 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE    -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long  -fno-common -pipe conftest.c  -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib     -lruby.1.9.1-static  -lpthread -ldl -lobjc "
 4 | checked program was:
 5 | /* begin */
 6 | 1: #include "ruby.h"
 7 | 2: 
 8 | 3: int main() {return 0;}
 9 | /* end */
10 | 
11 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE    -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long  -fno-common -pipe conftest.c  -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib     -lruby.1.9.1-static -lcuda  -lpthread -ldl -lobjc "
12 | ld: library not found for -lcuda
13 | collect2: ld returned 1 exit status
14 | checked program was:
15 | /* begin */
16 | 1: #include "ruby.h"
17 | 2: 
18 | 3: /*top*/
19 | 4: int main() {return 0;}
20 | 5: int t() { void ((*volatile p)()); p = (void ((*)()))main; return 0; }
21 | /* end */
22 | 
23 | "gcc -o conftest -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/i386-darwin9.8.0 -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/ruby/backward -I/Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1 -I. -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE    -O3 -ggdb -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long  -fno-common -pipe conftest.c  -L. -L/Users/zog/.rvm/rubies/ruby-1.9.2-p136/lib -L. -L/usr/local/lib     -lruby.1.9.1-static -lcuda  -lpthread -ldl -lobjc "
24 | ld: library not found for -lcuda
25 | collect2: ld returned 1 exit status
26 | checked program was:
27 | /* begin */
28 | 1: #include "ruby.h"
29 | 2: 
30 | 3: /*top*/
31 | 4: int main() {return 0;}
32 | 5: int t() { main(); return 0; }
33 | /* end */
34 | 
35 | --------------------
36 | 
37 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/memory.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | require 'cuda/runtime/error'
27 | require 'memory/pointer'
28 | 
29 | 
30 | module SGC
31 | module Cuda
32 | 
33 | class CudaDeviceMemory
34 | 
35 |     def self.malloc(nbytes)
36 |         p = SGC::Memory::MemoryPointer.new
37 |         status = API::cudaMalloc(p.ref, nbytes)
38 |         Pvt::handle_error(status)
39 |         p
40 |     end
41 | 
42 | 
43 |     def self.free(devptr)
44 |         status = API::cudaFree(devptr.ptr)
45 |         Pvt::handle_error(status)
46 |         nil
47 |     end
48 | 
49 | end
50 | 
51 | 
52 | module CudaMemory
53 | 
54 |     def memcpy(dst_ptr, src_ptr, nbytes, memcpy_kind)
55 |         status = API::cudaMemcpy(dst_ptr.ptr, src_ptr.ptr, nbytes, memcpy_kind)
56 |         Pvt::handle_error(status)
57 |     end
58 |     module_function :memcpy
59 | 
60 |     def memcpy_htoh(dst_ptr, src_ptr, nbytes)
61 |         memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyHostToHost)
62 |     end
63 |     module_function :memcpy_htoh
64 | 
65 |     def memcpy_htod(dst_ptr, src_ptr, nbytes)
66 |         memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyHostToDevice)
67 |     end
68 |     module_function :memcpy_htod
69 | 
70 |     def memcpy_dtoh(dst_ptr, src_ptr, nbytes)
71 |         memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyDeviceToHost)
72 |     end
73 |     module_function :memcpy_dtoh
74 | 
75 |     def memcpy_dtod(dst_ptr, src_ptr, nbytes)
76 |         memcpy(dst_ptr, src_ptr, nbytes, :cudaMemcpyDeviceToDevice)
77 |     end
78 |     module_function :memcpy_dtod
79 | 
80 | end
81 | 
82 | end # module
83 | end # module
84 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/stream.rb:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2010 Chung Shin Yee
 3 | #
 4 | #       shinyee@speedgocomputing.com
 5 | #       http://www.speedgocomputing.com
 6 | #       http://github.com/xman/sgc-ruby-cuda
 7 | #       http://rubyforge.org/projects/rubycuda
 8 | #
 9 | # This file is part of SGC-Ruby-CUDA.
10 | #
11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
12 | # it under the terms of the GNU General Public License as published by
13 | # the Free Software Foundation, either version 3 of the License, or
14 | # (at your option) any later version.
15 | #
16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19 | # GNU General Public License for more details.
20 | #
21 | # You should have received a copy of the GNU General Public License
22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
23 | #
24 | 
25 | require 'cuda/runtime/ffi-cuda'
26 | require 'cuda/runtime/error'
27 | 
28 | 
29 | module SGC
30 | module Cuda
31 | 
32 | class CudaStream
33 | 
34 |     def initialize
35 |         @p = FFI::MemoryPointer.new(:pointer)
36 |     end
37 | 
38 | 
39 |     def create
40 |         status = API::cudaStreamCreate(@p)
41 |         Pvt::handle_error(status)
42 |         self
43 |     end
44 | 
45 | 
46 |     def destroy
47 |         status = API::cudaStreamDestroy(@p.read_pointer)
48 |         Pvt::handle_error(status)
49 |         @p.write_pointer(0)
50 |         nil
51 |     end
52 | 
53 | 
54 |     def query
55 |         status = API::cudaStreamQuery(@p.read_pointer)
56 |         if status == Pvt::CUDA_SUCCESS
57 |             return true
58 |         elsif status == Pvt::CUDA_ERROR_NOT_READY
59 |             return false
60 |         end
61 |         Pvt::hanld_error(status)
62 |         self
63 |     end
64 | 
65 | 
66 |     def synchronize
67 |         status = API::cudaStreamSynchronize(@p.read_pointer)
68 |         Pvt::handle_error(status)
69 |         self
70 |     end
71 | 
72 | 
73 |     def wait_event(event, flags = 0)
74 |         status = API::cudaStreamWaitEvent(@p.read_pointer, event, flags)
75 |         Pvt::handle_error(status)
76 |         self
77 |     end
78 | 
79 | 
80 |     def self.wait_event(event, flags = 0)
81 |         p = FFI::MemoryPointer.new(:pointer)
82 |         p.write_pointer(0)
83 |         status = API::cudaStreamWaitEvent(p.read_pointer, event, flags)
84 |         Pvt::handle_error(status)
85 |         self
86 |     end
87 | 
88 |     def to_ptr
89 |         @p.read_pointer
90 |     end
91 | 
92 | end
93 | 
94 | end # module
95 | end # module
96 | 


--------------------------------------------------------------------------------
/sgc/lib/memory/buffer.rb:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2010 Chung Shin Yee
  3 | #
  4 | #       shinyee@speedgocomputing.com
  5 | #       http://www.speedgocomputing.com
  6 | #       http://github.com/xman/sgc-ruby-cuda
  7 | #       http://rubyforge.org/projects/rubycuda
  8 | #
  9 | # This file is part of SGC-Ruby-CUDA.
 10 | #
 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 12 | # it under the terms of the GNU General Public License as published by
 13 | # the Free Software Foundation, either version 3 of the License, or
 14 | # (at your option) any later version.
 15 | #
 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 | # GNU General Public License for more details.
 20 | #
 21 | # You should have received a copy of the GNU General Public License
 22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 23 | #
 24 | 
 25 | require 'ffi'
 26 | 
 27 | require 'memory/interface/ibuffer'
 28 | require 'memory/pointer'
 29 | 
 30 | 
 31 | module SGC
 32 | module Memory
 33 | 
 34 | class Buffer
 35 | 
 36 |     include IBuffer
 37 | 
 38 | 
 39 |     def initialize(type, size)
 40 |         @@reads[type] && @@writes[type] or raise "Invalid buffer element type."
 41 | 
 42 |         @reader = @@reads[type]
 43 |         @writer = @@writes[type]
 44 |         @ptr = FFI::MemoryPointer.new(type, size)
 45 |         @size = size
 46 |     end
 47 | 
 48 | 
 49 |     def [](i)
 50 |         assert_index(i)
 51 |         @ptr[i].send(@reader)
 52 |     end
 53 | 
 54 | 
 55 |     def []=(i, v)
 56 |         assert_index(i)
 57 |         @ptr[i].send(@writer, v)
 58 |         v
 59 |     end
 60 | 
 61 | 
 62 |     def size
 63 |         @size
 64 |     end
 65 | 
 66 | 
 67 |     def element_size
 68 |         @ptr.type_size
 69 |     end
 70 | 
 71 | 
 72 |     def ptr
 73 |         @ptr
 74 |     end
 75 | 
 76 | 
 77 |     def offset(i)
 78 |         assert_index(i)
 79 |         MemoryPointer.new(@ptr[i])
 80 |     end
 81 | 
 82 | 
 83 |     def self.element_size(type)
 84 |         @@sizes[type]
 85 |     end
 86 | 
 87 | protected
 88 | 
 89 |     def assert_index(i)
 90 |         i >= 0 && i < @size or raise IndexError, "Invalid index to buffer: index = #{i}. Expect index in 0..#{@size-1}"
 91 |     end
 92 |     
 93 |     @@reads = { int: :read_int, long: :read_long, float: :read_float }
 94 |     @@writes = { int: :write_int, long: :write_long, float: :write_float }
 95 |     @@sizes = { int: 4, long: FFI::TypeDefs[:long].size, float: 4 }
 96 | 
 97 | end
 98 | 
 99 | end # module
100 | end # module
101 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/device.rb:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2010 Chung Shin Yee
  3 | #
  4 | #       shinyee@speedgocomputing.com
  5 | #       http://www.speedgocomputing.com
  6 | #       http://github.com/xman/sgc-ruby-cuda
  7 | #       http://rubyforge.org/projects/rubycuda
  8 | #
  9 | # This file is part of SGC-Ruby-CUDA.
 10 | #
 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 12 | # it under the terms of the GNU General Public License as published by
 13 | # the Free Software Foundation, either version 3 of the License, or
 14 | # (at your option) any later version.
 15 | #
 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 | # GNU General Public License for more details.
 20 | #
 21 | # You should have received a copy of the GNU General Public License
 22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 23 | #
 24 | 
 25 | require 'cuda/runtime/ffi-cuda'
 26 | require 'cuda/runtime/error'
 27 | 
 28 | 
 29 | module SGC
 30 | module Cuda
 31 | 
 32 | class CudaDevice
 33 | 
 34 |     def self.count
 35 |         p = FFI::MemoryPointer.new(:int)
 36 |         status = API::cudaGetDeviceCount(p)
 37 |         Pvt::handle_error(status)
 38 |         p.read_int
 39 |     end
 40 | 
 41 | 
 42 |     def self.get
 43 |         p = FFI::MemoryPointer.new(:int)
 44 |         status = API::cudaGetDevice(p)
 45 |         Pvt::handle_error(status)
 46 |         p.read_int
 47 |     end
 48 |     class << self; alias_method :current, :get; end
 49 | 
 50 | 
 51 |     def self.set(devid)
 52 |         status = API::cudaSetDevice(devid)
 53 |         Pvt::handle_error(status)
 54 |         self
 55 |     end
 56 |     class << self; alias_method :current=, :set; end
 57 | 
 58 | 
 59 |     def self.choose(prop)
 60 |         pdev = FFI::MemoryPointer.new(:int)
 61 |         status = API::cudaChooseDevice(pdev, prop.to_ptr)
 62 |         Pvt::handle_error(status)
 63 |         pdev.read_int
 64 |     end
 65 | 
 66 | 
 67 |     def self.properties(devid = self.get)
 68 |         prop = API::CudaDeviceProp.new
 69 |         status = API::cudaGetDeviceProperties(prop.to_ptr, devid)
 70 |         Pvt::handle_error(status)
 71 |         prop
 72 |     end
 73 | 
 74 | 
 75 |     def self.flags=(flags)
 76 |         if flags.is_a?(Symbol)
 77 |             flags = CudaDeviceFlags[flags]
 78 |         end
 79 | 
 80 |         status = API::cudaSetDeviceFlags(flags)
 81 |         Pvt::handle_error(status)
 82 |         flags
 83 |     end
 84 | 
 85 | 
 86 |     def self.valid_devices=(devs)
 87 |         p = FFI::MemoryPointer.new(:int, devs.count)
 88 |         devs.each_with_index do |devid, i|
 89 |             p[i].write_int(devid)
 90 |         end
 91 |         status = API::cudaSetValidDevices(p, devs.count)
 92 |         Pvt::handle_error(status)
 93 |         devs
 94 |     end
 95 | 
 96 | end
 97 | 
 98 | end # module
 99 | end # module
100 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/event.rb:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2010 Chung Shin Yee
  3 | #
  4 | #       shinyee@speedgocomputing.com
  5 | #       http://www.speedgocomputing.com
  6 | #       http://github.com/xman/sgc-ruby-cuda
  7 | #       http://rubyforge.org/projects/rubycuda
  8 | #
  9 | # This file is part of SGC-Ruby-CUDA.
 10 | #
 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 12 | # it under the terms of the GNU General Public License as published by
 13 | # the Free Software Foundation, either version 3 of the License, or
 14 | # (at your option) any later version.
 15 | #
 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 | # GNU General Public License for more details.
 20 | #
 21 | # You should have received a copy of the GNU General Public License
 22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 23 | #
 24 | 
 25 | require 'cuda/runtime/ffi-cuda'
 26 | require 'cuda/runtime/error'
 27 | 
 28 | 
 29 | module SGC
 30 | module Cuda
 31 | 
 32 | class CudaEvent
 33 | 
 34 |     def initialize
 35 |         @p = FFI::MemoryPointer.new(:pointer)
 36 |     end
 37 | 
 38 | 
 39 |     def create(flags = CUDA_EVENT_DEFAULT)
 40 |         if flags == CUDA_EVENT_DEFAULT
 41 |             status = API::cudaEventCreate(@p)
 42 |         else
 43 |             flags = CudaEventFlags[flags] if flags.is_a?(Symbol)
 44 |             status = API::cudaEventCreateWithFlags(@p, flags)
 45 |         end
 46 |         Pvt::handle_error(status)
 47 |         self
 48 |     end
 49 | 
 50 | 
 51 |     def destroy
 52 |         status = API::cudaEventDestroy(@p.read_pointer)
 53 |         Pvt::handle_error(status)
 54 |         @p.write_pointer(0)
 55 |         nil
 56 |     end
 57 | 
 58 | 
 59 |     def query
 60 |         status = API::cudaEventQuery(@p.read_pointer)
 61 |         if status == Pvt::CUDA_SUCCESS
 62 |             return true
 63 |         elsif status == Pvt::CUDA_ERROR_NOT_READ
 64 |             return false
 65 |         end
 66 |         Pvt::handle_error(status)
 67 |         self
 68 |     end
 69 | 
 70 | 
 71 |     def record(stream = 0)
 72 |         if stream == 0
 73 |             p = FFI::MemoryPointer.new(:pointer)
 74 |             p.write_pointer(0)
 75 |             stream = p.read_pointer
 76 |         else
 77 |             stream = stream.to_ptr
 78 |         end
 79 |         status = API::cudaEventRecord(@p.read_pointer, stream)
 80 |         Pvt::handle_error(status)
 81 |         self
 82 |     end
 83 | 
 84 | 
 85 |     def synchronize
 86 |         status = API::cudaEventSynchronize(@p.read_pointer)
 87 |         Pvt::handle_error(status)
 88 |         self
 89 |     end
 90 | 
 91 | 
 92 |     def to_ptr
 93 |         @p.read_pointer
 94 |     end
 95 | 
 96 | 
 97 |     def self.elapsed_time(event_start, event_end)
 98 |         t = FFI::MemoryPointer.new(:float)
 99 |         API::cudaEventElapsedTime(t, event_start.to_ptr, event_end.to_ptr)
100 |         t.read_float
101 |     end
102 | 
103 | protected
104 | 
105 |     CUDA_EVENT_DEFAULT = CudaEventFlags[:cudaEventDefault]
106 | 
107 | end
108 | 
109 | end # module
110 | end # module
111 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/ruby/cu.rb:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2010 Chung Shin Yee
  2 | #
  3 | #       shinyee@speedgocomputing.com
  4 | #       http://www.speedgocomputing.com
  5 | #       http://github.com/xman/sgc-ruby-cuda
  6 | #       http://rubyforge.org/projects/rubycuda
  7 | #
  8 | # This file is part of SGC-Ruby-CUDA.
  9 | #
 10 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 11 | # it under the terms of the GNU General Public License as published by
 12 | # the Free Software Foundation, either version 3 of the License, or
 13 | # (at your option) any later version.
 14 | #
 15 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 18 | # GNU General Public License for more details.
 19 | #
 20 | # You should have received a copy of the GNU General Public License
 21 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 22 | 
 23 | 
 24 | module SGC
 25 | module CU
 26 | 
 27 | 
 28 | class CUDevice
 29 | 
 30 |     # See CUDevice::get_count.
 31 |     def self.count
 32 |         self.get_count
 33 |     end
 34 | 
 35 |     # See CUDevice#get_name.
 36 |     def name
 37 |         get_name
 38 |     end
 39 | 
 40 |     # See CUDevice#get_attribute.
 41 |     def attribute(attr)
 42 |         get_attribute(attr)
 43 |     end
 44 | 
 45 |     # See CUDevice#get_properties.
 46 |     def properties
 47 |         get_properties
 48 |     end
 49 | 
 50 | end
 51 | 
 52 | 
 53 | class CUContext
 54 | 
 55 |     # See CUContext::get_device.
 56 |     def self.device
 57 |         self.get_device
 58 |     end
 59 | 
 60 |     # See CUContext::get_limit.
 61 |     def self.limit(lim)
 62 |         get_limit(lim)
 63 |     end
 64 | 
 65 |     # See CUContext::get_cache_config.
 66 |     def self.cache_config
 67 |         get_cache_config
 68 |     end
 69 | 
 70 |     # See CUContext#get_api_version.
 71 |     def api_version
 72 |         get_api_version
 73 |     end
 74 | 
 75 | end
 76 | 
 77 | 
 78 | class CUModule
 79 | 
 80 |     # See CUModule#get_function.
 81 |     def function(name_str)
 82 |         get_function(name_str)
 83 |     end
 84 | 
 85 |     # See CUModule#get_global.
 86 |     def global(name_str)
 87 |         get_global(name_str)
 88 |     end
 89 | 
 90 |     # See CUModule#get_texref.
 91 |     def texref(name_str)
 92 |         get_texref(name_str)
 93 |     end
 94 | 
 95 | end
 96 | 
 97 | 
 98 | class CUFunction
 99 | 
100 |     # See CUFunction#get_attribute.
101 |     def attribute(attr)
102 |         get_attribute(attr)
103 |     end
104 | 
105 | end
106 | 
107 | 
108 | class CUTexRef
109 | 
110 |     # See CUTexRef#get_address.
111 |     def address
112 |         get_address
113 |     end
114 | 
115 |     # See CUTexRef#get_address_mode.
116 |     def address_mode(dim)
117 |         get_address_mode(dim)
118 |     end
119 | 
120 |     # See CUTexRef#get_filter_mode.
121 |     def filter_mode
122 |         get_filter_mode
123 |     end
124 | 
125 |     # See CUTexRef#get_flags.
126 |     def flags
127 |         get_flags
128 |     end
129 | 
130 | end
131 | 
132 | 
133 | # See ::driver_get_version.
134 | def driver_version
135 |     driver_get_version
136 | end
137 | module_function :driver_version
138 | 
139 | 
140 | end # module
141 | end # module
142 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/function.rb:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2010 Chung Shin Yee
  3 | #
  4 | #       shinyee@speedgocomputing.com
  5 | #       http://www.speedgocomputing.com
  6 | #       http://github.com/xman/sgc-ruby-cuda
  7 | #       http://rubyforge.org/projects/rubycuda
  8 | #
  9 | # This file is part of SGC-Ruby-CUDA.
 10 | #
 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 12 | # it under the terms of the GNU General Public License as published by
 13 | # the Free Software Foundation, either version 3 of the License, or
 14 | # (at your option) any later version.
 15 | #
 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 | # GNU General Public License for more details.
 20 | #
 21 | # You should have received a copy of the GNU General Public License
 22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 23 | #
 24 | 
 25 | require 'cuda/runtime/ffi-cuda'
 26 | require 'cuda/runtime/cuda'
 27 | require 'cuda/runtime/error'
 28 | require 'memory/pointer'
 29 | require 'dl'
 30 | 
 31 | 
 32 | module SGC
 33 | module Cuda
 34 | 
 35 | class CudaFunction
 36 | 
 37 |     attr_reader :name
 38 | 
 39 | 
 40 |     def initialize(name)
 41 |         @name = name
 42 |     end
 43 | 
 44 | 
 45 |     def attributes
 46 |         a = CudaFuncAttributes.new
 47 |         status = API::cudaFuncGetAttributes(a.to_ptr, @name)
 48 |         Pvt::handle_error(status)
 49 |         a
 50 |     end
 51 | 
 52 | 
 53 |     def cache_config=(config)
 54 |         status = API::cudaFuncSetCacheConfig(@name, config)
 55 |         Pvt::handle_error(status)
 56 |         config
 57 |     end
 58 | 
 59 | 
 60 |     def launch
 61 |         status = API::cudaLaunch(@name)
 62 |         Pvt::handle_error(status)
 63 |         self
 64 |     end
 65 | 
 66 | 
 67 |     def self.configure(grid_dim, block_dim, shared_mem_size = 0, stream = 0)
 68 |         status = API::cudaConfigureCall(grid_dim, block_dim, shared_mem_size, stream)
 69 |         Pvt::handle_error(status)
 70 |         self
 71 |     end
 72 | 
 73 | 
 74 |     def self.setup(*args)
 75 |         offset = 0
 76 |         args.each do |x|
 77 |             case x
 78 |                 when Fixnum
 79 |                     p = FFI::MemoryPointer.new(:int)
 80 |                     p.write_int(x)
 81 |                     size = 4
 82 |                 when Float
 83 |                     p = FFI::MemoryPointer.new(:float)
 84 |                     p.write_float(x)
 85 |                     size = 4
 86 |                 when SGC::Memory::MemoryPointer
 87 |                     p = x.ref
 88 |                     size = FFI::MemoryPointer.size
 89 |                 else
 90 |                     raise TypeError, "Invalid type of argument #{x.to_s}."
 91 |             end
 92 |             offset = align_up(offset, size)
 93 |             status = API::cudaSetupArgument(p, size, offset)
 94 |             Pvt::handle_error(status)
 95 |             offset += size
 96 |         end
 97 |     end
 98 | 
 99 | 
100 |     def self.load_lib(name)
101 |         raise NotImplementedError
102 |     end
103 | 
104 | 
105 |     def self.load_lib_file(name)
106 |         @@libs << DL::dlopen(name)
107 |         # API::ffi_lib(name)
108 |         self
109 |     end
110 | 
111 | 
112 |     def self.unload_all_libs
113 |         @@libs.each do |h|
114 |             h.close
115 |         end
116 |         @@libs = []
117 |         self
118 |     end
119 | 
120 | protected
121 | 
122 |     def self.align_up(offset, alignment)
123 |         (offset + alignment - 1) & ~(alignment - 1)
124 |     end
125 | 
126 |     @@libs = []
127 | 
128 | end
129 | 
130 | end # module
131 | end # module
132 | 


--------------------------------------------------------------------------------
/sgc/lib/madison/kernel/test.cu:
--------------------------------------------------------------------------------
  1 | #include <cutil_inline.h>
  2 | //#include <shrUtils.h>
  3 | #include <cuda.h>
  4 | #include <math.h>
  5 | //#include <mongo.h>
  6 | 
  7 | #define DIMENSIONS 5
  8 | #define BLOCK_SIZE 16 
  9 | 
 10 | 
 11 | // Kernel definition 
 12 | //__global__ void MatAdd(float A[N][N], float B[N][N], 
 13 | //                       float C[N][N]) 
 14 | //{ 
 15 | //    int i = threadIdx.x; 
 16 | //    int j = threadIdx.y; 
 17 | //    C[i][j] = A[i][j] + B[i][j];
 18 | //}
 19 | 
 20 | // Matrices are stored in row-major order: 
 21 | // M(row, col) = *(M.elements + row * M.width + col) 
 22 | 
 23 | __global__ void MatPopulate(float *A, int count) 
 24 | { 
 25 |     int row = blockIdx.x;
 26 |     int col = threadIdx.x;
 27 |     A[row * DIMENSIONS + col] = (float)(row * DIMENSIONS + col)/(DIMENSIONS*count);
 28 | } 
 29 |  
 30 | float score(float *A, float *B){
 31 |   float score = 0.0;
 32 |   for(int i=0; i<DIMENSIONS; i++){
 33 |     //printf("%f * %f\n", A[i], B[i]);
 34 |     score += pow(A[i] * B[i], 2);
 35 |   }
 36 |   return score;
 37 | }
 38 | 
 39 | __global__ void ParallelScore(float *A, float *scores, int count) 
 40 | { 
 41 |     float *q = &A[blockIdx.x*BLOCK_SIZE + threadIdx.x];
 42 |     float *p;
 43 |     __syncthreads();
 44 |     float score;
 45 |     score = 0.0;
 46 |     for(int j=0; j<count*DIMENSIONS; j+=DIMENSIONS){
 47 |       p = &A[j];
 48 |       __syncthreads();
 49 |       for(int i=0; i<DIMENSIONS; i++){
 50 |         score += pow(q[i] * p[i], 2);
 51 |       }
 52 |       //scores[blockIdx.x * COUNT + threadIdx.x] = score;
 53 |       __syncthreads();
 54 |     }
 55 |     scores[threadIdx.x] = score;
 56 | } 
 57 |  
 58 | int main(int argc, char *argv[])
 59 | { 
 60 |     int count = (int)atoi(argv[1]) * 16;
 61 |     
 62 |     size_t size = DIMENSIONS * count * sizeof(float); 
 63 |     size_t size2 = BLOCK_SIZE * sizeof(float); 
 64 |     float *elements = (float *)malloc(size);
 65 |     float *d_elements;
 66 |     float *scores = (float *)malloc(size2);
 67 |     float *d_scores;
 68 |     cudaMalloc(&d_elements, size);
 69 |     cudaMalloc(&d_scores, size2);
 70 |     int threadsPerBlock = DIMENSIONS; 
 71 |     int numBlocks = count;
 72 |     MatPopulate<<<numBlocks, threadsPerBlock>>>(d_elements, count);
 73 |     cudaMemcpy(elements, d_elements, size, 
 74 |                cudaMemcpyDeviceToHost);
 75 |     for(int i=0; i<BLOCK_SIZE; i++){
 76 |       scores[i] = 0.0;
 77 |     }
 78 |     cudaMemcpy(d_scores, scores, size2, 
 79 |                cudaMemcpyHostToDevice);
 80 |     
 81 |     //for(int i=0; i<COUNT*DIMENSIONS; i+=DIMENSIONS){
 82 |     //  for(int j=0; j<DIMENSIONS; j++){
 83 |     //    printf("%f ", elements[i + j]);
 84 |     //  }
 85 |     //  printf("\n");
 86 |     //}
 87 |     
 88 |     if(argc > 2 && !strcmp(argv[2], "raw")){
 89 |       printf("\nraw\n");
 90 |       float _score;
 91 |       for(int i=0; i<count*DIMENSIONS; i+=DIMENSIONS){
 92 |         float *q = &elements[i];
 93 |         _score = 0.0;
 94 |         for(int j=0; j<count*DIMENSIONS; j+=DIMENSIONS){
 95 |           _score += score(q, &elements[j]);
 96 |         }
 97 |         scores[i % BLOCK_SIZE] = _score;
 98 |       }
 99 |     }
100 |     else{
101 |       printf("\nparallel\n");
102 |       
103 |       threadsPerBlock = BLOCK_SIZE; 
104 |       numBlocks = count / threadsPerBlock;
105 |       printf("\n%i blocks\n\n", numBlocks);
106 |       ParallelScore<<<numBlocks, threadsPerBlock>>>(d_elements, d_scores, count);
107 |       cudaMemcpy(scores, d_scores, size2, 
108 |                cudaMemcpyDeviceToHost);
109 |     }
110 |     float sum = 0.0;
111 |     for (int i=0;i<BLOCK_SIZE;i++)
112 |     {
113 |       sum += scores[i];
114 |       printf("%f\n", scores[i]);
115 |     }
116 |     
117 |     printf("%f\n", sum);
118 |     cudaFree(d_elements);
119 |     cudaFree(d_scores);
120 |     free(elements);
121 |     return EXIT_SUCCESS;
122 | }


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/Makefile:
--------------------------------------------------------------------------------
  1 | 
  2 | SHELL = /bin/sh
  3 | 
  4 | #### Start of system configuration section. ####
  5 | 
  6 | srcdir = .
  7 | topdir = /Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1
  8 | hdrdir = /Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1
  9 | arch_hdrdir = /Users/zog/.rvm/rubies/ruby-1.9.2-p136/include/ruby-1.9.1/$(arch)
 10 | VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
 11 | prefix = $(DESTDIR)/Users/zog/.rvm/rubies/ruby-1.9.2-p136
 12 | rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
 13 | exec_prefix = $(prefix)
 14 | vendorhdrdir = $(rubyhdrdir)/vendor_ruby
 15 | sitehdrdir = $(rubyhdrdir)/site_ruby
 16 | rubyhdrdir = $(includedir)/$(RUBY_BASE_NAME)-$(ruby_version)
 17 | vendordir = $(rubylibprefix)/vendor_ruby
 18 | sitedir = $(rubylibprefix)/site_ruby
 19 | ridir = $(datarootdir)/$(RI_BASE_NAME)
 20 | mandir = $(datarootdir)/man
 21 | localedir = $(datarootdir)/locale
 22 | libdir = $(exec_prefix)/lib
 23 | psdir = $(docdir)
 24 | pdfdir = $(docdir)
 25 | dvidir = $(docdir)
 26 | htmldir = $(docdir)
 27 | infodir = $(datarootdir)/info
 28 | docdir = $(datarootdir)/doc/$(PACKAGE)
 29 | oldincludedir = $(DESTDIR)/usr/include
 30 | includedir = $(prefix)/include
 31 | localstatedir = $(prefix)/var
 32 | sharedstatedir = $(prefix)/com
 33 | sysconfdir = $(prefix)/etc
 34 | datadir = $(datarootdir)
 35 | datarootdir = $(prefix)/share
 36 | libexecdir = $(exec_prefix)/libexec
 37 | sbindir = $(exec_prefix)/sbin
 38 | bindir = $(exec_prefix)/bin
 39 | rubylibdir = $(rubylibprefix)/$(ruby_version)
 40 | archdir = $(rubylibdir)/$(arch)
 41 | sitelibdir = $(sitedir)/$(ruby_version)
 42 | sitearchdir = $(sitelibdir)/$(sitearch)
 43 | vendorlibdir = $(vendordir)/$(ruby_version)
 44 | vendorarchdir = $(vendorlibdir)/$(sitearch)
 45 | 
 46 | CC = gcc
 47 | CXX = g++
 48 | LIBRUBY = $(LIBRUBY_SO)
 49 | LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
 50 | LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
 51 | LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static
 52 | OUTFLAG = -o 
 53 | COUTFLAG = -o 
 54 | 
 55 | RUBY_EXTCONF_H = 
 56 | cflags   =  $(optflags) $(debugflags) $(warnflags)
 57 | optflags = -O3
 58 | debugflags = -ggdb
 59 | warnflags = -Wextra -Wno-unused-parameter -Wno-parentheses -Wpointer-arith -Wwrite-strings -Wno-missing-field-initializers -Wshorten-64-to-32 -Wno-long-long
 60 | CFLAGS   = -fno-common $(cflags)  -fno-common -pipe 
 61 | INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
 62 | DEFS     = 
 63 | CPPFLAGS =  -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE $(DEFS) $(cppflags)
 64 | CXXFLAGS = $(CFLAGS) $(cxxflags)
 65 | ldflags  = -L. -L/usr/local/lib
 66 | dldflags = -Wl,-undefined,dynamic_lookup -Wl,-multiply_defined,suppress -Wl,-flat_namespace
 67 | ARCH_FLAG = 
 68 | DLDFLAGS = $(ldflags) $(dldflags)
 69 | LDSHARED = $(CC) -dynamic -bundle
 70 | LDSHAREDXX = $(CXX) -dynamic -bundle
 71 | AR = ar
 72 | EXEEXT = 
 73 | 
 74 | RUBY_BASE_NAME = ruby
 75 | RUBY_INSTALL_NAME = ruby
 76 | RUBY_SO_NAME = ruby.1.9.1
 77 | arch = i386-darwin9.8.0
 78 | sitearch = $(arch)
 79 | ruby_version = 1.9.1
 80 | ruby = /Users/zog/.rvm/rubies/ruby-1.9.2-p136/bin/ruby
 81 | RUBY = $(ruby)
 82 | RM = rm -f
 83 | RM_RF = $(RUBY) -run -e rm -- -rf
 84 | RMDIRS = $(RUBY) -run -e rmdir -- -p
 85 | MAKEDIRS = mkdir -p
 86 | INSTALL = /usr/bin/install -c
 87 | INSTALL_PROG = $(INSTALL) -m 0755
 88 | INSTALL_DATA = $(INSTALL) -m 644
 89 | COPY = cp
 90 | 
 91 | #### End of system configuration section. ####
 92 | 
 93 | preload = 
 94 | 
 95 | libpath = . $(libdir)
 96 | LIBPATH =  -L. -L$(libdir)
 97 | DEFFILE = 
 98 | 
 99 | CLEANFILES = mkmf.log
100 | DISTCLEANFILES = 
101 | DISTCLEANDIRS = 
102 | 
103 | extout = 
104 | extout_prefix = 
105 | target_prefix = 
106 | LOCAL_LIBS = 
107 | LIBS = $(LIBRUBYARG_SHARED)  -lpthread -ldl -lobjc 
108 | SRCS = rubycu.cpp
109 | OBJS = rubycu.o
110 | TARGET = rubycu
111 | DLLIB = $(TARGET).bundle
112 | EXTSTATIC = 
113 | STATIC_LIB = 
114 | 
115 | BINDIR        = $(bindir)
116 | RUBYCOMMONDIR = $(sitedir)$(target_prefix)
117 | RUBYLIBDIR    = $(sitelibdir)$(target_prefix)
118 | RUBYARCHDIR   = $(sitearchdir)$(target_prefix)
119 | HDRDIR        = $(rubyhdrdir)/ruby$(target_prefix)
120 | ARCHHDRDIR    = $(rubyhdrdir)/$(arch)/ruby$(target_prefix)
121 | 
122 | TARGET_SO     = $(DLLIB)
123 | CLEANLIBS     = $(TARGET).bundle 
124 | CLEANOBJS     = *.o  *.bak
125 | 
126 | all:    $(DLLIB)
127 | static: $(STATIC_LIB)
128 | .PHONY: all install static install-so install-rb
129 | .PHONY: clean clean-so clean-rb
130 | 
131 | clean-rb-default::
132 | clean-rb::
133 | clean-so::
134 | clean: clean-so clean-rb-default clean-rb
135 | 		@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
136 | 
137 | distclean-rb-default::
138 | distclean-rb::
139 | distclean-so::
140 | distclean: clean distclean-so distclean-rb-default distclean-rb
141 | 		@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
142 | 		@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
143 | 		@-$(RMDIRS) $(DISTCLEANDIRS)
144 | 
145 | realclean: distclean
146 | install: install-so install-rb
147 | 
148 | install-so: $(RUBYARCHDIR)
149 | install-so: $(RUBYARCHDIR)/$(DLLIB)
150 | $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
151 | 	@-$(MAKEDIRS) $(@D)
152 | 	$(INSTALL_PROG) $(DLLIB) $(@D)
153 | install-rb: pre-install-rb install-rb-default
154 | install-rb-default: pre-install-rb-default
155 | pre-install-rb: Makefile
156 | pre-install-rb-default: Makefile
157 | $(RUBYARCHDIR):
158 | 	$(MAKEDIRS) $@
159 | 
160 | site-install: site-install-so site-install-rb
161 | site-install-so: install-so
162 | site-install-rb: install-rb
163 | 
164 | .SUFFIXES: .c .m .cc .cxx .cpp .C .o
165 | 
166 | .cc.o:
167 | 	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
168 | 
169 | .cxx.o:
170 | 	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
171 | 
172 | .cpp.o:
173 | 	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
174 | 
175 | .C.o:
176 | 	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $<
177 | 
178 | .c.o:
179 | 	$(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $<
180 | 
181 | $(DLLIB): $(OBJS) Makefile
182 | 	@-$(RM) $(@)
183 | 	$(LDSHAREDXX) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
184 | 
185 | 
186 | 
187 | $(OBJS): $(hdrdir)/ruby.h $(hdrdir)/ruby/defines.h $(arch_hdrdir)/ruby/config.h
188 | 


--------------------------------------------------------------------------------
/sgc/lib/madison/comparable.rb:
--------------------------------------------------------------------------------
  1 | module Madison
  2 |   module Comparable
  3 |     def self.included(base)
  4 |       base.send :extend, ClassMethods
  5 |       base.send :include, InstanceMethods
  6 |     end
  7 |     
  8 |     module InstanceMethods
  9 |       include SGC::Cuda
 10 |       
 11 |       BLOCK_SIZE = 16
 12 |       CLUSTER_SIZE = 16
 13 |       INTEGER_SIZE = Buffer.element_size(:int)
 14 |       
 15 |       def compare_to matrix
 16 |         raise "Dimensions must match" unless matrix.vectors_dimension == self.vectors_dimension
 17 |         
 18 |         self_num_blocks = (self.count.to_f / BLOCK_SIZE).ceil
 19 |         self_count = self_num_blocks * BLOCK_SIZE
 20 |         self_clusters_count = self_num_blocks / CLUSTER_SIZE
 21 |         self_leftovers_count = self_num_blocks - self_clusters_count * CLUSTER_SIZE
 22 |         
 23 |         other_num_blocks = (matrix.count.to_f / BLOCK_SIZE).ceil
 24 |         other_count = other_num_blocks * BLOCK_SIZE
 25 |         other_clusters_count = other_num_blocks / CLUSTER_SIZE
 26 |         other_leftovers_count = other_num_blocks - other_clusters_count * CLUSTER_SIZE
 27 |         
 28 |         @scores_size = (CLUSTER_SIZE * BLOCK_SIZE) ** 2
 29 |         @scores = Buffer.new(@type, @scores_size)
 30 |         @scores_dev = CudaDeviceMemory.malloc(@type_size * @scores_size)
 31 |         
 32 |         # Initialize scores to 0
 33 |         (0...@scores_size).each do |i| @scores[i] = 0.0 end
 34 |         
 35 |         @values_dev_1 = CudaDeviceMemory.malloc(@type_size * @size)
 36 |         @values_dev_2 = CudaDeviceMemory.malloc(@type_size * matrix.size)
 37 |         
 38 |         @keys_dev_1 = CudaDeviceMemory.malloc(INTEGER_SIZE * @size)
 39 |         @keys_dev_2 = CudaDeviceMemory.malloc(INTEGER_SIZE * matrix.size)
 40 |         
 41 |         offset_increment = CLUSTER_SIZE * BLOCK_SIZE * self.vectors_dimension
 42 |         
 43 |         path = prepare_kernel_lib
 44 |         CudaFunction.load_lib_file(path)
 45 |         CudaMemory.memcpy_htod(@scores_dev, @scores, @scores_size * @type_size)
 46 |         
 47 |         (0...self_clusters_count).each do |c|
 48 |           self.class.log  "\n> Cluster ##{c}"
 49 |           CudaMemory.memcpy_htod(@values_dev_1, self.values.offset(c*offset_increment), offset_increment * @type_size)
 50 |           CudaMemory.memcpy_htod(@keys_dev_1, self.keys.offset(c*offset_increment), offset_increment * INTEGER_SIZE)
 51 |           
 52 |           (0...other_clusters_count).each do |cc|
 53 |             self.class.log ">> with Cluster ##{cc}"
 54 |             compare_cluster_with(matrix, c, cc, CLUSTER_SIZE, CLUSTER_SIZE)
 55 |           end
 56 |           # We have to handle the leftovers => if we have 66 blocks and CLUSTER_SIZE == 64, we have to handle 2 blocks separately
 57 |           if other_leftovers_count > 0
 58 |             self.class.log ">> with the leftovers"
 59 |             compare_cluster_with(matrix, c, other_clusters_count, CLUSTER_SIZE, other_leftovers_count)
 60 |           end
 61 |         end
 62 |         if self_leftovers_count > 0
 63 |           self.class.log  "\n> The leftovers"
 64 |           c = self_clusters_count
 65 |           CudaMemory.memcpy_htod(@values_dev_1, self.values.offset(c*offset_increment), self_leftovers_count * BLOCK_SIZE * self.vectors_dimension * @type_size)
 66 |           CudaMemory.memcpy_htod(@keys_dev_1, self.keys.offset(c*offset_increment), self_leftovers_count * BLOCK_SIZE * self.vectors_dimension * INTEGER_SIZE)
 67 |           
 68 |           (0...other_clusters_count).each do |cc|
 69 |             self.class.log ">> with Cluster ##{cc}"
 70 |             compare_cluster_with(matrix, self_clusters_count, cc, self_leftovers_count, CLUSTER_SIZE)
 71 |           end
 72 |           # We have to handle the leftovers => if we have 66 blocks and CLUSTER_SIZE == 64, we have to handle 2 blocks separately
 73 |           if other_leftovers_count > 0
 74 |             self.class.log ">> with the leftovers"
 75 |             compare_cluster_with(matrix, self_clusters_count, other_clusters_count, self_leftovers_count, other_leftovers_count)
 76 |           end
 77 |         end
 78 |       end
 79 |       
 80 |       def compare_cluster_with(matrix, cluster, offset, current_cluster_size, size)
 81 |         puts [matrix.inspect, cluster, offset, current_cluster_size, size]
 82 |         puts matrix.inspect
 83 |         puts  size * BLOCK_SIZE * self.vectors_dimension * @type_size
 84 |         CudaMemory.memcpy_htod(@values_dev_2, matrix.values.offset(offset * CLUSTER_SIZE * BLOCK_SIZE * self.vectors_dimension), size * BLOCK_SIZE * self.vectors_dimension * @type_size)
 85 |         CudaMemory.memcpy_htod(@keys_dev_2, matrix.keys.offset(offset * CLUSTER_SIZE * BLOCK_SIZE * self.vectors_dimension), size * BLOCK_SIZE * self.vectors_dimension * INTEGER_SIZE)
 86 |         
 87 |         CudaFunction.configure(Dim3.new(current_cluster_size, 1, 1), Dim3.new(BLOCK_SIZE, 1, 1))
 88 |         CudaFunction.setup(@values_dev_1, @values_dev_2, @keys_dev_1, @keys_dev_2, @scores_dev, size * BLOCK_SIZE)
 89 |         f = CudaFunction.new("ParallelScore")
 90 |         f.launch
 91 |         CudaMemory.memcpy_dtoh(@scores, @scores_dev, @scores_size * @type_size)
 92 |         # @scores.each do |s| puts s end
 93 |         
 94 |         $stderr.puts "#{cluster * CLUSTER_SIZE * BLOCK_SIZE} .. #{(cluster) * BLOCK_SIZE * CLUSTER_SIZE + current_cluster_size * BLOCK_SIZE - 1} x #{offset * CLUSTER_SIZE * BLOCK_SIZE} .. #{offset * CLUSTER_SIZE * BLOCK_SIZE + size * BLOCK_SIZE - 1}"
 95 |         self.class.output_scores(current_cluster_size * BLOCK_SIZE, size * BLOCK_SIZE, cluster * CLUSTER_SIZE * BLOCK_SIZE, offset * CLUSTER_SIZE * BLOCK_SIZE, @scores)
 96 |       end
 97 |       
 98 |       def prepare_kernel_lib
 99 |         kernel_dir = "#{File.dirname(__FILE__)}/kernel"
100 |         File.open("#{kernel_dir}/kernel.h", 'w') do |f|
101 |           f.write "#define DIMENSIONS #{self.vectors_dimension}\n"
102 |           f.write "#define BLOCK_SIZE #{BLOCK_SIZE}\n"
103 |           f.write "#define CLUSTER_SIZE #{CLUSTER_SIZE}\n"
104 |         end
105 |         system "cd #{kernel_dir}; rm libkernel.*.so;nvcc -shared -Xcompiler -fPIC kernel.cu -o libkernel.#{self.vectors_dimension}.so"
106 |         "#{kernel_dir}/libkernel.#{self.vectors_dimension}.so"
107 |       end
108 |     end
109 |     
110 |     module ClassMethods
111 |       
112 |       def log message
113 |         $stderr.puts message
114 |       end
115 |       
116 |       def output_scores rows, cols, offset_x, offset_y, score
117 |         (0...rows).each do |i|
118 |           (0...cols).each do |j|
119 |             real_i = offset_x + i
120 |             real_j = offset_y + j
121 |             puts "#{real_i}\t #{real_j}\t %.3f\n" % (score.is_a?(SGC::Memory::Buffer) ? score[i * cols + j] : score)
122 |           end
123 |         end
124 |       end
125 |     end
126 |   end
127 | end


--------------------------------------------------------------------------------
/sgc/lib/cuda/runtime/ffi-cuda.rb:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright (c) 2010 Chung Shin Yee
  3 | #
  4 | #       shinyee@speedgocomputing.com
  5 | #       http://www.speedgocomputing.com
  6 | #       http://github.com/xman/sgc-ruby-cuda
  7 | #       http://rubyforge.org/projects/rubycuda
  8 | #
  9 | # This file is part of SGC-Ruby-CUDA.
 10 | #
 11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
 12 | # it under the terms of the GNU General Public License as published by
 13 | # the Free Software Foundation, either version 3 of the License, or
 14 | # (at your option) any later version.
 15 | #
 16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 | # GNU General Public License for more details.
 20 | #
 21 | # You should have received a copy of the GNU General Public License
 22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
 23 | #
 24 | 
 25 | require 'ffi'
 26 | require 'ffi/prettystruct'
 27 | 
 28 | 
 29 | module SGC
 30 | module Cuda
 31 | module API
 32 | 
 33 |     extend FFI::Library
 34 |     ffi_lib "cudart"
 35 | 
 36 |     CudaError = enum(
 37 |         :cudaSuccess, 0,
 38 |         :cudaErrorMissingConfiguration, 1,
 39 |         :cudaErrorMemoryAllocation, 2,
 40 |         :cudaErrorInitializationError, 3,
 41 |         :cudaErrorLaunchFailure, 4,
 42 |         :cudaErrorPriorLaunchFailure, 5,
 43 |         :cudaErrorLaunchTimeout, 6,
 44 |         :cudaErrorLaunchOutOfResources, 7,
 45 |         :cudaErrorInvalidDeviceFunction, 8,
 46 |         :cudaErrorInvalidConfiguration, 9,
 47 |         :cudaErrorInvalidDevice, 10,
 48 |         :cudaErrorInvalidValue, 11,
 49 |         :cudaErrorInvalidPitchValue, 12,
 50 |         :cudaErrorInvalidSymbol, 13,
 51 |         :cudaErrorMapBufferObjectFailed, 14,
 52 |         :cudaErrorUnmapBufferObjectFailed, 15,
 53 |         :cudaErrorInvalidHostPointer, 16,
 54 |         :cudaErrorInvalidDevicePointer, 17,
 55 |         :cudaErrorInvalidTexture, 18,
 56 |         :cudaErrorInvalidTextureBinding, 19,
 57 |         :cudaErrorInvalidChannelDescriptor, 20,
 58 |         :cudaErrorInvalidMemcpyDirection, 21,
 59 |         :cudaErrorAddressOfConstant, 22,
 60 |         :cudaErrorTextureFetchFailed, 23,
 61 |         :cudaErrorTextureNotBound, 24,
 62 |         :cudaErrorSynchronizationError, 25,
 63 |         :cudaErrorInvalidFilterSetting, 26,
 64 |         :cudaErrorInvalidNormSetting, 27,
 65 |         :cudaErrorMixedDeviceExecution, 28,
 66 |         :cudaErrorCudartUnloading, 29,
 67 |         :cudaErrorUnknown, 30,
 68 |         :cudaErrorNotYetImplemented, 31,
 69 |         :cudaErrorMemoryValueTooLarge, 32,
 70 |         :cudaErrorInvalidResourceHandle, 33,
 71 |         :cudaErrorNotReady, 34,
 72 |         :cudaErrorInsufficientDriver, 35,
 73 |         :cudaErrorSetOnActiveProcess, 36,
 74 |         :cudaErrorInvalidSurface, 37,
 75 |         :cudaErrorNoDevice, 38,
 76 |         :cudaErrorECCUncorrectable, 39,
 77 |         :cudaErrorSharedObjectSymbolNotFound, 40,
 78 |         :cudaErrorSharedObjectInitFailed, 41,
 79 |         :cudaErrorUnsupportedLimit, 42,
 80 |         :cudaErrorDuplicateVariableName, 43,
 81 |         :cudaErrorDuplicateTextureName, 44,
 82 |         :cudaErrorDuplicateSurfaceName, 45,
 83 |         :cudaErrorDevicesUnavailable, 46,
 84 |         :cudaErrorInvalidKernelImage, 47,
 85 |         :cudaErrorNoKernelImageForDevice, 48,
 86 |         :cudaErrorIncompatibleDriverContext, 49,
 87 |         :cudaErrorStartupFailure, 0x7F,
 88 |         :cudaErrorApiFailureBase, 10000,
 89 |     )
 90 |     CudaError_t = CudaError
 91 | 
 92 |     CudaDeviceFlags = enum(
 93 |         :cudaDeviceScheduleAuto, 0,
 94 |         :cudaDeviceScheduleSpin, 1,
 95 |         :cudaDeviceScheduleYield, 2,
 96 |         :cudaDeviceBlockingSync, 4,
 97 |         :cudaDeviceMapHost, 8,
 98 |         :cudaDeviceLmemResizeToMax, 16,
 99 |     )
100 | 
101 |     CudaEventFlags = enum(
102 |         :cudaEventDefault, 0,
103 |         :cudaEventBlockingSync, 1,
104 |         :cudaEventDisableTiming, 2,
105 |     )
106 | 
107 |     CudaHostAllocFlags = enum(
108 |         :cudaHostAllocDefault, 0,
109 |         :cudaHostAllocPortable, 1,
110 |         :cudaHostAllocMapped, 2,
111 |         :cudaHostAllocWriteCombined, 4,
112 |     )
113 | 
114 |     CudaArrayFlags = enum(
115 |         :cudaArrayDefault, 0x00,
116 |         :cudaArraySurfaceLoadStore, 0x02,
117 |     )
118 | 
119 |     CudaMemcpyKind = enum(
120 |         :cudaMemcpyHostToHost, 0,
121 |         :cudaMemcpyHostToDevice, 1,
122 |         :cudaMemcpyDeviceToHost, 2,
123 |         :cudaMemcpyDeviceToDevice, 3,
124 |     )
125 | 
126 |     CudaChannelFormatKind = enum(
127 |         :cudaChannelFormatKindSigned, 0,
128 |         :cudaChannelFormatKindUnsigned, 1,
129 |         :cudaChannelFormatKindFloat, 2,
130 |         :cudaChannelFormatKindNone,3,
131 |     )
132 | 
133 |     CudaFuncCache = enum(
134 |         :cudaFuncCachePreferNone, 0,
135 |         :cudaFuncCachePreferShared, 1,
136 |         :cudaFuncCachePreferL1, 2,
137 |     )
138 | 
139 |     CudaLimit = enum(
140 |         :cudaLimitStackSize, 0x00,
141 |         :cudaLimitPrintfFifoSize, 0x01,
142 |         :cudaLimitMallocHeapSize, 0x02,
143 |     )
144 | 
145 |     CudaComputeMode = enum(
146 |         :cudaComputeModeDefault, 0,
147 |         :cudaComputeModeExclusive, 1,
148 |         :cudaComputeModeProhibited, 2,
149 |     )
150 | 
151 |     CudaSurfaceBoundaryMode = enum(
152 |         :cudaBoundaryModeZero, 0,
153 |         :cudaBoundaryModeClamp, 1,
154 |         :cudaBoundaryModeTrap, 2,
155 |     )
156 | 
157 |     CudaSurfaceFormatMode = enum(
158 |         :cudaFormatModeForced, 0,
159 |         :cudaFormatModeAuto, 1,
160 |     )
161 | 
162 |     CudaTextureAddressMode = enum(
163 |         :cudaAddressModeWrap, 0,
164 |         :cudaAddressModeClamp, 1,
165 |         :cudaAddressModeMirror, 2,
166 |         :cudaAddressModeBorder, 3,
167 |     )
168 | 
169 |     CudaTextureFilterMode = enum(
170 |         :cudaFilterModePoint, 0,
171 |         :cudaFilterModeLinear, 1,
172 |     )
173 | 
174 |     CudaTextureReadMode = enum(
175 |         :cudaReadModeElementType, 0,
176 |         :cudaReadModeNormalizedFloat, 1,
177 |     )
178 | 
179 |     typedef :pointer, :CudaStream
180 |     typedef :pointer, :CudaEvent
181 | 
182 |     typedef :CudaStream, :CudaStream_t
183 |     typedef :CudaEvent, :CudaEvent_t
184 | 
185 | 
186 |     class Dim3 < FFI::Struct
187 |         layout(
188 |             :array, [:uint, 3],
189 |         )
190 | 
191 |         alias :init :initialize
192 |         alias :get :[]
193 |         alias :set :[]=
194 |         private :init, :get, :set
195 | 
196 |         def initialize(x, y, z)
197 |             init
198 |             @array = get(:array)
199 |             @array[0], @array[1], @array[2] = x, y, z
200 |         end
201 | 
202 |         def [](index); @array[index]; end
203 |         def []=(index, value); @array[index] = value; end
204 | 
205 |         def x; @array[0]; end
206 |         def y; @array[1]; end
207 |         def z; @array[2]; end
208 | 
209 |         def x=(value); @array[0] = value; end
210 |         def y=(value); @array[1] = value; end
211 |         def z=(value); @array[2] = value; end
212 | 
213 |     end
214 | 
215 |     class CudaDeviceProp < FFI::PrettyStruct
216 |         layout(
217 |             :name, [:char, 256],
218 |             :totalGlobalMem, :size_t,
219 |             :sharedMemPerBlock, :size_t,
220 |             :regsPerBlock, :int,
221 |             :warpSize, :int,
222 |             :memPitch, :size_t,
223 |             :maxThreadsPerBlock, :int,
224 |             :maxThreadsDim, [:int, 3],
225 |             :maxGridSize, [:int, 3],
226 |             :clockRate, :int,
227 |             :totalConstMem, :size_t,
228 |             :major, :int,
229 |             :minor, :int,
230 |             :textureAlignment, :size_t,
231 |             :deviceOverlap, :int,
232 |             :multiProcessorCount, :int,
233 |             :kernelExecTimeoutEnabled, :int,
234 |             :integrated, :int,
235 |             :canMapHostMemory, :int,
236 |             :computeMode, :int,
237 |             :maxTexture1D, :int,
238 |             :maxTexture2D, [:int, 2],
239 |             :maxTexture3D, [:int, 3],
240 |             :maxTexture2DArray, [:int, 3],
241 |             :surfaceAlignment, :size_t,
242 |             :concurrentKernels, :int,
243 |             :ECCEnabled, :int,
244 |             :pciBusID, :int,
245 |             :__cudaReserved, [:int, 21],
246 |         )
247 |     end
248 | 
249 |     class CudaFuncAttributes < FFI::PrettyStruct
250 |         layout(
251 |             :sharedSizeBytes, :size_t,
252 |             :constSizeBytes, :size_t,
253 |             :localSizeBytes, :size_t,
254 |             :maxThreadsPerBlock, :int,
255 |             :numRegs, :int,
256 |             :ptxVersion, :int,
257 |             :binaryVersion, :int,
258 |             :__cudaReserved, [:int, 6],
259 |         )
260 |     end
261 | 
262 |     class CudaChannelFormatDesc < FFI::PrettyStruct
263 |         layout(
264 |             :x, :int,
265 |             :y, :int,
266 |             :z, :int,
267 |             :w, :int,
268 |             :f, CudaChannelFormatKind,
269 |         )
270 |     end
271 | 
272 |     class CudaPitchedPtr < FFI::PrettyStruct
273 |         layout(
274 |             :ptr, :pointer,
275 |             :pitch, :size_t,
276 |             :xsize, :size_t,
277 |             :ysize, :size_t,
278 |         )
279 |     end
280 | 
281 |     class CudaPos < FFI::PrettyStruct
282 |         layout(
283 |             :x, :size_t,
284 |             :y, :size_t,
285 |             :z, :size_t,
286 |         )
287 |     end
288 | 
289 |     class CudaExtent < FFI::PrettyStruct
290 |         layout(
291 |             :width, :size_t,
292 |             :height, :size_t,
293 |             :depth, :size_t,
294 |         )
295 |     end
296 | 
297 |     class CudaMemcpy3DParms < FFI::PrettyStruct
298 |         layout(
299 |             :srcArray, :pointer,
300 |             :srcPos, CudaPos,
301 |             :srcPtr, CudaPitchedPtr,
302 |             :dstArray, :pointer,
303 |             :dstPos, CudaPos,
304 |             :dstPtr, CudaPitchedPtr,
305 |             :extent, CudaExtent,
306 |             :kind, CudaMemcpyKind,
307 |         )
308 |     end
309 | 
310 |     class TextureReference < FFI::PrettyStruct
311 |         layout(
312 |             :normalized, :int,
313 |             :filterMode, CudaTextureFilterMode,
314 |             :addressMode, [CudaTextureAddressMode, 3],
315 |             :channelDesc, CudaChannelFormatDesc,
316 |             :__cudaReserved, [:int, 16],
317 |         )
318 |     end
319 | 
320 |     class SurfaceReference < FFI::PrettyStruct
321 |         layout(
322 |             :channelDesc, CudaChannelFormatDesc,
323 |         )
324 |     end
325 | 
326 |     # CUDA Version Management.
327 |     attach_function :cudaDriverGetVersion, [:pointer], :int
328 |     attach_function :cudaRuntimeGetVersion, [:pointer], :int
329 | 
330 |     # CUDA Error Handling.
331 |     attach_function :cudaGetErrorString, [CudaError], :string
332 |     attach_function :cudaGetLastError, [], :int
333 |     attach_function :cudaPeekAtLastError, [], :int
334 | 
335 |     # CUDA Device Management.
336 |     attach_function :cudaChooseDevice, [:pointer, :pointer], :int
337 |     attach_function :cudaGetDevice, [:pointer], :int
338 |     attach_function :cudaGetDeviceCount, [:pointer], :int
339 |     attach_function :cudaGetDeviceProperties, [:pointer, :int], :int
340 |     attach_function :cudaSetDevice, [:int], :int
341 |     attach_function :cudaSetDeviceFlags, [:uint], :int
342 |     attach_function :cudaSetValidDevices, [:pointer, :int], :int
343 | 
344 |     # CUDA Thread Management.
345 |     attach_function :cudaThreadExit, [], :int
346 |     attach_function :cudaThreadGetCacheConfig, [:pointer], :int
347 |     attach_function :cudaThreadGetLimit, [:pointer, CudaLimit], :int
348 |     attach_function :cudaThreadSetCacheConfig, [CudaFuncCache], :int
349 |     attach_function :cudaThreadSetLimit, [CudaLimit, :size_t], :int
350 |     attach_function :cudaThreadSynchronize, [], :int
351 | 
352 |     # CUDA Memory Management.
353 |     attach_function :cudaFree, [:pointer], :int
354 |     attach_function :cudaFreeArray, [:pointer], :int
355 |     attach_function :cudaFreeHost, [:pointer], :int
356 |     attach_function :cudaGetSymbolAddress, [:pointer, :string], :int
357 |     attach_function :cudaGetSymbolSize, [:pointer, :string], :int
358 |     attach_function :cudaHostAlloc, [:pointer, :size_t, :uint], :int
359 |     attach_function :cudaHostGetDevicePointer, [:pointer, :pointer, :uint], :int
360 |     attach_function :cudaHostGetFlags, [:pointer, :pointer], :int
361 |     attach_function :cudaMalloc, [:pointer, :size_t], :int
362 |     attach_function :cudaMalloc3D, [:pointer, CudaExtent.by_value], :int
363 |     attach_function :cudaMalloc3DArray, [:pointer, :pointer, CudaExtent.by_value, :uint], :int
364 |     attach_function :cudaMallocArray, [:pointer, :pointer, :size_t, :size_t, :uint], :int
365 |     attach_function :cudaMallocHost, [:pointer, :size_t], :int
366 |     attach_function :cudaMallocPitch, [:pointer, :pointer, :size_t, :size_t], :int
367 |     attach_function :cudaMemcpy, [:pointer, :pointer, :size_t, CudaMemcpyKind], :int
368 |     attach_function :cudaMemcpy2D, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
369 |     attach_function :cudaMemcpy2DArrayToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
370 |     attach_function :cudaMemcpy2DAsync, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
371 |     attach_function :cudaMemcpy2DFromArray, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
372 |     attach_function :cudaMemcpy2DFromArrayAsync, [:pointer, :size_t, :pointer, :size_t, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
373 |     attach_function :cudaMemcpy2DToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
374 |     attach_function :cudaMemcpy2DToArrayAsync, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
375 |     attach_function :cudaMemcpy3D, [:pointer], :int
376 |     attach_function :cudaMemcpy3DAsync, [:pointer, :CudaStream], :int
377 |     attach_function :cudaMemcpyArrayToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
378 |     attach_function :cudaMemcpyAsync, [:pointer, :pointer, :size_t, CudaMemcpyKind, :CudaStream], :int
379 |     attach_function :cudaMemcpyFromArray, [:pointer, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind], :int
380 |     attach_function :cudaMemcpyFromArrayAsync, [:pointer, :pointer, :size_t, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
381 |     attach_function :cudaMemcpyFromSymbol, [:pointer, :string, :size_t, :size_t, CudaMemcpyKind], :int
382 |     attach_function :cudaMemcpyFromSymbolAsync, [:pointer, :string, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
383 |     attach_function :cudaMemcpyToArray, [:pointer, :size_t, :size_t, :pointer, :size_t, CudaMemcpyKind], :int
384 |     attach_function :cudaMemcpyToArrayAsync, [:pointer, :size_t, :size_t, :pointer, :size_t, CudaMemcpyKind, :CudaStream], :int
385 |     attach_function :cudaMemcpyToSymbol, [:string, :pointer, :size_t, :size_t, CudaMemcpyKind], :int
386 |     attach_function :cudaMemcpyToSymbolAsync, [:string, :pointer, :size_t, :size_t, CudaMemcpyKind, :CudaStream], :int
387 |     attach_function :cudaMemGetInfo, [:pointer, :pointer], :int
388 |     attach_function :cudaMemset, [:pointer, :int, :size_t], :int
389 |     attach_function :cudaMemset2D, [:pointer, :size_t, :int, :size_t, :size_t], :int
390 |     attach_function :cudaMemset2DAsync, [:pointer, :size_t, :int, :size_t, :size_t, :CudaStream], :int
391 |     attach_function :cudaMemset3D, [CudaPitchedPtr.by_value, :int, CudaExtent.by_value], :int
392 |     attach_function :cudaMemset3DAsync, [CudaPitchedPtr.by_value, :int, CudaExtent.by_value, :CudaStream], :int
393 |     attach_function :cudaMemsetAsync, [:pointer, :int, :size_t, :CudaStream], :int
394 |     # attach_function :make_cudaExtent, [:size_t, :size_t, :size_t], CudaExtent
395 |     # attach_function :make_cudaPitchedPtr, [:pointer, :size_t, :size_t, :size_t], CudaPitchedPtr
396 |     # attach_function :make_cudaPos, [:size_t, :size_t, :size_t], CudaPos
397 | 
398 |     def make_cudaExtent(w, h, d)
399 |         e = CudaExtent.new
400 |         e[:width], e[:height], e[:depth] = w, h, d
401 |         e
402 |     end
403 | 
404 |     def make_cudaPitchedPtr(d, p, xsz, ysz)
405 |         s = CudaPitchedPtr.new
406 |         s[:ptr] = d
407 |         s[:pitch] = p
408 |         s[:xsize] = xsz
409 |         s[:ysize] = ysz
410 |         s
411 |     end
412 | 
413 |     def make_cudaPos(x, y, z)
414 |         p = CudaPos.new
415 |         p[:x] = x
416 |         p[:y] = y
417 |         p[:z] = z
418 |         p
419 |     end
420 | 
421 |     # CUDA Execution Control.
422 |     attach_function :cudaConfigureCall, [Dim3.by_value, Dim3.by_value, :size_t, :uint], :int
423 |     attach_function :cudaFuncGetAttributes, [:pointer, :string], :int
424 |     attach_function :cudaFuncSetCacheConfig, [:string, CudaFuncCache], :int
425 |     attach_function :cudaLaunch, [:string], :int
426 |     attach_function :cudaSetDoubleForDevice, [:pointer], :int
427 |     attach_function :cudaSetDoubleForHost, [:pointer], :int
428 |     attach_function :cudaSetupArgument, [:pointer, :size_t, :size_t], :int
429 | 
430 |     # CUDA Stream Management.
431 |     attach_function :cudaStreamCreate, [:pointer], :int
432 |     attach_function :cudaStreamDestroy, [:CudaStream], :int
433 |     attach_function :cudaStreamQuery, [:CudaStream], :int
434 |     attach_function :cudaStreamSynchronize, [:CudaStream], :int
435 |     attach_function :cudaStreamWaitEvent, [:CudaStream, :CudaEvent, :uint], :int
436 | 
437 |     # CUDA Event Management.
438 |     attach_function :cudaEventCreate, [:pointer], :int
439 |     attach_function :cudaEventCreateWithFlags, [:pointer, :uint], :int
440 |     attach_function :cudaEventDestroy, [:CudaEvent], :int
441 |     attach_function :cudaEventElapsedTime, [:pointer, :CudaEvent, :CudaEvent], :int
442 |     attach_function :cudaEventQuery, [:CudaEvent], :int
443 |     attach_function :cudaEventRecord, [:CudaEvent, :CudaStream], :int
444 |     attach_function :cudaEventSynchronize, [:CudaEvent], :int
445 | 
446 |     # CUDA Texture Reference Management.
447 |     attach_function :cudaBindTexture, [:pointer, :pointer, :pointer, :pointer, :size_t], :int
448 |     attach_function :cudaBindTexture2D, [:pointer, :pointer, :pointer, :pointer, :size_t, :size_t, :size_t], :int
449 |     attach_function :cudaBindTextureToArray, [:pointer, :pointer, :pointer], :int
450 |     attach_function :cudaCreateChannelDesc, [:int, :int, :int, :int, CudaChannelFormatKind], CudaChannelFormatDesc.by_value
451 |     attach_function :cudaGetChannelDesc, [:pointer, :pointer], :int
452 |     attach_function :cudaGetTextureAlignmentOffset, [:pointer, :pointer], :int
453 |     attach_function :cudaGetTextureReference, [:pointer, :string], :int
454 |     attach_function :cudaUnbindTexture, [:pointer], :int
455 | 
456 |     # CUDA Surface Reference Management.
457 |     attach_function :cudaBindSurfaceToArray, [:pointer, :pointer, :pointer], :int
458 |     attach_function :cudaGetSurfaceReference, [:pointer, :string], :int
459 | 
460 | end # module
461 | end # module
462 | end # module
463 | 


--------------------------------------------------------------------------------
/sgc/lib/cuda/driver/rubycu.cpp:
--------------------------------------------------------------------------------
   1 | /*
   2 | # Copyright (c) 2010 Chung Shin Yee
   3 | #
   4 | #       shinyee@speedgocomputing.com
   5 | #       http://www.speedgocomputing.com
   6 | #       http://github.com/xman/sgc-ruby-cuda
   7 | #       http://rubyforge.org/projects/rubycuda
   8 | #
   9 | # This file is part of SGC-Ruby-CUDA.
  10 | #
  11 | # SGC-Ruby-CUDA is free software: you can redistribute it and/or modify
  12 | # it under the terms of the GNU General Public License as published by
  13 | # the Free Software Foundation, either version 3 of the License, or
  14 | # (at your option) any later version.
  15 | #
  16 | # SGC-Ruby-CUDA is distributed in the hope that it will be useful,
  17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 | # GNU General Public License for more details.
  20 | #
  21 | # You should have received a copy of the GNU General Public License
  22 | # along with SGC-Ruby-CUDA.  If not, see <http://www.gnu.org/licenses/>.
  23 | */
  24 | 
  25 | #include <cstring>
  26 | #include "ruby.h"
  27 | #include "cuda.h"
  28 | 
  29 | namespace SGC {
  30 | namespace CU {
  31 | 
  32 | // {{{ SGC Ruby modules.
  33 | static VALUE rb_mSGC;
  34 | static VALUE rb_mCU;
  35 | static VALUE rb_mIBuffer;
  36 | static VALUE rb_mIBufferClassMethods;
  37 | // }}}
  38 | 
  39 | // {{{ CUDA Ruby classes.
  40 | static VALUE rb_cCUDevice;
  41 | static VALUE rb_cCUContext;
  42 | static VALUE rb_cCUContextFlags;
  43 | static VALUE rb_cCULimit;
  44 | static VALUE rb_cCUModule;
  45 | static VALUE rb_cCUFunction;
  46 | static VALUE rb_cCUFunctionAttribute;
  47 | static VALUE rb_cCUFunctionCache;
  48 | static VALUE rb_cCUDevicePtr;
  49 | static VALUE rb_cCUDeviceAttribute;
  50 | static VALUE rb_cCUComputeMode;
  51 | static VALUE rb_cCUStream;
  52 | static VALUE rb_cCUEvent;
  53 | static VALUE rb_cCUEventFlags;
  54 | static VALUE rb_cCUAddressMode;
  55 | static VALUE rb_cCUFilterMode;
  56 | static VALUE rb_cCUTexRefFlags;
  57 | static VALUE rb_cCUTexRef;
  58 | static VALUE rb_cCUResult;
  59 | // }}}
  60 | 
  61 | // {{{ SGC Ruby classes.
  62 | static VALUE rb_eCUStandardError;
  63 | 
  64 | static VALUE rb_eCUDeviceError;
  65 | static VALUE rb_eCUDeviceNotInitializedError;
  66 | static VALUE rb_eCUDeviceDeinitializedError;
  67 | static VALUE rb_eCUNoDeviceError;
  68 | static VALUE rb_eCUInvalidDeviceError;
  69 | 
  70 | static VALUE rb_eCUMapError;
  71 | static VALUE rb_eCUMapFailedError;
  72 | static VALUE rb_eCUUnMapFailedError;
  73 | static VALUE rb_eCUArrayIsMappedError;
  74 | static VALUE rb_eCUAlreadyMappedError;
  75 | static VALUE rb_eCUNotMappedError;
  76 | static VALUE rb_eCUNotMappedAsArrayError;
  77 | static VALUE rb_eCUNotMappedAsPointerError;
  78 | 
  79 | static VALUE rb_eCUContextError;
  80 | static VALUE rb_eCUInvalidContextError;
  81 | static VALUE rb_eCUContextAlreadyCurrentError;
  82 | static VALUE rb_eCUUnsupportedLimitError;
  83 | 
  84 | static VALUE rb_eCULaunchError;
  85 | static VALUE rb_eCULaunchFailedError;
  86 | static VALUE rb_eCULaunchOutOfResourcesError;
  87 | static VALUE rb_eCULaunchTimeoutError;
  88 | static VALUE rb_eCULaunchIncompatibleTexturingError;
  89 | 
  90 | static VALUE rb_eCUParameterError;
  91 | static VALUE rb_eCUInvalidValueError;
  92 | static VALUE rb_eCUInvalidHandleError;
  93 | 
  94 | static VALUE rb_eCUMemoryError;
  95 | static VALUE rb_eCUOutOfMemoryError;
  96 | 
  97 | static VALUE rb_eCULibraryError;
  98 | static VALUE rb_eCUSharedObjectSymbolNotFoundError;
  99 | static VALUE rb_eCUSharedObjectInitFailedError;
 100 | 
 101 | static VALUE rb_eCUHardwareError;
 102 | static VALUE rb_eCUECCUncorrectableError;
 103 | 
 104 | static VALUE rb_eCUFileError;
 105 | static VALUE rb_eCUNoBinaryForGPUError;
 106 | static VALUE rb_eCUFileNotFoundError;
 107 | static VALUE rb_eCUInvalidSourceError;
 108 | static VALUE rb_eCUInvalidImageError;
 109 | 
 110 | static VALUE rb_eCUReferenceError;
 111 | static VALUE rb_eCUReferenceNotFoundError;
 112 | 
 113 | static VALUE rb_eCUOtherError;
 114 | static VALUE rb_eCUAlreadyAcquiredError;
 115 | static VALUE rb_eCUNotReadyError;
 116 | static VALUE rb_eCUOperatingSystemError;
 117 | 
 118 | static VALUE rb_eCUUnknownError;
 119 | 
 120 | static VALUE rb_cMemoryPointer;
 121 | static VALUE rb_cMemoryBuffer;
 122 | static VALUE rb_cInt32Buffer;
 123 | static VALUE rb_cInt64Buffer;
 124 | static VALUE rb_cFloat32Buffer;
 125 | static VALUE rb_cFloat64Buffer;
 126 | // }}}
 127 | 
 128 | // {{{ SGC C/C++ structures.
 129 | typedef struct {
 130 |     char* p;
 131 | } MemoryPointer;
 132 | 
 133 | typedef struct : MemoryPointer {
 134 |     size_t size;
 135 |     bool is_page_locked;
 136 | } MemoryBuffer;
 137 | 
 138 | template <typename TElement>
 139 | struct TypedBuffer : public MemoryBuffer {};
 140 | 
 141 | typedef struct TypedBuffer<int>    Int32Buffer;
 142 | typedef struct TypedBuffer<long>   Int64Buffer;
 143 | typedef struct TypedBuffer<float>  Float32Buffer;
 144 | typedef struct TypedBuffer<double> Float64Buffer;
 145 | // }}}
 146 | 
 147 | // {{{ Function prototypes.
 148 | static VALUE device_ptr_alloc(VALUE klass);
 149 | static VALUE device_ptr_initialize(int argc, VALUE* argv, VALUE self);
 150 | // }}}
 151 | 
 152 | // {{{ SGC helpers.
 153 | template <typename T>
 154 | static void generic_free(void* p)
 155 | {
 156 |     delete static_cast<T*>(p);
 157 | }
 158 | 
 159 | template <typename T>
 160 | static VALUE to_rb(T v);
 161 | 
 162 | VALUE to_rb(bool b)
 163 | {
 164 |     if (b) {
 165 |         return Qtrue;
 166 |     }
 167 |     return Qfalse;
 168 | }
 169 | 
 170 | template <>
 171 | VALUE to_rb(int v)
 172 | {
 173 |     return INT2FIX(v);
 174 | }
 175 | 
 176 | template <>
 177 | VALUE to_rb(long v)
 178 | {
 179 |     return LONG2NUM(v);
 180 | }
 181 | 
 182 | template <>
 183 | VALUE to_rb(float v)
 184 | {
 185 |     return DBL2NUM(static_cast<double>(v));
 186 | }
 187 | 
 188 | template <>
 189 | VALUE to_rb(double v)
 190 | {
 191 |     return DBL2NUM(v);
 192 | }
 193 | 
 194 | template <typename T>
 195 | static T to_ctype(VALUE v);
 196 | 
 197 | template <>
 198 | bool to_ctype<bool>(VALUE b)
 199 | {
 200 |     if (b == Qfalse || b == Qnil) {
 201 |         return false;
 202 |     }
 203 |     return true;
 204 | }
 205 | 
 206 | template <>
 207 | int to_ctype<int>(VALUE v)
 208 | {
 209 |     return NUM2INT(v);
 210 | }
 211 | 
 212 | template <>
 213 | unsigned int to_ctype<unsigned int>(VALUE v)
 214 | {
 215 |     return NUM2UINT(v);
 216 | }
 217 | 
 218 | template <>
 219 | long to_ctype<long>(VALUE v)
 220 | {
 221 |     return NUM2LONG(v);
 222 | }
 223 | 
 224 | template <>
 225 | unsigned long to_ctype<unsigned long>(VALUE v)
 226 | {
 227 |     return NUM2ULONG(v);
 228 | }
 229 | 
 230 | template <>
 231 | float to_ctype<float>(VALUE v)
 232 | {
 233 |     return static_cast<float>(NUM2DBL(v));
 234 | }
 235 | 
 236 | template <>
 237 | double to_ctype(VALUE v)
 238 | {
 239 |     return NUM2DBL(v);
 240 | }
 241 | 
 242 | // in  ary[0]: Class contains class constants.
 243 | // in  ary[1]: Constant to match.
 244 | // out ary[2]: Label matches with constant.
 245 | static VALUE class_const_match(VALUE current_label, VALUE* ary)
 246 | {
 247 |     const VALUE& rb_class_const = ary[0];
 248 |     const VALUE& constant_value = ary[1];
 249 |     VALUE& label = ary[2];
 250 |     VALUE v = rb_const_get(rb_class_const, SYM2ID(current_label));
 251 |     if (FIX2INT(v) == FIX2INT(constant_value)) {
 252 |         label = current_label;
 253 |         return Qtrue;
 254 |     }
 255 |     return Qfalse;
 256 | }
 257 | 
 258 | // Extend _klass_ with the module _mod::ClassMethods_.
 259 | static VALUE module_included_classmethods_hook(VALUE mod, VALUE klass)
 260 | {
 261 |     VALUE m = rb_cvar_get(mod, rb_intern("ClassMethods"));
 262 |     rb_extend_object(klass, m);
 263 |     return Qnil;
 264 | }
 265 | 
 266 | #define RAISE_CU_STD_ERROR_FORMATTED(status, format, ...) rb_raise(rb_hash_aref(rb_error_class_by_enum, INT2FIX(status)), "%s:%d " format, __FILE__, __LINE__, __VA_ARGS__)
 267 | #define RAISE_CU_STD_ERROR(status, message) RAISE_CU_STD_ERROR_FORMATTED(status, "%s", message)
 268 | // }}}
 269 | 
 270 | // {{{ SGC Ruby data.
 271 | static VALUE rb_error_class_by_enum;
 272 | // }}}
 273 | 
 274 | 
 275 | // {{{ CUdevice
 276 | 
 277 | /*  call-seq: CUDevice.get_count    ->    Fixnum
 278 |  *
 279 |  *  Return the number of CUDA devices.
 280 |  */
 281 | static VALUE device_get_count(VALUE klass)
 282 | {
 283 |     int count;
 284 |     CUresult status = cuDeviceGetCount(&count);
 285 |     if (status != CUDA_SUCCESS) {
 286 |         RAISE_CU_STD_ERROR(status, "Failed to get device count.");
 287 |     }
 288 |     return INT2FIX(count);
 289 | }
 290 | 
 291 | /*  call-seq: CUDevice.get(index)    ->    CUDevice
 292 |  *
 293 |  *  Return a CUDevice instance corresponding to CUDA device _index_ (0..CUDevice.get_count-1).
 294 |  */
 295 | static VALUE device_get(VALUE klass, VALUE num)
 296 | {
 297 |     CUdevice* pdev;
 298 |     VALUE rb_pdev = rb_class_new_instance(0, NULL, rb_cCUDevice);
 299 |     Data_Get_Struct(rb_pdev, CUdevice, pdev);
 300 |     int i = FIX2INT(num);
 301 |     CUresult status = cuDeviceGet(pdev, i);
 302 |     if (status != CUDA_SUCCESS) {
 303 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get device %d.", i);
 304 |     }
 305 |     return rb_pdev;
 306 | }
 307 | 
 308 | static VALUE device_alloc(VALUE klass)
 309 | {
 310 |     CUdevice* p = new CUdevice;
 311 |     return Data_Wrap_Struct(klass, 0, generic_free<CUdevice>, p);
 312 | }
 313 | 
 314 | static VALUE device_initialize(int argc, VALUE* argv, VALUE self)
 315 | {
 316 |     return self;
 317 | }
 318 | 
 319 | /*  call-seq: dev.get_name    ->    String
 320 |  *
 321 |  *  Return the name of _self_ with a maximum of 255 characters.
 322 |  */
 323 | static VALUE device_get_name(VALUE self)
 324 | {
 325 |     CUdevice* p;
 326 |     Data_Get_Struct(self, CUdevice, p);
 327 |     char name[256];
 328 |     CUresult status = cuDeviceGetName(name, 256, *p);
 329 |     if (status != CUDA_SUCCESS) {
 330 |         RAISE_CU_STD_ERROR(status, "Failed to get device name.");
 331 |     }
 332 |     return rb_str_new2(name);
 333 | }
 334 | 
 335 | /*  call-seq: dev.compute_capability    ->    Hash { major:, minor: }
 336 |  *
 337 |  *  Return the compute capability of _self_.
 338 |  *
 339 |  *      # For a device with compute capability 1.3:
 340 |  *      dev.compute_capability        #=> { major: 1, minor: 3 }
 341 |  */
 342 | static VALUE device_compute_capability(VALUE self)
 343 | {
 344 |     CUdevice* p;
 345 |     Data_Get_Struct(self, CUdevice, p);
 346 |     int major;
 347 |     int minor;
 348 |     CUresult status = cuDeviceComputeCapability(&major, &minor, *p);
 349 |     if (status != CUDA_SUCCESS) {
 350 |         RAISE_CU_STD_ERROR(status, "Failed to query device compute capability.");
 351 |     }
 352 |     VALUE h = rb_hash_new();
 353 |     rb_hash_aset(h, ID2SYM(rb_intern("major")), INT2FIX(major));
 354 |     rb_hash_aset(h, ID2SYM(rb_intern("minor")), INT2FIX(minor));
 355 |     return h;
 356 | }
 357 | 
 358 | /*  call-seq: dev.get_attribute(attribute)    ->    Fixnum
 359 |  *
 360 |  *  Return _attribute_ (CUDeviceAttribute) of _self_.
 361 |  *
 362 |  *      dev.get_attribute(CUDeviceAttribute::MAX_THREADS_PER_BLOCK)        #=> 512
 363 |  *      dev.get_attribute(CUDeviceAttribute::MULTIPROCESSOR_COUNT)         #=> 30
 364 |  *      dev.get_attribute(CUDeviceAttribute::MAX_SHARED_MEMORY_PER_BLOCK)  #=> 16384
 365 |  */
 366 | static VALUE device_get_attribute(VALUE self, VALUE attribute)
 367 | {
 368 |     CUdevice* p;
 369 |     Data_Get_Struct(self, CUdevice, p);
 370 |     int v;
 371 |     CUresult status = cuDeviceGetAttribute(&v, static_cast<CUdevice_attribute>(FIX2INT(attribute)), *p);
 372 |     if (status != CUDA_SUCCESS) {
 373 |         VALUE attributes = rb_funcall(rb_cCUDeviceAttribute, rb_intern("constants"), 0);
 374 |         VALUE ary[3] = { rb_cCUDeviceAttribute, attribute, Qnil };
 375 |         rb_block_call(attributes, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
 376 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to query device attribute: %s.", rb_id2name(SYM2ID(ary[2])));
 377 |     }
 378 |     return INT2FIX(v);
 379 | }
 380 | 
 381 | /*  call-seq: dev.get_properties    ->    Hash
 382 |  *
 383 |  *  Return the properties of _self_ in a hash with the following keys:
 384 |  *  * :clock_rate
 385 |  *  * :max_grid_size
 386 |  *  * :max_threads_dim
 387 |  *  * :max_threads_per_block
 388 |  *  * :mem_pitch
 389 |  *  * :regs_per_block
 390 |  *  * :shared_mem_per_block
 391 |  *  * :simd_width
 392 |  *  * :texture_align
 393 |  *  * :total_constant_memory
 394 |  */
 395 | static VALUE device_get_properties(VALUE self)
 396 | {
 397 |     CUdevice* pdevice;
 398 |     Data_Get_Struct(self, CUdevice, pdevice);
 399 |     CUdevprop prop;
 400 |     CUresult status = cuDeviceGetProperties(&prop, *pdevice);
 401 |     if (status != CUDA_SUCCESS) {
 402 |         RAISE_CU_STD_ERROR(status, "Failed to get device properties.");
 403 |     }
 404 | 
 405 |     VALUE max_grid_size = rb_ary_new3(3, INT2FIX(prop.maxGridSize[0]), INT2FIX(prop.maxGridSize[1]), INT2FIX(prop.maxGridSize[2]));
 406 |     VALUE max_threads_dim = rb_ary_new3(3, INT2FIX(prop.maxThreadsDim[0]), INT2FIX(prop.maxThreadsDim[1]), INT2FIX(prop.maxThreadsDim[2]));
 407 | 
 408 |     VALUE h = rb_hash_new();
 409 |     rb_hash_aset(h, ID2SYM(rb_intern("clock_rate")), INT2FIX(prop.clockRate));
 410 |     rb_hash_aset(h, ID2SYM(rb_intern("max_grid_size")), max_grid_size);
 411 |     rb_hash_aset(h, ID2SYM(rb_intern("max_threads_dim")), max_threads_dim);
 412 |     rb_hash_aset(h, ID2SYM(rb_intern("max_threads_per_block")), INT2FIX(prop.maxThreadsPerBlock));
 413 |     rb_hash_aset(h, ID2SYM(rb_intern("mem_pitch")), INT2FIX(prop.memPitch));
 414 |     rb_hash_aset(h, ID2SYM(rb_intern("regs_per_block")), INT2FIX(prop.regsPerBlock));
 415 |     rb_hash_aset(h, ID2SYM(rb_intern("shared_mem_per_block")), INT2FIX(prop.sharedMemPerBlock));
 416 |     rb_hash_aset(h, ID2SYM(rb_intern("simd_width")), INT2FIX(prop.SIMDWidth));
 417 |     rb_hash_aset(h, ID2SYM(rb_intern("texture_align")), INT2FIX(prop.textureAlign));
 418 |     rb_hash_aset(h, ID2SYM(rb_intern("total_constant_memory")), INT2FIX(prop.totalConstantMemory));
 419 |     return h;
 420 | }
 421 | 
 422 | /*  call-seq: dev.total_mem    ->    Numeric
 423 |  *
 424 |  *  Return the total amount of device memory in bytes.
 425 |  */
 426 | static VALUE device_total_mem(VALUE self)
 427 | {
 428 |     CUdevice* p;
 429 |     Data_Get_Struct(self, CUdevice, p);
 430 |     size_t nbytes;
 431 |     CUresult status = cuDeviceTotalMem(&nbytes, *p);
 432 |     if (status != CUDA_SUCCESS) {
 433 |         RAISE_CU_STD_ERROR(status, "Failed to get device total amount of memory available.");
 434 |     }
 435 |     return SIZET2NUM(nbytes);
 436 | }
 437 | 
 438 | // }}}
 439 | 
 440 | 
 441 | // {{{ CUcontext
 442 | 
 443 | static VALUE context_alloc(VALUE klass)
 444 | {
 445 |     CUcontext* p = new CUcontext;
 446 |     return Data_Wrap_Struct(klass, 0, generic_free<CUcontext>, p);
 447 | }
 448 | 
 449 | static VALUE context_initialize(int argc, VALUE* argv, VALUE self)
 450 | {
 451 |     return self;
 452 | }
 453 | 
 454 | /*  call-seq: ctx.create(device)           ->    self
 455 |  *            ctx.create(flags, device)    ->    self
 456 |  *
 457 |  *  Create a new CUDA context with _flags_ (CUContextFlags) and _device_ (CUDevice),
 458 |  *  then associate it with the calling thread, and return the context.
 459 |  *  Setting flags to 0 or ommitting flags uses SCHED_AUTO.
 460 |  *
 461 |  *      dev = CUDevice.get(0)
 462 |  *      ctx = CUContext.new
 463 |  *      ctx.create(dev)           #=>    ctx
 464 |  *      ctx.create(0, dev)        #=>    ctx
 465 |  *      ctx.create(CUContextFlags::SCHED_SPIN | CUContextFlags::BLOCKING_SYNC, dev)        #=>    ctx
 466 |  */
 467 | static VALUE context_create(int argc, VALUE* argv, VALUE self)
 468 | {
 469 |     if (argc <= 0 || argc > 2) {
 470 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc);
 471 |     }
 472 | 
 473 |     CUcontext* pcontext;
 474 |     CUdevice* pdevice;
 475 |     unsigned int flags = 0;
 476 |     Data_Get_Struct(self, CUcontext, pcontext);
 477 |     if (argc == 2) {
 478 |         flags = FIX2UINT(argv[0]);
 479 |         Data_Get_Struct(argv[1], CUdevice, pdevice);
 480 |     } else { // argc == 1
 481 |         Data_Get_Struct(argv[0], CUdevice, pdevice);
 482 |     }
 483 |     CUresult status = cuCtxCreate(pcontext, flags, *pdevice);
 484 |     if (status != CUDA_SUCCESS) {
 485 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create context: flags = 0x%x.", flags);
 486 |     }
 487 |     return self;
 488 | }
 489 | 
 490 | /*  call-seq: ctx.destroy    ->    nil
 491 |  *
 492 |  *  Destroy the CUDA context _self_.
 493 |  */
 494 | static VALUE context_destroy(VALUE self)
 495 | {
 496 |     CUcontext* p;
 497 |     Data_Get_Struct(self, CUcontext, p);
 498 |     CUresult status = cuCtxDestroy(*p);
 499 |     if (status != CUDA_SUCCESS) {
 500 |         RAISE_CU_STD_ERROR(status, "Failed to destroy context.");
 501 |     }
 502 |     return Qnil;
 503 | }
 504 | 
 505 | /*  call-seq: ctx.attach           ->    self
 506 |  *            ctx.attach(flags)    ->    self
 507 |  *
 508 |  *  Increment the reference count on _self_.
 509 |  *  Currently, _flags_ must be set to 0.
 510 |  */
 511 | static VALUE context_attach(int argc, VALUE* argv, VALUE self)
 512 | {
 513 |     CUcontext* p;
 514 |     unsigned int flags = 0;
 515 |     Data_Get_Struct(self, CUcontext, p);
 516 |     if (argc == 1) {
 517 |         flags = FIX2UINT(argv[0]);
 518 |     }
 519 |     CUresult status = cuCtxAttach(p, flags);
 520 |     if (status != CUDA_SUCCESS) {
 521 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to attach context: flags = 0x%x.", flags);
 522 |     }
 523 |     return self;
 524 | }
 525 | 
 526 | 
 527 | /*  call-seq: ctx.detach    ->    nil
 528 |  *
 529 |  *  Decrement the reference count on _self_.
 530 |  */
 531 | static VALUE context_detach(VALUE self)
 532 | {
 533 |     CUcontext* p;
 534 |     Data_Get_Struct(self, CUcontext, p);
 535 |     CUresult status = cuCtxDetach(*p);
 536 |     if (status != CUDA_SUCCESS) {
 537 |         RAISE_CU_STD_ERROR(status, "Failed to detach context.");
 538 |     }
 539 |     return Qnil;
 540 | }
 541 | 
 542 | /*  call-seq: ctx.push_current    ->    self
 543 |  *
 544 |  *  Push _self_ onto the context stack, which becomes currently active context.
 545 |  */
 546 | static VALUE context_push_current(VALUE self)
 547 | {
 548 |     CUcontext* p;
 549 |     Data_Get_Struct(self, CUcontext, p);
 550 |     CUresult status = cuCtxPushCurrent(*p);
 551 |     if (status != CUDA_SUCCESS) {
 552 |         RAISE_CU_STD_ERROR(status, "Failed to push this context.");
 553 |     }
 554 |     return self;
 555 | }
 556 | 
 557 | /*  call-seq: ctx.get_api_version    ->     Numeric
 558 |  *
 559 |  *  Return the API version used to create _self_.
 560 |  */
 561 | static VALUE context_get_api_version(VALUE self)
 562 | {
 563 |     CUcontext* p;
 564 |     Data_Get_Struct(self, CUcontext, p);
 565 |     unsigned int version;
 566 |     CUresult status = cuCtxGetApiVersion(*p, &version);
 567 |     if (status != CUDA_SUCCESS) {
 568 |         RAISE_CU_STD_ERROR(status, "Failed to get the API version of this context.");
 569 |     }
 570 |     return UINT2NUM(version);
 571 | }
 572 | 
 573 | /*  call-seq: CUContext.get_api_version    ->    Numeric
 574 |  *
 575 |  *  Return the API version used to create current context.
 576 |  */
 577 | static VALUE context_get_api_version_singleton(VALUE klass)
 578 | {
 579 |     unsigned int version;
 580 |     CUresult status = cuCtxGetApiVersion(NULL, &version);
 581 |     if (status != CUDA_SUCCESS) {
 582 |         RAISE_CU_STD_ERROR(status, "Failed to get the API version of current context.");
 583 |     }
 584 |     return UINT2NUM(version);
 585 | }
 586 | 
 587 | /*  call-seq: CUContext.get_device    ->    CUDevice
 588 |  *
 589 |  *  Return the device associated to the current CUDA context.
 590 |  */
 591 | static VALUE context_get_device(VALUE klass)
 592 | {
 593 |     VALUE device = rb_class_new_instance(0, NULL, rb_cCUDevice);
 594 |     CUdevice* pdevice;
 595 |     Data_Get_Struct(device, CUdevice, pdevice);
 596 |     CUresult status = cuCtxGetDevice(pdevice);
 597 |     if (status != CUDA_SUCCESS) {
 598 |         RAISE_CU_STD_ERROR(status, "Failed to get current context's device.");
 599 |     }
 600 |     return device;
 601 | }
 602 | 
 603 | /*  call-seq: CUContext.get_limit(limit)    ->    Numeric
 604 |  *
 605 |  *  Return the _limit_ (CULimit) of the current CUDA context.
 606 |  *
 607 |  *      CUContext.get_limit(CULimit::STACK_SIZE)        #=>    8192
 608 |  */
 609 | static VALUE context_get_limit(VALUE klass, VALUE limit)
 610 | {
 611 |     CUlimit l = static_cast<CUlimit>(FIX2UINT(limit));
 612 |     size_t v = 0;
 613 |     CUresult status = cuCtxGetLimit(&v, l);
 614 |     if (status != CUDA_SUCCESS) {
 615 |         VALUE limits = rb_funcall(rb_cCULimit, rb_intern("constants"), 0);
 616 |         VALUE ary[3] = { rb_cCULimit, limit, Qnil };
 617 |         rb_block_call(limits, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
 618 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get context limit: %s.", rb_id2name(SYM2ID(ary[2])));
 619 |     }
 620 |     return SIZET2NUM(v);
 621 | }
 622 | 
 623 | /*  call-seq: CUContext.set_limit(limit, value)    ->    nil
 624 |  *
 625 |  *  Set the _limit_ (CULimit) of the current CUDA context.
 626 |  *
 627 |  *      CUContext.set_limit(CULimit::STACK_SIZE, 8192)        #=>    nil
 628 |  */
 629 | static VALUE context_set_limit(VALUE klass, VALUE limit, VALUE value)
 630 | {
 631 |     CUlimit l = static_cast<CUlimit>(FIX2UINT(limit));
 632 |     CUresult status = cuCtxSetLimit(l, NUM2SIZET(value));
 633 |     if (status != CUDA_SUCCESS) {
 634 |         VALUE limits = rb_funcall(rb_cCULimit, rb_intern("constants"), 0);
 635 |         VALUE ary[3] = { rb_cCULimit, limit, Qnil };
 636 |         rb_block_call(limits, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
 637 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set context limit: %s to %lu.", rb_id2name(SYM2ID(ary[2])), NUM2SIZET(value));
 638 |     }
 639 |     return Qnil;
 640 | }
 641 | 
 642 | /*  call-seq: CUContext.get_cache_config        ->    CUFunctionCache
 643 |  *
 644 |  *  Return the cache config of the current CUDA context.
 645 |  *
 646 |  *      CUContext.get_cache_config        #=> 1
 647 |  */
 648 | static VALUE context_get_cache_config(VALUE klass)
 649 | {
 650 |     CUfunc_cache config;
 651 |     CUresult status = cuCtxGetCacheConfig(&config);
 652 |     if (status != CUDA_SUCCESS) {
 653 |         RAISE_CU_STD_ERROR(status, "Failed to get context cache config.");
 654 |     }
 655 |     return UINT2NUM(static_cast<unsigned int>(config));
 656 | }
 657 | 
 658 | /*  call-seq: CUContext.set_cache_config(config)        ->    nil
 659 |  *
 660 |  *  Set the cache with _config_ (CUFunctionCache) for the current CUDA context.
 661 |  *
 662 |  *      CUContext.set_cache_config(CUFunctionCache::PREFER_SHARED)        #=> nil
 663 |  */
 664 | static VALUE context_set_cache_config(VALUE klass, VALUE config)
 665 | {
 666 |     CUresult status = cuCtxSetCacheConfig(static_cast<CUfunc_cache>(FIX2UINT(config)));
 667 |     if (status != CUDA_SUCCESS) {
 668 |         VALUE configs = rb_funcall(rb_cCUFunctionCache, rb_intern("constants"), 0);
 669 |         VALUE ary[3] = { rb_cCUFunctionCache, config, Qnil };
 670 |         rb_block_call(configs, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
 671 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set context cache config: %s.", rb_id2name(SYM2ID(ary[2])));
 672 |     }
 673 |     return Qnil;
 674 | }
 675 | 
 676 | /*  call-seq: CUContext.pop_current        ->    CUContext
 677 |  *
 678 |  *  Pop the current CUDA context from the context stack, which becomes inactive.
 679 |  */
 680 | static VALUE context_pop_current(VALUE klass)
 681 | {
 682 |     VALUE context = rb_class_new_instance(0, NULL, rb_cCUContext);
 683 |     CUcontext* pcontext;
 684 |     Data_Get_Struct(context, CUcontext, pcontext);
 685 |     CUresult status = cuCtxPopCurrent(pcontext);
 686 |     if (status != CUDA_SUCCESS) {
 687 |         RAISE_CU_STD_ERROR(status, "Failed to pop current context.");
 688 |     }
 689 |     return context;
 690 | }
 691 | 
 692 | /*  call-seq: CUContext.synchronize        ->    nil
 693 |  *
 694 |  *  Block until all the tasks of the current CUDA context complete.
 695 |  */
 696 | static VALUE context_synchronize(VALUE klass)
 697 | {
 698 |     CUresult status = cuCtxSynchronize();
 699 |     if (status != CUDA_SUCCESS) {
 700 |         RAISE_CU_STD_ERROR(status, "Failed to synchronize this context.");
 701 |     }
 702 |     return Qnil;
 703 | }
 704 | 
 705 | // }}}
 706 | 
 707 | 
 708 | // {{{ CUmodule
 709 | 
 710 | static VALUE module_alloc(VALUE klass)
 711 | {
 712 |     CUmodule* p = new CUmodule;
 713 |     return Data_Wrap_Struct(klass, 0, generic_free<CUmodule>, p);
 714 | }
 715 | 
 716 | static VALUE module_initialize(int argc, VALUE* argv, VALUE self)
 717 | {
 718 |     return self;
 719 | }
 720 | 
 721 | /*  call-seq: mod.load(path)    ->    self
 722 |  *
 723 |  *  Load a compute module from the file at _path_ into the current CUDA context.
 724 |  *  The file should be a cubin file or a PTX file.
 725 |  *
 726 |  *  A PTX file may be obtained by compiling the .cu file using nvcc with -ptx option.
 727 |  *      $ nvcc -ptx vadd.cu
 728 |  */
 729 | static VALUE module_load(VALUE self, VALUE str)
 730 | {
 731 |     CUmodule* p;
 732 |     Data_Get_Struct(self, CUmodule, p);
 733 |     CUresult status = cuModuleLoad(p, StringValuePtr(str));
 734 |     if (status != CUDA_SUCCESS) {
 735 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to load module: %s.", StringValuePtr(str));
 736 |     }
 737 |     return self;
 738 | }
 739 | 
 740 | /*  call-seq: mod.load_data(image_str)    ->    self
 741 |  *
 742 |  *  Load a compute module from the String _image_str_ which contains a cubin or a PTX data
 743 |  *  into the current CUDA context.
 744 |  *
 745 |  *  <br /> See also CUModule#load.
 746 |  */
 747 | static VALUE module_load_data(VALUE self, VALUE image)
 748 | {
 749 |     CUmodule* p;
 750 |     Data_Get_Struct(self, CUmodule, p);
 751 |     CUresult status = cuModuleLoadData(p, StringValuePtr(image));
 752 |     if (status != CUDA_SUCCESS) {
 753 |         RAISE_CU_STD_ERROR(status, "Failed to load module data.");
 754 |     }
 755 |     return self;
 756 | }
 757 | 
 758 | /*  call-seq: mod.unload    ->    self
 759 |  *
 760 |  *  Unload _self_ from the current CUDA context.
 761 |  */
 762 | static VALUE module_unload(VALUE self)
 763 | {
 764 |     CUmodule* p;
 765 |     Data_Get_Struct(self, CUmodule, p);
 766 |     CUresult status = cuModuleUnload(*p);
 767 |     if (status != CUDA_SUCCESS) {
 768 |         RAISE_CU_STD_ERROR(status, "Failed to unload module.");
 769 |     }
 770 |     return self;
 771 | }
 772 | 
 773 | /*  call-seq: mod.get_function(name_str)    ->    CUFunction
 774 |  *
 775 |  *  Return a CUFunction instance corresponding to the function name _name_str_ in the loaded compute module.
 776 |  *  A compute module was loaded with CUModule#load and alike methods.
 777 |  */
 778 | static VALUE module_get_function(VALUE self, VALUE str)
 779 | {
 780 |     CUmodule* p;
 781 |     Data_Get_Struct(self, CUmodule, p);
 782 |     CUfunction* pfunc = new CUfunction;
 783 |     CUresult status = cuModuleGetFunction(pfunc, *p, StringValuePtr(str));
 784 |     if (status != CUDA_SUCCESS) {
 785 |         delete pfunc;
 786 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module function: %s.", StringValuePtr(str));
 787 |     }
 788 |     return Data_Wrap_Struct(rb_cCUFunction, 0, generic_free<CUfunction>, pfunc);
 789 | }
 790 | 
 791 | /*  call-seq: mod.get_global(name_str)    ->    [CUDevicePtr, Numeric]
 792 |  *
 793 |  *  Return the CUDevicePtr corresponding to the global variable in the loaded compute module and its size in bytes.
 794 |  */
 795 | static VALUE module_get_global(VALUE self, VALUE str)
 796 | {
 797 |     CUmodule* p;
 798 |     Data_Get_Struct(self, CUmodule, p);
 799 |     VALUE rb_devptr = device_ptr_alloc(rb_cCUDevicePtr);
 800 |     device_ptr_initialize(0, NULL, rb_devptr);
 801 |     CUdeviceptr* pdevptr;
 802 |     Data_Get_Struct(rb_devptr, CUdeviceptr, pdevptr);
 803 |     size_t nbytes;
 804 |     CUresult status = cuModuleGetGlobal(pdevptr, &nbytes, *p, StringValuePtr(str));
 805 |     if (status != CUDA_SUCCESS) {
 806 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module global: %s.", StringValuePtr(str));
 807 |     }
 808 |     return rb_ary_new3(2, rb_devptr, SIZET2NUM(nbytes));
 809 | }
 810 | 
 811 | /*  call-seq: mod.get_texref(name_str)    ->    CUTexRef
 812 |  *
 813 |  *  Return a CUTexRef instance corresponding to the texture name _name_str_ in the loaded compute module.
 814 |  */
 815 | static VALUE module_get_texref(VALUE self, VALUE str)
 816 | {
 817 |     CUmodule* pmodule;
 818 |     CUtexref* ptexref;
 819 |     Data_Get_Struct(self, CUmodule, pmodule);
 820 |     VALUE rb_texref = rb_class_new_instance(0, NULL, rb_cCUTexRef);
 821 |     Data_Get_Struct(rb_texref, CUtexref, ptexref);
 822 |     CUresult status = cuModuleGetTexRef(ptexref, *pmodule, StringValuePtr(str));
 823 |     if (status != CUDA_SUCCESS) {
 824 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get module texture reference: %s.", StringValuePtr(str));
 825 |     }
 826 |     return rb_texref;
 827 | }
 828 | 
 829 | // }}}
 830 | 
 831 | 
 832 | // {{{ CUdeviceptr
 833 | 
 834 | static VALUE device_ptr_alloc(VALUE klass)
 835 | {
 836 |     CUdeviceptr* p = new CUdeviceptr;
 837 |     return Data_Wrap_Struct(klass, 0, generic_free<CUdeviceptr>, p);
 838 | }
 839 | 
 840 | static VALUE device_ptr_initialize(int argc, VALUE* argv, VALUE self)
 841 | {
 842 |     CUdeviceptr* p;
 843 |     Data_Get_Struct(self, CUdeviceptr, p);
 844 |     *p = static_cast<CUdeviceptr>(0);
 845 |     return self;
 846 | }
 847 | 
 848 | /*  call-seq: devptr.offset(offset)    ->    CUDevicePtr
 849 |  *
 850 |  *  Return a CUDevicePtr instance pointing to the memory location _offset_ (bytes) from _self_.
 851 |  */
 852 | static VALUE device_ptr_offset(VALUE self, VALUE offset)
 853 | {
 854 |     CUdeviceptr* pdevptr;
 855 |     CUdeviceptr* pdevptr_offset;
 856 |     Data_Get_Struct(self, CUdeviceptr, pdevptr);
 857 |     VALUE rb_pdevptr_offset = rb_class_new_instance(0, NULL, rb_cCUDevicePtr);
 858 |     Data_Get_Struct(rb_pdevptr_offset, CUdeviceptr, pdevptr_offset);
 859 |     *pdevptr_offset = *pdevptr + NUM2UINT(offset);
 860 |     return rb_pdevptr_offset;
 861 | }
 862 | 
 863 | /*  call-seq: devptr.mem_alloc(nbytes)    ->    self
 864 |  *
 865 |  *  Allocate _nbytes_ device memory and let _self_ points to this allocated memory.
 866 |  */
 867 | static VALUE device_ptr_mem_alloc(VALUE self, VALUE nbytes)
 868 | {
 869 |     CUdeviceptr* p;
 870 |     Data_Get_Struct(self, CUdeviceptr, p);
 871 |     CUresult status = cuMemAlloc(p, NUM2UINT(nbytes));
 872 |     if (status != CUDA_SUCCESS) {
 873 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to allocate memory: size = %u.", NUM2UINT(nbytes));
 874 |     }
 875 |     return self;
 876 | }
 877 | 
 878 | /*  call-seq: devptr.mem_free    ->    self
 879 |  *
 880 |  *  Free the allocated device memory _self_ pointing to.
 881 |  */
 882 | static VALUE device_ptr_mem_free(VALUE self)
 883 | {
 884 |     CUdeviceptr* p;
 885 |     Data_Get_Struct(self, CUdeviceptr, p);
 886 |     CUresult status = cuMemFree(*p);
 887 |     if (status != CUDA_SUCCESS) {
 888 |         RAISE_CU_STD_ERROR(status, "Failed to free memory.");
 889 |     }
 890 |     return self;
 891 | }
 892 | 
 893 | // }}}
 894 | 
 895 | 
 896 | // {{{ CUfunction
 897 | 
 898 | static VALUE function_alloc(VALUE klass)
 899 | {
 900 |     CUfunction* p = new CUfunction;
 901 |     return Data_Wrap_Struct(klass, 0, generic_free<CUfunction>, p);
 902 | }
 903 | 
 904 | static VALUE function_initialize(int argc, VALUE* argv, VALUE self)
 905 | {
 906 |     return self;
 907 | }
 908 | 
 909 | /*  call-seq: func.set_param(arg1, arg2, *other_args)    ->    self
 910 |  *
 911 |  *  Set the argument list of _self_ to _arg1_, _arg2_, *other_args.
 912 |  */
 913 | static VALUE function_set_param(int argc, VALUE* argv, VALUE self)
 914 | {
 915 |     #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
 916 | 
 917 |     int offset = 0;
 918 |     CUfunction* pfunc;
 919 |     Data_Get_Struct(self, CUfunction, pfunc);
 920 | 
 921 |     CUresult status = CUDA_ERROR_UNKNOWN;
 922 |     for (int i = 0; i < argc; ++i) {
 923 |         if (CLASS_OF(argv[i]) == rb_cCUDevicePtr) {
 924 |             CUdeviceptr* p;
 925 |             Data_Get_Struct(argv[i], CUdeviceptr, p);
 926 |             ALIGN_UP(offset, __alignof(*p));
 927 |             status = cuParamSetv(*pfunc, offset, p, sizeof(*p));
 928 |             if (status != CUDA_SUCCESS) break;
 929 |             offset += sizeof(*p);
 930 |         } else if (CLASS_OF(argv[i]) == rb_cFixnum) {
 931 |             int num = FIX2INT(argv[i]);
 932 |             ALIGN_UP(offset, __alignof(num));
 933 |             status = cuParamSeti(*pfunc, offset, num);
 934 |             if (status != CUDA_SUCCESS) break;
 935 |             offset += sizeof(int);
 936 |         } else if (CLASS_OF(argv[i]) == rb_cFloat) {
 937 |             float num = static_cast<float>(NUM2DBL(argv[i]));
 938 |             ALIGN_UP(offset, __alignof(num));
 939 |             status = cuParamSetf(*pfunc, offset, num);
 940 |             if (status != CUDA_SUCCESS) break;
 941 |             offset += sizeof(float);
 942 |         } else {
 943 |             rb_raise(rb_eArgError, "Invalid type of argument %d.", i+1);
 944 |         }
 945 |     }
 946 |     if (argc > 0 && status != CUDA_SUCCESS) {
 947 |         RAISE_CU_STD_ERROR(status, "Failed to set function parameters.");
 948 |     }
 949 | 
 950 |     status = cuParamSetSize(*pfunc, offset);
 951 |     if (status != CUDA_SUCCESS) {
 952 |         RAISE_CU_STD_ERROR(status, "Failed to set function parameter size.");
 953 |     }
 954 |     return self;
 955 | }
 956 | 
 957 | /*  call-seq: func.set_texref(texref)    ->    self
 958 |  *
 959 |  *  Add the _texref_ to the argument list of _self_.
 960 |  *
 961 |  *  Note: This method is *deprecated*. This is no longer necessary.
 962 |  */
 963 | static VALUE function_set_texref(VALUE self, VALUE texref)
 964 | {
 965 |     rb_warn("CUFunction#set_texref is deprecated.");
 966 |     CUfunction* pfunc;
 967 |     CUtexref* ptexref;
 968 |     Data_Get_Struct(self, CUfunction, pfunc);
 969 |     Data_Get_Struct(texref, CUtexref, ptexref);
 970 |     CUresult status = cuParamSetTexRef(*pfunc, CU_PARAM_TR_DEFAULT, *ptexref);
 971 |     if (status != CUDA_SUCCESS) {
 972 |         RAISE_CU_STD_ERROR(status, "Failed to set function texture reference.");
 973 |     }
 974 |     return self;
 975 | }
 976 | 
 977 | /*  call-seq: func.set_block_shape(xdim)                ->    self
 978 |  *            func.set_block_shape(xdim, ydim)          ->    self
 979 |  *            func.set_block_shape(xdim, ydim, zdim)    ->    self
 980 |  *
 981 |  *  Set the block dimensions to use for next launch. _ydim_ and _zdim_ which may be omitted are default to 1.
 982 |  */
 983 | static VALUE function_set_block_shape(int argc, VALUE* argv, VALUE self)
 984 | {
 985 |     if (argc <= 0 || argc > 3) {
 986 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 to 3 integers).", argc);
 987 |     }
 988 | 
 989 |     CUfunction* pfunc;
 990 |     Data_Get_Struct(self, CUfunction, pfunc);
 991 | 
 992 |     int xdim = FIX2INT(argv[0]);
 993 |     int ydim = 1;
 994 |     int zdim = 1;
 995 | 
 996 |     if (argc >= 2) {
 997 |         ydim = FIX2INT(argv[1]);
 998 |     }
 999 |     if (argc >= 3) {
1000 |         zdim = FIX2INT(argv[2]);
1001 |     }
1002 | 
1003 |     CUresult status = cuFuncSetBlockShape(*pfunc, xdim, ydim, zdim);
1004 |     if (status != CUDA_SUCCESS) {
1005 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function block shape: (x,y,z) = (%d,%d,%d).", xdim, ydim, zdim);
1006 |     }
1007 |     return self;
1008 | }
1009 | 
1010 | /*  call-seq: func.set_shared_size(nbytes)    ->    self
1011 |  *
1012 |  *  Set the dynamic shared-memory size to use for next launch.
1013 |  */
1014 | static VALUE function_set_shared_size(VALUE self, VALUE nbytes)
1015 | {
1016 |     CUfunction* p;
1017 |     Data_Get_Struct(self, CUfunction, p);
1018 |     CUresult status = cuFuncSetSharedSize(*p, NUM2UINT(nbytes));
1019 |     if (status != CUDA_SUCCESS) {
1020 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function shared memory size: %u.", NUM2UINT(nbytes));
1021 |     }
1022 |     return self;
1023 | }
1024 | 
1025 | /*  call-seq: func.launch    ->    self
1026 |  *
1027 |  *  Launch _self_ to execute on a CUDA device.
1028 |  */
1029 | static VALUE function_launch(VALUE self)
1030 | {
1031 |     CUfunction* p;
1032 |     Data_Get_Struct(self, CUfunction, p);
1033 |     CUresult status = cuLaunch(*p);
1034 |     if (status != CUDA_SUCCESS) {
1035 |         RAISE_CU_STD_ERROR(status, "Failed to launch kernel function on 1x1x1 grid of blocks.");
1036 |     }
1037 |     return self;
1038 | }
1039 | 
1040 | /*  call-seq: func.launch_grid(xdim)          ->    self
1041 |  *            func.launch_grid(xdim, ydim)    ->    self
1042 |  *
1043 |  *  Launch _self_ with grid dimensions (xdim, ydim) to execute on a CUDA device.
1044 |  *  _ydim_ which may be omitted is default to 1.
1045 |  */
1046 | static VALUE function_launch_grid(int argc, VALUE* argv, VALUE self)
1047 | {
1048 |     if (argc <= 0 || argc > 2) {
1049 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2 integers).", argc);
1050 |     }
1051 | 
1052 |     CUfunction* pfunc;
1053 |     Data_Get_Struct(self, CUfunction, pfunc);
1054 | 
1055 |     int xdim = FIX2INT(argv[0]);
1056 |     int ydim = 1;
1057 | 
1058 |     if (argc >= 2) {
1059 |         ydim = FIX2INT(argv[1]);
1060 |     }
1061 | 
1062 |     CUresult status = cuLaunchGrid(*pfunc, xdim, ydim);
1063 |     if (status != CUDA_SUCCESS) {
1064 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to launch kernel function on %dx%d grid of blocks.", xdim, ydim);
1065 |     }
1066 |     return self;
1067 | }
1068 | 
1069 | /*  call-seq: func.launch_grid_async(xdim, stream)          ->    self
1070 |  *            func.launch_grid_async(xdim, ydim, stream)    ->    self
1071 |  *
1072 |  *  Launch _self_ with grid dimensions (xdim, ydim) on _stream_ asynchronously to execute on a CUDA device.
1073 |  *  _ydim_ which may be omitted is default to 1. Setting _stream_ to anything other than an instance of CUStream
1074 |  *  will execute on the default stream 0.
1075 |  */
1076 | static VALUE function_launch_grid_async(int argc, VALUE* argv, VALUE self)
1077 | {
1078 |     if (argc < 2 || argc > 3) {
1079 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 2 or 3).", argc);
1080 |     }
1081 | 
1082 |     CUfunction* pfunc;
1083 |     CUstream *pstream = NULL;
1084 |     CUstream stream0 = 0;
1085 |     Data_Get_Struct(self, CUfunction, pfunc);
1086 | 
1087 |     int xdim = FIX2INT(argv[0]);
1088 |     int ydim = 1;
1089 | 
1090 |     if (argc == 2) {
1091 |         if (CLASS_OF(argv[1]) == rb_cCUStream) {
1092 |             Data_Get_Struct(argv[1], CUstream, pstream);
1093 |         } else {
1094 |             pstream = &stream0;
1095 |         }
1096 |     } else if (argc == 3) {
1097 |         ydim = FIX2INT(argv[1]);
1098 |         if (CLASS_OF(argv[2]) == rb_cCUStream) {
1099 |             Data_Get_Struct(argv[2], CUstream, pstream);
1100 |         } else {
1101 |             pstream = &stream0;
1102 |         }
1103 |     }
1104 | 
1105 |     CUresult status = cuLaunchGridAsync(*pfunc, xdim, ydim, *pstream);
1106 |     if (status != CUDA_SUCCESS) {
1107 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to launch kernel function asynchronously on %dx%d grid of blocks.", xdim, ydim);
1108 |     }
1109 |     return self;
1110 | }
1111 | 
1112 | /*  call-seq: func.get_attribute(attribute)    ->    Fixnum
1113 |  *
1114 |  *  Return _attribute_ (CUFunctionAttribute) of _self_.
1115 |  *
1116 |  *      func.get_attribute(CUFunctionAttribute::MAX_THREADS_PER_BLOCK)    #=> 512
1117 |  *      func.get_attribute(CUFunctionAttribute::SHARED_SIZE_BYTES)        #=> 44
1118 |  *      func.get_attribute(CUFunctionAttribute::NUM_REGS)                 #=> 3
1119 |  */
1120 | static VALUE function_get_attribute(VALUE self, VALUE attribute)
1121 | {
1122 |     CUfunction* p;
1123 |     Data_Get_Struct(self, CUfunction, p);
1124 |     int v;
1125 |     CUresult status = cuFuncGetAttribute(&v, static_cast<CUfunction_attribute>(FIX2INT(attribute)), *p);
1126 |     if (status != CUDA_SUCCESS) {
1127 |         VALUE attributes = rb_funcall(rb_cCUFunctionAttribute, rb_intern("constants"), 0);
1128 |         VALUE ary[3] = { rb_cCUFunctionAttribute, attribute, Qnil };
1129 |         rb_block_call(attributes, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
1130 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to query function attribute: %s.", rb_id2name(SYM2ID(ary[2])));
1131 |     }
1132 |     return INT2FIX(v);
1133 | }
1134 | 
1135 | /*  call-seq: func.set_cache_config(config)    ->    self
1136 |  *
1137 |  *  Set the preferred cache configuration (CUFunctionCache) to use for next launch.
1138 |  */
1139 | static VALUE function_set_cache_config(VALUE self, VALUE config)
1140 | {
1141 |     CUfunction* p;
1142 |     Data_Get_Struct(self, CUfunction, p);
1143 |     CUresult status = cuFuncSetCacheConfig(*p, static_cast<CUfunc_cache>(FIX2UINT(config)));
1144 |     if (status != CUDA_SUCCESS) {
1145 |         VALUE configs = rb_funcall(rb_cCUFunctionCache, rb_intern("constants"), 0);
1146 |         VALUE ary[3] = { rb_cCUFunctionCache, config, Qnil };
1147 |         rb_block_call(configs, rb_intern("find"), 0, NULL, RUBY_METHOD_FUNC(class_const_match), (VALUE)ary);
1148 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set function cache config: %s.", rb_id2name(SYM2ID(ary[2])));
1149 |     }
1150 |     return self;
1151 | }
1152 | 
1153 | // }}}
1154 | 
1155 | 
1156 | // {{{ CUstream
1157 | 
1158 | static VALUE stream_alloc(VALUE klass)
1159 | {
1160 |     CUstream* p = new CUstream;
1161 |     return Data_Wrap_Struct(klass, 0, generic_free<CUstream>, p);
1162 | }
1163 | 
1164 | static VALUE stream_initialize(VALUE self)
1165 | {
1166 |     return self;
1167 | }
1168 | 
1169 | /*  call-seq: stream.create           ->    self
1170 |  *            stream.create(flags)    ->    self
1171 |  *
1172 |  *  Create a stream and set _self_ to this stream. Currently, _flags_ must be set to 0.
1173 |  */
1174 | static VALUE stream_create(int argc, VALUE* argv, VALUE self)
1175 | {
1176 |     if (argc < 0 || argc > 1) {
1177 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 0 or 1).", argc);
1178 |     }
1179 | 
1180 |     CUstream* p;
1181 |     unsigned int flags = 0;
1182 |     Data_Get_Struct(self, CUstream, p);
1183 |     if (argc == 1) {
1184 |         flags = FIX2UINT(argv[0]);
1185 |     }
1186 |     CUresult status = cuStreamCreate(p, flags);
1187 |     if (status != CUDA_SUCCESS) {
1188 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create stream: flags = 0x%x", flags);
1189 |     }
1190 |     return self;
1191 | }
1192 | 
1193 | /*  call-seq: stream.destroy    ->    nil
1194 |  *
1195 |  *  Destroy the stream _self_.
1196 |  */
1197 | static VALUE stream_destroy(VALUE self)
1198 | {
1199 |     CUstream* p;
1200 |     Data_Get_Struct(self, CUstream, p);
1201 |     CUresult status = cuStreamDestroy(*p);
1202 |     if (status != CUDA_SUCCESS) {
1203 |         RAISE_CU_STD_ERROR(status, "Failed to destroy stream.");
1204 |     }
1205 |     return Qnil;
1206 | }
1207 | 
1208 | /*  call-seq: stream.query    ->    true or false
1209 |  *
1210 |  *  Return true if all operations in _self_ have completed. Otherwise, return false.
1211 |  */
1212 | static VALUE stream_query(VALUE self)
1213 | {
1214 |     CUstream* p;
1215 |     Data_Get_Struct(self, CUstream, p);
1216 |     CUresult status = cuStreamQuery(*p);
1217 |     if (status == CUDA_SUCCESS) {
1218 |         return Qtrue;
1219 |     } else if (status == CUDA_ERROR_NOT_READY) {
1220 |         return Qfalse;
1221 |     } else {
1222 |         RAISE_CU_STD_ERROR(status, "Failed to query stream.");
1223 |     }
1224 | }
1225 | 
1226 | /*  call-seq: stream.synchronize    ->    self
1227 |  *
1228 |  *  Block until all operations in _self_ complete.
1229 |  */
1230 | static VALUE stream_synchronize(VALUE self)
1231 | {
1232 |     CUstream* p;
1233 |     Data_Get_Struct(self, CUstream, p);
1234 |     CUresult status = cuStreamSynchronize(*p);
1235 |     if (status != CUDA_SUCCESS) {
1236 |         RAISE_CU_STD_ERROR(status, "Failed to synchronize stream.");
1237 |     }
1238 |     return self;
1239 | }
1240 | 
1241 | /*  call-seq: stream.wait_event(event)           ->    self
1242 |  *            stream.wait_event(event, flags)    ->    self
1243 |  *
1244 |  *  Let all future operations submitted to _self_ wait until _event_ (CUEvent) complete before beginning execution.
1245 |  *  Currently, _flags_ must be 0.
1246 |  */
1247 | static VALUE stream_wait_event(int argc, VALUE* argv, VALUE self)
1248 | {
1249 |     if (argc <= 0 || argc > 2) {
1250 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc);
1251 |     }
1252 | 
1253 |     CUstream* pstream;
1254 |     CUevent* pevent;
1255 |     unsigned int flags = 0;
1256 |     Data_Get_Struct(self, CUstream, pstream);
1257 |     Data_Get_Struct(argv[0], CUevent, pevent);
1258 |     if (argc == 2) {
1259 |         flags = FIX2UINT(argv[1]);
1260 |     }
1261 |     CUresult status = cuStreamWaitEvent(*pstream, *pevent, flags);
1262 |     if (status != CUDA_SUCCESS) {
1263 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to make stream's future operations to wait event: flags = 0x%x", flags);
1264 |     }
1265 |     return self;
1266 | }
1267 | 
1268 | /*  call-seq: CUStream.wait_event(event)           ->    nil
1269 |  *            CUStream.wait_event(event, flags)    ->    nil
1270 |  *
1271 |  *  Let all future operations submitted to stream 0 (NULL stream) wait until _event_ (CUEvent) complete before beginning execution.
1272 |  *  Currently, _flags_ must be 0.
1273 |  */
1274 | static VALUE stream_wait_event_singleton(int argc, VALUE* argv, VALUE klass)
1275 | {
1276 |     if (argc <= 0 || argc > 2) {
1277 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc);
1278 |     }
1279 | 
1280 |     CUevent* pevent;
1281 |     unsigned int flags = 0;
1282 |     Data_Get_Struct(argv[0], CUevent, pevent);
1283 |     if (argc == 2) {
1284 |         flags = FIX2UINT(argv[1]);
1285 |     }
1286 |     CUresult status = cuStreamWaitEvent(0, *pevent, flags);
1287 |     if (status != CUDA_SUCCESS) {
1288 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to make current stream's future operations to wait event: flags = 0x%x", flags);
1289 |     }
1290 |     return Qnil;
1291 | }
1292 | 
1293 | // }}}
1294 | 
1295 | 
1296 | // {{{ CUevent
1297 | 
1298 | static VALUE event_alloc(VALUE klass)
1299 | {
1300 |     CUevent* p = new CUevent;
1301 |     return Data_Wrap_Struct(klass, 0, generic_free<CUevent>, p);
1302 | }
1303 | 
1304 | static VALUE event_initialize(VALUE self)
1305 | {
1306 |     return self;
1307 | }
1308 | 
1309 | /*  call-seq: event.create           ->    self
1310 |  *            event.create(flags)    ->    self
1311 |  *
1312 |  *  Create an event with _flags_ (CUEventFlags) and set _self_ to this event.
1313 |  *  The _flags_ is default to CUEventFlags::DEFAULT.
1314 |  *
1315 |  *      event.create                                     #=> self
1316 |  *      event.create(CUEventFlags::DEFAULT)              #=> self
1317 |  *      event.create(CUEventFlags::BLOCKING_SYNC)        #=> self
1318 |  */
1319 | static VALUE event_create(int argc, VALUE* argv, VALUE self)
1320 | {
1321 |     if (argc < 0 || argc > 1) {
1322 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 0 or 1).", argc);
1323 |     }
1324 | 
1325 |     CUevent* p;
1326 |     unsigned int flags = CU_EVENT_DEFAULT;
1327 |     Data_Get_Struct(self, CUevent, p);
1328 |     if (argc == 1) {
1329 |         flags = FIX2UINT(argv[0]);
1330 |     }
1331 |     CUresult status = cuEventCreate(p, flags);
1332 |     if (status != CUDA_SUCCESS) {
1333 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to create event: flags = 0x%x.", flags);
1334 |     }
1335 |     return self;
1336 | }
1337 | 
1338 | /*  call-seq: event.destroy    ->    nil
1339 |  *
1340 |  *  Destroy the event _self_.
1341 |  */
1342 | static VALUE event_destroy(VALUE self)
1343 | {
1344 |     CUevent* p;
1345 |     Data_Get_Struct(self, CUevent, p);
1346 |     CUresult status = cuEventDestroy(*p);
1347 |     if (status != CUDA_SUCCESS) {
1348 |         RAISE_CU_STD_ERROR(status, "Failed to destroy event.");
1349 |     }
1350 |     return Qnil;
1351 | }
1352 | 
1353 | /*  call-seq: event.query    ->    true or false
1354 |  *
1355 |  *  Return true if _self_ has been recorded. Otherwise, return false.
1356 |  */
1357 | static VALUE event_query(VALUE self)
1358 | {
1359 |     CUevent* p;
1360 |     Data_Get_Struct(self, CUevent, p);
1361 |     CUresult status = cuEventQuery(*p);
1362 |     if (status == CUDA_SUCCESS) {
1363 |         return Qtrue;
1364 |     } else if (status == CUDA_ERROR_NOT_READY) {
1365 |         return Qfalse;
1366 |     } else if (status == CUDA_ERROR_INVALID_VALUE) {
1367 |         RAISE_CU_STD_ERROR(status, "Failed to query event: cuEventRecord() has not been called on this event.");
1368 |     } else {
1369 |         RAISE_CU_STD_ERROR(status, "Failed to query event.");
1370 |     }
1371 | }
1372 | 
1373 | /*  call-seq: event.record(stream)    ->    self
1374 |  *
1375 |  *  Record event _self_ asynchronously in _stream_.
1376 |  *  Setting _stream_ to anything other than an instance of CUStream will record on the default stream 0.
1377 |  */
1378 | static VALUE event_record(VALUE self, VALUE rb_stream)
1379 | {
1380 |     CUevent* pevent = NULL;
1381 |     CUstream* pstream = NULL;
1382 |     CUresult status;
1383 |     Data_Get_Struct(self, CUevent, pevent);
1384 |     if (CLASS_OF(rb_stream) == rb_cCUStream) {
1385 |         Data_Get_Struct(rb_stream, CUstream, pstream);
1386 |         status = cuEventRecord(*pevent, *pstream);
1387 |     } else {
1388 |         status = cuEventRecord(*pevent, 0);
1389 |     }
1390 |     if (status == CUDA_ERROR_INVALID_VALUE) {
1391 |         RAISE_CU_STD_ERROR(status, "Failed to record event: cuEventRecord() has been called and has not been recorded yet.");
1392 |     } else if (status != CUDA_SUCCESS) {
1393 |         RAISE_CU_STD_ERROR(status, "Failed to record event.");
1394 |     }
1395 |     return self;
1396 | }
1397 | 
1398 | /*  call-seq: event.synchronize    ->    self
1399 |  *
1400 |  *  Block until _self_ has been recorded.
1401 |  */
1402 | static VALUE event_synchronize(VALUE self)
1403 | {
1404 |     CUevent* p;
1405 |     Data_Get_Struct(self, CUevent, p);
1406 |     CUresult status = cuEventSynchronize(*p);
1407 |     // TODO: Handle status == CUDA_ERROR_INVALID_VALUE
1408 |     if (status != CUDA_SUCCESS) {
1409 |         RAISE_CU_STD_ERROR(status, "Failed to synchronize event.");
1410 |     }
1411 |     return self;
1412 | }
1413 | 
1414 | /*  call-seq: event.elapsed_time(event_start, event_end)    ->    Numeric
1415 |  *
1416 |  *  Return the elapsed time (ms) from _event_start_ (CUEvent) to _event_end_ (CUEvent).
1417 |  */
1418 | static VALUE event_elapsed_time(VALUE klass, VALUE event_start, VALUE event_end)
1419 | {
1420 |     CUevent* pevent_start;
1421 |     CUevent* pevent_end;
1422 |     Data_Get_Struct(event_start, CUevent, pevent_start);
1423 |     Data_Get_Struct(event_end, CUevent, pevent_end);
1424 |     float etime;
1425 |     CUresult status = cuEventElapsedTime(&etime, *pevent_start, *pevent_end);
1426 |     if (status == CUDA_ERROR_NOT_READY) {
1427 |         RAISE_CU_STD_ERROR(status, "Failed to get elapsed time of events: either event has not been recorded yet.");
1428 |     } else if (status != CUDA_SUCCESS) {
1429 |         RAISE_CU_STD_ERROR(status, "Failed to get elapsed time of events.");
1430 |     }
1431 |     return DBL2NUM(etime);
1432 | }
1433 | 
1434 | // }}}
1435 | 
1436 | 
1437 | // {{{ CUtexref
1438 | 
1439 | static VALUE texref_alloc(VALUE klass)
1440 | {
1441 |     CUtexref* p = new CUtexref;
1442 |     return Data_Wrap_Struct(klass, 0, generic_free<CUtexref>, p);
1443 | }
1444 | 
1445 | static VALUE texref_initialize(VALUE self)
1446 | {
1447 |     return self;
1448 | }
1449 | 
1450 | /*  call-seq: texref.create    ->    self
1451 |  *
1452 |  *  Create a texture reference and set _self_ to this texture reference.
1453 |  *
1454 |  *  Note: This method is *deprecated*.
1455 |  */
1456 | static VALUE texref_create(VALUE self)
1457 | {
1458 |     rb_warn("CUTexRef#create is deprecated.");
1459 |     CUtexref* p;
1460 |     Data_Get_Struct(self, CUtexref, p);
1461 |     CUresult status = cuTexRefCreate(p);
1462 |     if (status != CUDA_SUCCESS) {
1463 |         RAISE_CU_STD_ERROR(status, "Failed to create texture.");
1464 |     }
1465 |     return self;
1466 | }
1467 | 
1468 | /*  call-seq: texref.destroy    ->    nil
1469 |  *
1470 |  *  Destroy the texture reference _self_.
1471 |  *
1472 |  *  Note: This method is *deprecated*.
1473 |  */
1474 | static VALUE texref_destroy(VALUE self)
1475 | {
1476 |     rb_warn("CUTexRef#destroy is deprecated.");
1477 |     CUtexref* p;
1478 |     Data_Get_Struct(self, CUtexref, p);
1479 |     CUresult status = cuTexRefDestroy(*p);
1480 |     if (status != CUDA_SUCCESS) {
1481 |         RAISE_CU_STD_ERROR(status, "Failed to destroy texture.");
1482 |     }
1483 |     return Qnil;
1484 | }
1485 | 
1486 | /*  call-seq: texref.get_address    ->    CUDevicePtr
1487 |  *
1488 |  *  Return a CUDevicePtr instance bound to the texture reference.
1489 |  */
1490 | static VALUE texref_get_address(VALUE self)
1491 | {
1492 |     CUtexref* ptexref;
1493 |     CUdeviceptr* pdevptr;
1494 |     Data_Get_Struct(self, CUtexref, ptexref);
1495 |     VALUE rb_devptr = rb_class_new_instance(0, NULL, rb_cCUDevicePtr);
1496 |     Data_Get_Struct(rb_devptr, CUdeviceptr, pdevptr);
1497 |     CUresult status = cuTexRefGetAddress(pdevptr, *ptexref);
1498 |     if (status != CUDA_SUCCESS) {
1499 |         RAISE_CU_STD_ERROR(status, "Failed to get texture address.");
1500 |     }
1501 |     return rb_devptr;
1502 | }
1503 | 
1504 | /*  call-seq: texref.get_address_mode(dim)    ->    Fixnum
1505 |  *
1506 |  *  Return the address mode of the dimension _dim_ (0..2) of _self_.
1507 |  */
1508 | static VALUE texref_get_address_mode(VALUE self, VALUE dim)
1509 | {
1510 |     CUtexref* p;
1511 |     CUaddress_mode mode;
1512 |     Data_Get_Struct(self, CUtexref, p);
1513 |     CUresult status = cuTexRefGetAddressMode(&mode, *p, FIX2INT(dim));
1514 |     if (status != CUDA_SUCCESS) {
1515 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to get texture address mode: dim = %d.", FIX2INT(dim));
1516 |     }
1517 |     return INT2FIX(mode);
1518 | }
1519 | 
1520 | /*  call-seq: texref.get_filter_mode    ->    Fixnum
1521 |  *
1522 |  *  Return the filter mode of _self_.
1523 |  */
1524 | static VALUE texref_get_filter_mode(VALUE self)
1525 | {
1526 |     CUtexref* p;
1527 |     CUfilter_mode mode;
1528 |     Data_Get_Struct(self, CUtexref, p);
1529 |     CUresult status = cuTexRefGetFilterMode(&mode, *p);
1530 |     if (status != CUDA_SUCCESS) {
1531 |         RAISE_CU_STD_ERROR(status, "Failed to get texture filter mode.");
1532 |     }
1533 |     return INT2FIX(mode);
1534 | }
1535 | 
1536 | /*  call-seq: texref.get_flags    ->    Numeric
1537 |  *
1538 |  *  Return the flags of _self_.
1539 |  */
1540 | static VALUE texref_get_flags(VALUE self)
1541 | {
1542 |     CUtexref* p;
1543 |     unsigned int flags;
1544 |     Data_Get_Struct(self, CUtexref, p);
1545 |     CUresult status = cuTexRefGetFlags(&flags, *p);
1546 |     if (status != CUDA_SUCCESS) {
1547 |         RAISE_CU_STD_ERROR(status, "Failed to get texture flags.");
1548 |     }
1549 |     return UINT2NUM(flags);
1550 | }
1551 | 
1552 | /*  call-seq: texref.set_address(devptr, nbytes)    ->    Numeric
1553 |  *
1554 |  *  Bind _devptr_ (CUDevicePtr) with _nbytes_ to _self_.
1555 |  */
1556 | static VALUE texref_set_address(VALUE self, VALUE rb_device_ptr, VALUE nbytes)
1557 | {
1558 |     CUtexref* ptexref;
1559 |     CUdeviceptr* pdevptr;
1560 |     size_t offset;
1561 |     Data_Get_Struct(self, CUtexref, ptexref);
1562 |     Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevptr);
1563 |     CUresult status = cuTexRefSetAddress(&offset, *ptexref, *pdevptr, NUM2UINT(nbytes));
1564 |     if (status != CUDA_SUCCESS) {
1565 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture address: nbytes = %u.", NUM2UINT(nbytes));
1566 |     }
1567 |     return SIZET2NUM(offset);
1568 | }
1569 | 
1570 | /*  call-seq: texref.set_address_mode(dim, mode)    ->    self
1571 |  *
1572 |  *  Set the address mode of _self_ with _dim_ (0..2) and _mode_ (CUAddressMode).
1573 |  */
1574 | static VALUE texref_set_address_mode(VALUE self, VALUE dim, VALUE mode)
1575 | {
1576 |     CUtexref* p;
1577 |     Data_Get_Struct(self, CUtexref, p);
1578 |     CUresult status = cuTexRefSetAddressMode(*p, FIX2INT(dim), static_cast<CUaddress_mode>(FIX2INT(mode)));
1579 |     if (status != CUDA_SUCCESS) {
1580 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture address mode: dim = %d, mode = %d", FIX2INT(dim), FIX2INT(mode));
1581 |     }
1582 |     return self;
1583 | }
1584 | 
1585 | /*  call-seq: texref.set_filter_mode(mode)    ->    self
1586 |  *
1587 |  *  Set the filter mode of _self_ with _mode_ (CUFilterMode).
1588 |  */
1589 | static VALUE texref_set_filter_mode(VALUE self, VALUE mode)
1590 | {
1591 |     CUtexref* p;
1592 |     Data_Get_Struct(self, CUtexref, p);
1593 |     CUresult status = cuTexRefSetFilterMode(*p, static_cast<CUfilter_mode>(FIX2INT(mode)));
1594 |     if (status != CUDA_SUCCESS) {
1595 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture filter mode: mode = %d.", FIX2INT(mode));
1596 |     }
1597 |     return self;
1598 | }
1599 | 
1600 | /*  call-seq: texref.set_flags(flags)    ->    self
1601 |  *
1602 |  *  Set the _flags_ (CUTexRefFlags) of _self_.
1603 |  */
1604 | static VALUE texref_set_flags(VALUE self, VALUE flags)
1605 | {
1606 |     CUtexref* p;
1607 |     Data_Get_Struct(self, CUtexref, p);
1608 |     CUresult status = cuTexRefSetFlags(*p, NUM2UINT(flags));
1609 |     if (status != CUDA_SUCCESS) {
1610 |         RAISE_CU_STD_ERROR_FORMATTED(status, "Failed to set texture flags: flags = 0x%x.", NUM2UINT(flags));
1611 |     }
1612 |     return self;
1613 | }
1614 | 
1615 | // }}}
1616 | 
1617 | 
1618 | // {{{ Memory pointer
1619 | static VALUE memory_pointer_alloc(VALUE klass)
1620 | {
1621 |     MemoryPointer* ppointer = new MemoryPointer;
1622 |     ppointer->p = NULL;
1623 |     return Data_Wrap_Struct(klass, 0, generic_free<MemoryPointer>, ppointer);
1624 | }
1625 | 
1626 | static VALUE memory_pointer_initialize(VALUE self)
1627 | {
1628 |     return self;
1629 | }
1630 | // }}}
1631 | 
1632 | 
1633 | // {{{ Buffer
1634 | 
1635 | /*  call-seq: Buffer.new(size, options = {})    ->    Buffer
1636 |  *
1637 |  *  Create a buffer with _size_ elements.
1638 |  *
1639 |  *  Options:
1640 |  *  * _page_locked_ - Allocate page-locked memory if _:page_locked_ is true. Otherwise, allocate pageable memory.
1641 |  *
1642 |  *      Buffer.new(10)                       # Allocate 10 elements with pageable memory.
1643 |  *      Buffer.new(20, page_locked: true)    # Allocate 20 elements with page-locked memory.
1644 |  */
1645 | static VALUE ibuffer_initialize(int argc, VALUE* argv, VALUE self)
1646 | {
1647 |     // This function exists for documentation only.
1648 |     rb_notimplement();
1649 |     return Qnil;
1650 | }
1651 | 
1652 | /*  call-seq: Buffer.element_size
1653 |  *
1654 |  *  Return the size of an element of this Buffer in bytes.
1655 |  */
1656 | static VALUE ibuffer_element_size(VALUE klass)
1657 | {
1658 |     rb_notimplement();
1659 |     return Qnil;
1660 | }
1661 | 
1662 | /*  call-seq: buffer.size    ->    Numeric
1663 |  *
1664 |  *  Return the number of elements in this buffer.
1665 |  */
1666 | static VALUE ibuffer_size(VALUE self)
1667 | {
1668 |     rb_notimplement();
1669 |     return Qnil;
1670 | }
1671 | 
1672 | /*  call-seq: buffer.page_locked?    ->    true or false
1673 |  *
1674 |  *  Return true if this buffer is page-locked allocated.
1675 |  *  Otherwise, return false.
1676 |  */
1677 | static VALUE ibuffer_is_page_locked(VALUE self)
1678 | {
1679 |     rb_notimplement();
1680 |     return Qnil;
1681 | }
1682 | 
1683 | /*  call-seq: buffer.offset(index)    ->    MemoryPointer
1684 |  *
1685 |  *  Return the memory pointer of the element at _index_ (0...size) in this buffer.
1686 |  */
1687 | static VALUE ibuffer_offset(VALUE self, VALUE offset)
1688 | {
1689 |     rb_notimplement();
1690 |     return Qnil;
1691 | }
1692 | 
1693 | /*  call-seq: buffer[index]    ->    Object
1694 |  *
1695 |  *  Return the element at _index_ (0...size) in this buffer.
1696 |  */
1697 | static VALUE ibuffer_element_get(VALUE self, VALUE index)
1698 | {
1699 |     rb_notimplement();
1700 |     return Qnil;
1701 | }
1702 | 
1703 | /*  call-seq: buffer[index] = value    ->    Object
1704 |  *
1705 |  *  Set the element at _index_ (0...size) in this buffer to _value_.
1706 |  *  Return _value_.
1707 |  */
1708 | static VALUE ibuffer_element_set(VALUE self, VALUE index, VALUE value)
1709 | {
1710 |     rb_notimplement();
1711 |     return Qnil;
1712 | }
1713 | 
1714 | static void memory_buffer_free(void* p)
1715 | {
1716 |     MemoryBuffer* pbuffer = static_cast<MemoryBuffer*>(p);
1717 |     if (pbuffer->is_page_locked) {
1718 |         cuMemFreeHost(reinterpret_cast<void*>(pbuffer->p));
1719 |     } else {
1720 |         delete[] pbuffer->p;
1721 |     }
1722 |     delete pbuffer;
1723 | }
1724 | 
1725 | static VALUE memory_buffer_alloc(VALUE klass)
1726 | {
1727 |     MemoryBuffer* pbuffer = new MemoryBuffer;
1728 |     pbuffer->size = 0;
1729 |     pbuffer->is_page_locked = false;
1730 |     pbuffer->p = NULL;
1731 |     return Data_Wrap_Struct(klass, 0, memory_buffer_free, pbuffer);
1732 | }
1733 | 
1734 | static VALUE memory_buffer_element_size(VALUE klass)
1735 | {
1736 |     return INT2FIX(1);
1737 | }
1738 | 
1739 | static VALUE memory_buffer_initialize(int argc, VALUE* argv, VALUE self)
1740 | {
1741 |     if (argc < 1 || argc > 2) {
1742 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc);
1743 |     }
1744 | 
1745 |     bool use_page_locked = false;
1746 |     size_t nbytes = NUM2SIZET(argv[0]);
1747 |     if (argc == 2 && CLASS_OF(argv[1]) == rb_cHash) {
1748 |         if (rb_hash_aref(argv[1], ID2SYM(rb_intern("page_locked"))) == Qtrue) {
1749 |             use_page_locked = true;
1750 |         }
1751 |     }
1752 | 
1753 |     MemoryBuffer* pbuffer;
1754 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1755 |     pbuffer->size = nbytes;
1756 |     if (use_page_locked) {
1757 |         CUresult status = cuMemAllocHost(reinterpret_cast<void**>(&pbuffer->p), nbytes);
1758 |         if (status != CUDA_SUCCESS) {
1759 |             RAISE_CU_STD_ERROR(status, "Failed to allocate page-locked host memory.");
1760 |         }
1761 |         pbuffer->is_page_locked = true;
1762 |     } else {
1763 |         pbuffer->p = new char[nbytes];
1764 |         pbuffer->is_page_locked = false;
1765 |     }
1766 |     std::memset(static_cast<void*>(pbuffer->p), 0, pbuffer->size);
1767 |     return self;
1768 | }
1769 | 
1770 | static VALUE memory_buffer_size(VALUE self)
1771 | {
1772 |     MemoryBuffer* pbuffer;
1773 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1774 |     return SIZET2NUM(pbuffer->size);
1775 | }
1776 | 
1777 | static VALUE memory_buffer_is_page_locked(VALUE self)
1778 | {
1779 |     MemoryBuffer* pbuffer;
1780 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1781 |     return to_rb(pbuffer->is_page_locked);
1782 | }
1783 | 
1784 | static VALUE memory_buffer_offset(VALUE self, VALUE offset)
1785 | {
1786 |     MemoryBuffer* pbuffer;
1787 |     MemoryPointer* ppointer_offset;
1788 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1789 |     VALUE rb_ppointer_offset = rb_class_new_instance(0, NULL, rb_cMemoryPointer);
1790 |     Data_Get_Struct(rb_ppointer_offset, MemoryPointer, ppointer_offset);
1791 |     ppointer_offset->p = pbuffer->p + NUM2SIZET(offset);
1792 |     return rb_ppointer_offset;
1793 | }
1794 | 
1795 | static VALUE memory_buffer_element_get(VALUE self, VALUE index)
1796 | {
1797 |     size_t i = NUM2SIZET(index);
1798 |     MemoryBuffer* pbuffer;
1799 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1800 |     int element = static_cast<int>(pbuffer->p[i]);
1801 |     return to_rb(element);
1802 | }
1803 | 
1804 | static VALUE memory_buffer_element_set(VALUE self, VALUE index, VALUE value)
1805 | {
1806 |     size_t i = NUM2SIZET(index);
1807 |     MemoryBuffer* pbuffer;
1808 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1809 |     pbuffer->p[i] = static_cast<char>(FIX2INT(value));
1810 |     return value;
1811 | }
1812 | 
1813 | template <typename TElement>
1814 | static void buffer_free(void* p)
1815 | {
1816 |     typedef struct TypedBuffer<TElement> TBuffer;
1817 |     TBuffer* pbuffer = static_cast<TBuffer*>(p);
1818 |     if (pbuffer->is_page_locked) {
1819 |         cuMemFreeHost(reinterpret_cast<void*>(pbuffer->p));
1820 |     } else {
1821 |         delete[] pbuffer->p;
1822 |     }
1823 |     delete pbuffer;
1824 | }
1825 | 
1826 | template <typename TElement>
1827 | static VALUE buffer_alloc(VALUE klass)
1828 | {
1829 |     typedef struct TypedBuffer<TElement> TBuffer;
1830 |     TBuffer* pbuffer = new TBuffer;
1831 |     pbuffer->size = 0;
1832 |     pbuffer->p = NULL;
1833 |     return Data_Wrap_Struct(klass, 0, &buffer_free<TElement>, pbuffer);
1834 | }
1835 | 
1836 | template <typename TElement>
1837 | static VALUE buffer_element_size(VALUE klass)
1838 | {
1839 |     return INT2FIX(sizeof(TElement));
1840 | }
1841 | typedef VALUE (*BufferElementSizeFunctionType)(VALUE);
1842 | 
1843 | template <typename TElement>
1844 | static VALUE buffer_initialize(int argc, VALUE* argv, VALUE self)
1845 | {
1846 |     if (argc <= 0 || argc >= 3) {
1847 |         rb_raise(rb_eArgError, "wrong number of arguments (%d for 1 or 2).", argc);
1848 |     }
1849 | 
1850 |     bool use_page_locked = false;
1851 |     VALUE n = NUM2SIZET(argv[0]);
1852 |     if (argc == 2 && CLASS_OF(argv[1]) == rb_cHash) {
1853 |         if (rb_hash_aref(argv[1], ID2SYM(rb_intern("page_locked"))) == Qtrue) {
1854 |             use_page_locked = true;
1855 |         }
1856 |     }
1857 | 
1858 |     typedef struct TypedBuffer<TElement> TBuffer;
1859 |     TBuffer* pbuffer;
1860 |     Data_Get_Struct(self, TBuffer, pbuffer);
1861 |     pbuffer->size = n*sizeof(TElement);
1862 |     if (use_page_locked) {
1863 |         CUresult status = cuMemAllocHost(reinterpret_cast<void**>(&pbuffer->p), n*sizeof(TElement));
1864 |         if (status != CUDA_SUCCESS) {
1865 |             RAISE_CU_STD_ERROR(status, "Failed to allocate page-locked host memory.");
1866 |         }
1867 |         pbuffer->is_page_locked = true;
1868 |     } else {
1869 |         pbuffer->p = reinterpret_cast<char*>(new TElement[n]);
1870 |         pbuffer->is_page_locked = false;
1871 |     }
1872 |     std::memset(static_cast<void*>(pbuffer->p), 0, pbuffer->size);
1873 |     return self;
1874 | }
1875 | typedef VALUE (*BufferInitializeFunctionType)(int, VALUE*, VALUE);
1876 | 
1877 | template <typename TElement>
1878 | static VALUE buffer_size(VALUE self)
1879 | {
1880 |     MemoryBuffer* pbuffer;
1881 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1882 |     return SIZET2NUM(pbuffer->size / sizeof(TElement));
1883 | }
1884 | typedef VALUE (*BufferSizeFunctionType)(VALUE);
1885 | 
1886 | template <typename TElement>
1887 | static VALUE buffer_is_page_locked(VALUE self)
1888 | {
1889 |     MemoryBuffer* pbuffer;
1890 |     Data_Get_Struct(self, MemoryBuffer, pbuffer);
1891 |     return to_rb(pbuffer->is_page_locked);
1892 | }
1893 | typedef VALUE (*BufferIsPageLocked)(VALUE);
1894 | 
1895 | template <typename TElement>
1896 | static VALUE buffer_offset(VALUE self, VALUE offset)
1897 | {
1898 |     typedef struct TypedBuffer<TElement> TBuffer;
1899 |     TBuffer* pbuffer;
1900 |     MemoryPointer* ppointer_offset;
1901 |     Data_Get_Struct(self, TBuffer, pbuffer);
1902 |     VALUE rb_ppointer_offset = rb_class_new_instance(0, NULL, rb_cMemoryPointer);
1903 |     Data_Get_Struct(rb_ppointer_offset, MemoryPointer, ppointer_offset);
1904 |     ppointer_offset->p = pbuffer->p + NUM2SIZET(offset)*sizeof(TElement);
1905 |     return rb_ppointer_offset;
1906 | }
1907 | typedef VALUE (*BufferOffsetFunctionType)(VALUE, VALUE);
1908 | 
1909 | template <typename TElement>
1910 | static VALUE buffer_element_get(VALUE self, VALUE index)
1911 | {
1912 |     typedef struct TypedBuffer<TElement> TBuffer;
1913 |     size_t i = NUM2SIZET(index);
1914 |     TBuffer* pbuffer;
1915 |     Data_Get_Struct(self, TBuffer, pbuffer);
1916 |     TElement* e = reinterpret_cast<TElement*>(pbuffer->p);
1917 |     TElement element = e[i];
1918 |     return to_rb<TElement>(element);
1919 | }
1920 | typedef VALUE (*BufferElementGetFunctionType)(VALUE, VALUE);
1921 | 
1922 | template <typename TElement>
1923 | static VALUE buffer_element_set(VALUE self, VALUE index, VALUE value)
1924 | {
1925 |     typedef struct TypedBuffer<TElement> TBuffer;
1926 |     size_t i = NUM2SIZET(index);
1927 |     TElement v = to_ctype<TElement>(value);
1928 |     TBuffer* pbuffer;
1929 |     Data_Get_Struct(self, TBuffer, pbuffer);
1930 |     TElement* e = reinterpret_cast<TElement*>(pbuffer->p);
1931 |     e[i] = v;
1932 |     return value;
1933 | }
1934 | typedef VALUE (*BufferElementSetFunctionType)(VALUE, VALUE, VALUE);
1935 | 
1936 | // }}}
1937 | 
1938 | 
1939 | // {{{ Memory
1940 | 
1941 | /*  call-seq: memcpy_htod(dst_devptr, src_mem, nbytes)    ->    nil
1942 |  *
1943 |  *  Copy _nbytes_ from host memory at _src_mem_ to device memory at _dst_devptr_.
1944 |  */
1945 | static VALUE memcpy_htod(VALUE self, VALUE rb_device_ptr, VALUE rb_memory, VALUE nbytes)
1946 | {
1947 |     CUdeviceptr* pdevice_ptr;
1948 |     MemoryPointer* pmem;
1949 |     Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr);
1950 |     Data_Get_Struct(rb_memory, MemoryPointer, pmem);
1951 |     CUresult status = cuMemcpyHtoD(*pdevice_ptr, static_cast<void*>(pmem->p), NUM2UINT(nbytes));
1952 |     if (status != CUDA_SUCCESS) {
1953 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory from host to device.");
1954 |     }
1955 |     return Qnil;
1956 | }
1957 | 
1958 | /*  call-seq: memcpy_htod_async(dst_devptr, src_mem, nbytes, stream)    ->    nil
1959 |  *
1960 |  *  Copy _nbytes_ from host memory at _src_mem_ to device memory at _dst_devptr_ in _stream_ asynchronously.
1961 |  *
1962 |  *  Note: The _src_mem_ should be *page-locked* memory.
1963 |  */
1964 | static VALUE memcpy_htod_async(VALUE self, VALUE rb_device_ptr, VALUE rb_memory, VALUE nbytes, VALUE rb_stream)
1965 | {
1966 |     CUdeviceptr* pdevice_ptr;
1967 |     MemoryPointer* pmem;
1968 |     CUstream* pstream;
1969 |     CUstream stream0 = 0;
1970 |     Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr);
1971 |     Data_Get_Struct(rb_memory, MemoryPointer, pmem);
1972 |     if (CLASS_OF(rb_stream) == rb_cCUStream) {
1973 |         Data_Get_Struct(rb_stream, CUstream, pstream);
1974 |     } else {
1975 |         pstream = &stream0;
1976 |     }
1977 |     CUresult status = cuMemcpyHtoDAsync(*pdevice_ptr, static_cast<void*>(pmem->p), NUM2UINT(nbytes), *pstream);
1978 |     if (status != CUDA_SUCCESS) {
1979 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from host to device.");
1980 |     }
1981 |     return Qnil;
1982 | }
1983 | 
1984 | /*  call-seq: memcpy_dtoh(dst_mem, src_devptr, nbytes)    ->    nil
1985 |  *
1986 |  *  Copy _nbytes_ from device memory at _src_devptr_ to host memory at _dst_mem_.
1987 |  */
1988 | static VALUE memcpy_dtoh(VALUE self, VALUE rb_memory, VALUE rb_device_ptr, VALUE nbytes)
1989 | {
1990 |     MemoryPointer* pmem;
1991 |     CUdeviceptr* pdevice_ptr;
1992 |     Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr);
1993 |     Data_Get_Struct(rb_memory, MemoryPointer, pmem);
1994 |     CUresult status = cuMemcpyDtoH(static_cast<void*>(pmem->p), *pdevice_ptr, NUM2UINT(nbytes));
1995 |     if (status != CUDA_SUCCESS) {
1996 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory from device to host.");
1997 |     }
1998 |     return Qnil;
1999 | }
2000 | 
2001 | /*  call-seq: memcpy_dtoh_async(dst_mem, src_devptr, nbytes, stream)    ->    nil
2002 |  *
2003 |  *  Copy _nbytes_ from device memory at _src_devptr_ to host memory at _dst_mem_ in _stream_ asynchronously.
2004 |  *
2005 |  *  Note: The _dst_mem_ should be *page-locked* memory.
2006 |  */
2007 | static VALUE memcpy_dtoh_async(VALUE self, VALUE rb_memory, VALUE rb_device_ptr, VALUE nbytes, VALUE rb_stream)
2008 | {
2009 |     MemoryPointer* pmem;
2010 |     CUdeviceptr* pdevice_ptr;
2011 |     CUstream* pstream;
2012 |     CUstream stream0 = 0;
2013 |     Data_Get_Struct(rb_device_ptr, CUdeviceptr, pdevice_ptr);
2014 |     Data_Get_Struct(rb_memory, MemoryPointer, pmem);
2015 |     if (CLASS_OF(rb_stream) == rb_cCUStream) {
2016 |         Data_Get_Struct(rb_stream, CUstream, pstream);
2017 |     } else {
2018 |         pstream = &stream0;
2019 |     }
2020 |     CUresult status = cuMemcpyDtoHAsync(static_cast<void*>(pmem->p), *pdevice_ptr, NUM2UINT(nbytes), *pstream);
2021 |     if (status != CUDA_SUCCESS) {
2022 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from device to host.");
2023 |     }
2024 |     return Qnil;
2025 | }
2026 | 
2027 | /*  call-seq: memcpy_dtod(dst_devptr, src_devptr, nbytes)    ->    nil
2028 |  *
2029 |  *  Copy _nbytes_ from device memory at _src_devptr_ to device memory at _dst_devptr_ asynchronously.
2030 |  */
2031 | static VALUE memcpy_dtod(VALUE self, VALUE rb_device_ptr_dst, VALUE rb_device_ptr_src, VALUE nbytes)
2032 | {
2033 |     CUdeviceptr* dst;
2034 |     CUdeviceptr* src;
2035 |     Data_Get_Struct(rb_device_ptr_dst, CUdeviceptr, dst);
2036 |     Data_Get_Struct(rb_device_ptr_src, CUdeviceptr, src);
2037 |     CUresult status = cuMemcpyDtoD(*dst, *src, NUM2UINT(nbytes));
2038 |     if (status != CUDA_SUCCESS) {
2039 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory from device to device.");
2040 |     }
2041 |     return Qnil;
2042 | }
2043 | 
2044 | /*  call-seq: memcpy_dtod_async(dst_devptr, src_devptr, nbytes, stream)    ->    nil
2045 |  *
2046 |  *  Copy _nbytes_ from device memory at _src_devptr_ to device memory at _dst_devptr_ in _stream_ asynchronously.
2047 |  */
2048 | static VALUE memcpy_dtod_async(VALUE self, VALUE rb_device_ptr_dst, VALUE rb_device_ptr_src, VALUE nbytes, VALUE rb_stream)
2049 | {
2050 |     CUdeviceptr* dst;
2051 |     CUdeviceptr* src;
2052 |     CUstream *pstream;
2053 |     CUstream stream0 = 0;
2054 |     Data_Get_Struct(rb_device_ptr_dst, CUdeviceptr, dst);
2055 |     Data_Get_Struct(rb_device_ptr_src, CUdeviceptr, src);
2056 |     if (CLASS_OF(rb_stream) == rb_cCUStream) {
2057 |         Data_Get_Struct(rb_stream, CUstream, pstream);
2058 |     } else {
2059 |         pstream = &stream0;
2060 |     }
2061 |     CUresult status = cuMemcpyDtoDAsync(*dst, *src, NUM2UINT(nbytes), *pstream);
2062 |     if (status != CUDA_SUCCESS) {
2063 |         RAISE_CU_STD_ERROR(status, "Failed to copy memory asynchronously from device to device.");
2064 |     }
2065 |     return Qnil;
2066 | }
2067 | 
2068 | /*  call-seq: mem_get_info    ->    Hash { free:, total: }
2069 |  *
2070 |  *  Return a hash { free:, total: } with the amount of free and total device memory in bytes.
2071 |  */
2072 | static VALUE mem_get_info(VALUE self)
2073 | {
2074 |     size_t free_memory;
2075 |     size_t total_memory;
2076 |     CUresult status = cuMemGetInfo(&free_memory, &total_memory);
2077 |     if (status != CUDA_SUCCESS) {
2078 |         RAISE_CU_STD_ERROR(status, "Failed to get memory information.");
2079 |     }
2080 |     VALUE h = rb_hash_new();
2081 |     rb_hash_aset(h, ID2SYM(rb_intern("free")), UINT2NUM(free_memory));
2082 |     rb_hash_aset(h, ID2SYM(rb_intern("total")), UINT2NUM(total_memory));
2083 |     return h;
2084 | }
2085 | 
2086 | // }}}
2087 | 
2088 | 
2089 | // {{{ Driver
2090 | 
2091 | /*  call-seq: driver_get_version    ->    Fixnum
2092 |  *
2093 |  *  Return the version number of the installed CUDA driver.
2094 |  */
2095 | static VALUE driver_get_version()
2096 | {
2097 |     int v;
2098 |     cuDriverGetVersion(&v);
2099 |     return INT2FIX(v);
2100 | }
2101 | 
2102 | // }}}
2103 | 
2104 | 
2105 | // {{{ Doc
2106 | 
2107 | /*  Document-class: SGC::CU::MemoryBuffer
2108 |  *  See IBuffer and IBuffer::ClassMethods.
2109 |  *
2110 |  *  Note: ELEMENT_SIZE is *deprecated*. Use MemoryBuffer.element_size.
2111 |  */
2112 | 
2113 | /*  Document-class: SGC::CU::Int32Buffer
2114 |  *  See IBuffer and IBuffer::ClassMethods.
2115 |  *
2116 |  *  Note: ELEMENT_SIZE is *deprecated*. Use Int32Buffer.element_size.
2117 |  */
2118 | 
2119 | /*  Document-class: SGC::CU::Int64Buffer
2120 |  *  See IBuffer and IBuffer::ClassMethods.
2121 |  *
2122 |  *  Note: ELEMENT_SIZE is *deprecated*. Use Int64Buffer.element_size.
2123 |  */
2124 | 
2125 | /*  Document-class: SGC::CU::Float32Buffer
2126 |  *  See IBuffer and IBuffer::ClassMethods.
2127 |  *
2128 |  *  Note: ELEMENT_SIZE is *deprecated*. Use Float32Buffer.element_size.
2129 |  */
2130 | 
2131 | /*  Document-class: SGC::CU::Float64Buffer
2132 |  *  See IBuffer and IBuffer::ClassMethods.
2133 |  *
2134 |  *  Note: ELEMENT_SIZE is *deprecated*. Use Float64Buffer.element_size.
2135 |  */
2136 | 
2137 |  // }}}
2138 | 
2139 | 
2140 | extern "C" void Init_rubycu()
2141 | {
2142 |     rb_mSGC = rb_define_module("SGC");
2143 |     rb_mCU  = rb_define_module_under(rb_mSGC, "CU");
2144 | 
2145 |     rb_cCUDevice = rb_define_class_under(rb_mCU, "CUDevice", rb_cObject);
2146 |     rb_define_singleton_method(rb_cCUDevice, "get_count", RUBY_METHOD_FUNC(device_get_count), 0);
2147 |     rb_define_singleton_method(rb_cCUDevice, "get", RUBY_METHOD_FUNC(device_get), 1);
2148 |     rb_define_alloc_func(rb_cCUDevice, device_alloc);
2149 |     rb_define_method(rb_cCUDevice, "initialize", RUBY_METHOD_FUNC(device_initialize), -1);
2150 |     rb_define_method(rb_cCUDevice, "get_name", RUBY_METHOD_FUNC(device_get_name), 0);
2151 |     rb_define_method(rb_cCUDevice, "compute_capability", RUBY_METHOD_FUNC(device_compute_capability), 0);
2152 |     rb_define_method(rb_cCUDevice, "get_attribute", RUBY_METHOD_FUNC(device_get_attribute), 1);
2153 |     rb_define_method(rb_cCUDevice, "get_properties", RUBY_METHOD_FUNC(device_get_properties), 0);
2154 |     rb_define_method(rb_cCUDevice, "total_mem", RUBY_METHOD_FUNC(device_total_mem), 0);
2155 | 
2156 |     rb_cCUComputeMode = rb_define_class_under(rb_mCU, "CUComputeMode", rb_cObject);
2157 |     rb_define_const(rb_cCUComputeMode, "DEFAULT", INT2FIX(CU_COMPUTEMODE_DEFAULT));
2158 |     rb_define_const(rb_cCUComputeMode, "EXCLUSIVE", INT2FIX(CU_COMPUTEMODE_EXCLUSIVE));
2159 |     rb_define_const(rb_cCUComputeMode, "PROHIBITED", INT2FIX(CU_COMPUTEMODE_PROHIBITED));
2160 | 
2161 |     rb_cCUDeviceAttribute = rb_define_class_under(rb_mCU, "CUDeviceAttribute", rb_cObject);
2162 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_THREADS_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK));
2163 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_X", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X));
2164 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_Y", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y));
2165 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_BLOCK_DIM_Z", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z));
2166 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_X", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X));
2167 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_Y", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y));
2168 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_GRID_DIM_Z", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z));
2169 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_REGISTERS_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK));
2170 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_SHARED_MEMORY_PER_BLOCK", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK));
2171 |     rb_define_const(rb_cCUDeviceAttribute, "TOTAL_CONSTANT_MEMORY", INT2FIX(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY));
2172 |     rb_define_const(rb_cCUDeviceAttribute, "WARP_SIZE", INT2FIX(CU_DEVICE_ATTRIBUTE_WARP_SIZE));
2173 |     rb_define_const(rb_cCUDeviceAttribute, "MAX_PITCH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAX_PITCH));
2174 |     rb_define_const(rb_cCUDeviceAttribute, "CLOCK_RATE", INT2FIX(CU_DEVICE_ATTRIBUTE_CLOCK_RATE));
2175 |     rb_define_const(rb_cCUDeviceAttribute, "TEXTURE_ALIGNMENT", INT2FIX(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT));
2176 |     rb_define_const(rb_cCUDeviceAttribute, "GPU_OVERLAP", INT2FIX(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP));
2177 |     rb_define_const(rb_cCUDeviceAttribute, "MULTIPROCESSOR_COUNT", INT2FIX(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT));
2178 |     rb_define_const(rb_cCUDeviceAttribute, "KERNEL_EXEC_TIMEOUT", INT2FIX(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT));
2179 |     rb_define_const(rb_cCUDeviceAttribute, "INTEGRATED", INT2FIX(CU_DEVICE_ATTRIBUTE_INTEGRATED));
2180 |     rb_define_const(rb_cCUDeviceAttribute, "CAN_MAP_HOST_MEMORY", INT2FIX(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY));
2181 |     rb_define_const(rb_cCUDeviceAttribute, "COMPUTE_MODE", INT2FIX(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE));
2182 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE1D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH));
2183 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH));
2184 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH));
2185 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT));
2186 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT));
2187 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE3D_DEPTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH));
2188 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_WIDTH", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH));
2189 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT));
2190 |     rb_define_const(rb_cCUDeviceAttribute, "MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", INT2FIX(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES));
2191 |     rb_define_const(rb_cCUDeviceAttribute, "SURFACE_ALIGNMENT", INT2FIX(CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT));
2192 |     rb_define_const(rb_cCUDeviceAttribute, "CONCURRENT_KERNELS", INT2FIX(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS));
2193 |     rb_define_const(rb_cCUDeviceAttribute, "ECC_ENABLED", INT2FIX(CU_DEVICE_ATTRIBUTE_ECC_ENABLED));
2194 |     rb_define_const(rb_cCUDeviceAttribute, "PCI_BUS_ID", INT2FIX(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID));
2195 |     rb_define_const(rb_cCUDeviceAttribute, "PCI_DEVICE_ID", INT2FIX(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID));
2196 |     rb_define_const(rb_cCUDeviceAttribute, "TCC_DRIVER", INT2FIX(CU_DEVICE_ATTRIBUTE_TCC_DRIVER));
2197 | 
2198 |     rb_cCUContext = rb_define_class_under(rb_mCU, "CUContext", rb_cObject);
2199 |     rb_define_alloc_func(rb_cCUContext, context_alloc);
2200 |     rb_define_method(rb_cCUContext, "initialize", RUBY_METHOD_FUNC(context_initialize), -1);
2201 |     rb_define_method(rb_cCUContext, "create", RUBY_METHOD_FUNC(context_create), -1);
2202 |     rb_define_method(rb_cCUContext, "destroy", RUBY_METHOD_FUNC(context_destroy), 0);
2203 |     rb_define_method(rb_cCUContext, "attach", RUBY_METHOD_FUNC(context_attach), -1);
2204 |     rb_define_method(rb_cCUContext, "detach", RUBY_METHOD_FUNC(context_detach), 0);
2205 |     rb_define_method(rb_cCUContext, "push_current", RUBY_METHOD_FUNC(context_push_current), 0);
2206 |     rb_define_method(rb_cCUContext, "get_api_version", RUBY_METHOD_FUNC(context_get_api_version), 0);
2207 |     rb_define_singleton_method(rb_cCUContext, "get_device", RUBY_METHOD_FUNC(context_get_device), 0);
2208 |     rb_define_singleton_method(rb_cCUContext, "get_limit", RUBY_METHOD_FUNC(context_get_limit), 1);
2209 |     rb_define_singleton_method(rb_cCUContext, "set_limit", RUBY_METHOD_FUNC(context_set_limit), 2);
2210 |     rb_define_singleton_method(rb_cCUContext, "get_cache_config", RUBY_METHOD_FUNC(context_get_cache_config), 0);
2211 |     rb_define_singleton_method(rb_cCUContext, "set_cache_config", RUBY_METHOD_FUNC(context_set_cache_config), 1);
2212 |     rb_define_singleton_method(rb_cCUContext, "get_api_version", RUBY_METHOD_FUNC(context_get_api_version_singleton), 0);
2213 |     rb_define_singleton_method(rb_cCUContext, "pop_current", RUBY_METHOD_FUNC(context_pop_current), 0);
2214 |     rb_define_singleton_method(rb_cCUContext, "synchronize", RUBY_METHOD_FUNC(context_synchronize), 0);
2215 | 
2216 |     rb_cCUContextFlags = rb_define_class_under(rb_mCU, "CUContextFlags", rb_cObject);
2217 |     rb_define_const(rb_cCUContextFlags, "SCHED_AUTO", INT2FIX(CU_CTX_SCHED_AUTO));
2218 |     rb_define_const(rb_cCUContextFlags, "SCHED_SPIN", INT2FIX(CU_CTX_SCHED_SPIN));
2219 |     rb_define_const(rb_cCUContextFlags, "SCHED_YIELD", INT2FIX(CU_CTX_SCHED_YIELD));
2220 |     rb_define_const(rb_cCUContextFlags, "BLOCKING_SYNC", INT2FIX(CU_CTX_BLOCKING_SYNC));
2221 |     rb_define_const(rb_cCUContextFlags, "MAP_HOST", INT2FIX(CU_CTX_MAP_HOST));
2222 |     rb_define_const(rb_cCUContextFlags, "LMEM_RESIZE_TO_MAX", INT2FIX(CU_CTX_LMEM_RESIZE_TO_MAX));
2223 | 
2224 |     rb_cCULimit = rb_define_class_under(rb_mCU, "CULimit", rb_cObject);
2225 |     rb_define_const(rb_cCULimit, "STACK_SIZE", INT2FIX(CU_LIMIT_STACK_SIZE));
2226 |     rb_define_const(rb_cCULimit, "PRINTF_FIFO_SIZE", INT2FIX(CU_LIMIT_PRINTF_FIFO_SIZE));
2227 |     rb_define_const(rb_cCULimit, "MALLOC_HEAP_SIZE", INT2FIX(CU_LIMIT_MALLOC_HEAP_SIZE));
2228 | 
2229 |     rb_cCUModule = rb_define_class_under(rb_mCU, "CUModule", rb_cObject);
2230 |     rb_define_alloc_func(rb_cCUModule, module_alloc);
2231 |     rb_define_method(rb_cCUModule, "initialize", RUBY_METHOD_FUNC(module_initialize), -1);
2232 |     rb_define_method(rb_cCUModule, "load", RUBY_METHOD_FUNC(module_load), 1);
2233 |     rb_define_method(rb_cCUModule, "load_data", RUBY_METHOD_FUNC(module_load_data), 1);
2234 |     rb_define_method(rb_cCUModule, "unload", RUBY_METHOD_FUNC(module_unload), 0);
2235 |     rb_define_method(rb_cCUModule, "get_function", RUBY_METHOD_FUNC(module_get_function), 1);
2236 |     rb_define_method(rb_cCUModule, "get_global", RUBY_METHOD_FUNC(module_get_global), 1);
2237 |     rb_define_method(rb_cCUModule, "get_texref", RUBY_METHOD_FUNC(module_get_texref), 1);
2238 | 
2239 |     rb_cCUDevicePtr = rb_define_class_under(rb_mCU, "CUDevicePtr", rb_cObject);
2240 |     rb_define_alloc_func(rb_cCUDevicePtr, device_ptr_alloc);
2241 |     rb_define_method(rb_cCUDevicePtr, "initialize", RUBY_METHOD_FUNC(device_ptr_initialize), -1);
2242 |     rb_define_method(rb_cCUDevicePtr, "offset", RUBY_METHOD_FUNC(device_ptr_offset), 1);
2243 |     rb_define_method(rb_cCUDevicePtr, "mem_alloc", RUBY_METHOD_FUNC(device_ptr_mem_alloc), 1);
2244 |     rb_define_method(rb_cCUDevicePtr, "mem_free", RUBY_METHOD_FUNC(device_ptr_mem_free), 0);
2245 | 
2246 |     rb_cCUFunction = rb_define_class_under(rb_mCU, "CUFunction", rb_cObject);
2247 |     rb_define_alloc_func(rb_cCUFunction, function_alloc);
2248 |     rb_define_method(rb_cCUFunction, "initialize", RUBY_METHOD_FUNC(function_initialize), -1);
2249 |     rb_define_method(rb_cCUFunction, "set_param", RUBY_METHOD_FUNC(function_set_param), -1);
2250 |     rb_define_method(rb_cCUFunction, "set_texref", RUBY_METHOD_FUNC(function_set_texref), 1);
2251 |     rb_define_method(rb_cCUFunction, "set_block_shape", RUBY_METHOD_FUNC(function_set_block_shape), -1);
2252 |     rb_define_method(rb_cCUFunction, "set_shared_size", RUBY_METHOD_FUNC(function_set_shared_size), 1);
2253 |     rb_define_method(rb_cCUFunction, "launch", RUBY_METHOD_FUNC(function_launch), 0);
2254 |     rb_define_method(rb_cCUFunction, "launch_grid", RUBY_METHOD_FUNC(function_launch_grid), -1);
2255 |     rb_define_method(rb_cCUFunction, "launch_grid_async", RUBY_METHOD_FUNC(function_launch_grid_async), -1);
2256 |     rb_define_method(rb_cCUFunction, "get_attribute", RUBY_METHOD_FUNC(function_get_attribute), 1);
2257 |     rb_define_method(rb_cCUFunction, "set_cache_config", RUBY_METHOD_FUNC(function_set_cache_config), 1);
2258 | 
2259 |     rb_cCUFunctionAttribute = rb_define_class_under(rb_mCU, "CUFunctionAttribute", rb_cObject);
2260 |     rb_define_const(rb_cCUFunctionAttribute, "MAX_THREADS_PER_BLOCK", INT2FIX(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK));
2261 |     rb_define_const(rb_cCUFunctionAttribute, "SHARED_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES));
2262 |     rb_define_const(rb_cCUFunctionAttribute, "CONST_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES));
2263 |     rb_define_const(rb_cCUFunctionAttribute, "LOCAL_SIZE_BYTES", INT2FIX(CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES));
2264 |     rb_define_const(rb_cCUFunctionAttribute, "NUM_REGS", INT2FIX(CU_FUNC_ATTRIBUTE_NUM_REGS));
2265 |     rb_define_const(rb_cCUFunctionAttribute, "PTX_VERSION", INT2FIX(CU_FUNC_ATTRIBUTE_PTX_VERSION));
2266 |     rb_define_const(rb_cCUFunctionAttribute, "BINARY_VERSION", INT2FIX(CU_FUNC_ATTRIBUTE_BINARY_VERSION));
2267 | 
2268 |     rb_cCUFunctionCache = rb_define_class_under(rb_mCU, "CUFunctionCache", rb_cObject);
2269 |     rb_define_const(rb_cCUFunctionCache, "PREFER_NONE", INT2FIX(CU_FUNC_CACHE_PREFER_NONE));
2270 |     rb_define_const(rb_cCUFunctionCache, "PREFER_SHARED", INT2FIX(CU_FUNC_CACHE_PREFER_SHARED));
2271 |     rb_define_const(rb_cCUFunctionCache, "PREFER_L1", INT2FIX(CU_FUNC_CACHE_PREFER_L1));
2272 | 
2273 |     rb_cCUStream = rb_define_class_under(rb_mCU, "CUStream", rb_cObject);
2274 |     rb_define_alloc_func(rb_cCUStream, stream_alloc);
2275 |     rb_define_method(rb_cCUStream, "initialize", RUBY_METHOD_FUNC(stream_initialize), 0);
2276 |     rb_define_method(rb_cCUStream, "create", RUBY_METHOD_FUNC(stream_create), -1);
2277 |     rb_define_method(rb_cCUStream, "destroy", RUBY_METHOD_FUNC(stream_destroy), 0);
2278 |     rb_define_method(rb_cCUStream, "query", RUBY_METHOD_FUNC(stream_query), 0);
2279 |     rb_define_method(rb_cCUStream, "synchronize", RUBY_METHOD_FUNC(stream_synchronize), 0);
2280 |     rb_define_method(rb_cCUStream, "wait_event", RUBY_METHOD_FUNC(stream_wait_event), -1);
2281 |     rb_define_singleton_method(rb_cCUStream, "wait_event", RUBY_METHOD_FUNC(stream_wait_event_singleton), -1);
2282 | 
2283 |     rb_cCUEvent = rb_define_class_under(rb_mCU, "CUEvent", rb_cObject);
2284 |     rb_define_alloc_func(rb_cCUEvent, event_alloc);
2285 |     rb_define_method(rb_cCUEvent, "initialize", RUBY_METHOD_FUNC(event_initialize), 0);
2286 |     rb_define_method(rb_cCUEvent, "create", RUBY_METHOD_FUNC(event_create), -1);
2287 |     rb_define_method(rb_cCUEvent, "destroy", RUBY_METHOD_FUNC(event_destroy), 0);
2288 |     rb_define_method(rb_cCUEvent, "query", RUBY_METHOD_FUNC(event_query), 0);
2289 |     rb_define_method(rb_cCUEvent, "record", RUBY_METHOD_FUNC(event_record), 1);
2290 |     rb_define_method(rb_cCUEvent, "synchronize", RUBY_METHOD_FUNC(event_synchronize), 0);
2291 |     rb_define_singleton_method(rb_cCUEvent, "elapsed_time", RUBY_METHOD_FUNC(event_elapsed_time), 2);
2292 | 
2293 |     rb_cCUEventFlags = rb_define_class_under(rb_mCU, "CUEventFlags", rb_cObject);
2294 |     rb_define_const(rb_cCUEventFlags, "DEFAULT", INT2FIX(CU_EVENT_DEFAULT));
2295 |     rb_define_const(rb_cCUEventFlags, "BLOCKING_SYNC", INT2FIX(CU_EVENT_BLOCKING_SYNC));
2296 |     rb_define_const(rb_cCUEventFlags, "DISABLE_TIMING", INT2FIX(CU_EVENT_DISABLE_TIMING));
2297 | 
2298 |     rb_cCUAddressMode = rb_define_class_under(rb_mCU, "CUAddressMode", rb_cObject);
2299 |     rb_define_const(rb_cCUAddressMode, "WRAP", INT2FIX(CU_TR_ADDRESS_MODE_WRAP));
2300 |     rb_define_const(rb_cCUAddressMode, "CLAMP", INT2FIX(CU_TR_ADDRESS_MODE_CLAMP));
2301 |     rb_define_const(rb_cCUAddressMode, "MIRROR", INT2FIX(CU_TR_ADDRESS_MODE_MIRROR));
2302 |     rb_define_const(rb_cCUAddressMode, "BORDER", INT2FIX(CU_TR_ADDRESS_MODE_BORDER));
2303 | 
2304 |     rb_cCUFilterMode = rb_define_class_under(rb_mCU, "CUFilterMode", rb_cObject);
2305 |     rb_define_const(rb_cCUFilterMode, "POINT", INT2FIX(CU_TR_FILTER_MODE_POINT));
2306 |     rb_define_const(rb_cCUFilterMode, "LINEAR", INT2FIX(CU_TR_FILTER_MODE_LINEAR));
2307 | 
2308 |     rb_cCUTexRefFlags = rb_define_class_under(rb_mCU, "CUTexRefFlags", rb_cObject);
2309 |     rb_define_const(rb_cCUTexRefFlags, "READ_AS_INTEGER", INT2FIX(CU_TRSF_READ_AS_INTEGER));
2310 |     rb_define_const(rb_cCUTexRefFlags, "NORMALIZED_COORDINATES", INT2FIX(CU_TRSF_NORMALIZED_COORDINATES));
2311 | 
2312 |     rb_cCUTexRef = rb_define_class_under(rb_mCU, "CUTexRef", rb_cObject);
2313 |     rb_define_alloc_func(rb_cCUTexRef, texref_alloc);
2314 |     rb_define_method(rb_cCUTexRef, "initialize", RUBY_METHOD_FUNC(texref_initialize), 0);
2315 |     rb_define_method(rb_cCUTexRef, "create", RUBY_METHOD_FUNC(texref_create), 0);
2316 |     rb_define_method(rb_cCUTexRef, "destroy", RUBY_METHOD_FUNC(texref_destroy), 0);
2317 |     rb_define_method(rb_cCUTexRef, "get_address", RUBY_METHOD_FUNC(texref_get_address), 0);
2318 |     rb_define_method(rb_cCUTexRef, "get_address_mode", RUBY_METHOD_FUNC(texref_get_address_mode), 1);
2319 |     rb_define_method(rb_cCUTexRef, "get_filter_mode", RUBY_METHOD_FUNC(texref_get_filter_mode), 0);
2320 |     rb_define_method(rb_cCUTexRef, "get_flags", RUBY_METHOD_FUNC(texref_get_flags), 0);
2321 |     rb_define_method(rb_cCUTexRef, "set_address", RUBY_METHOD_FUNC(texref_set_address), 2);
2322 |     rb_define_method(rb_cCUTexRef, "set_address_mode", RUBY_METHOD_FUNC(texref_set_address_mode), 2);
2323 |     rb_define_method(rb_cCUTexRef, "set_filter_mode", RUBY_METHOD_FUNC(texref_set_filter_mode), 1);
2324 |     rb_define_method(rb_cCUTexRef, "set_flags", RUBY_METHOD_FUNC(texref_set_flags), 1);
2325 | 
2326 |     rb_cCUResult = rb_define_class_under(rb_mCU, "CUResult", rb_cObject);
2327 |     rb_define_const(rb_cCUResult, "SUCCESS", INT2FIX(CUDA_SUCCESS));
2328 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_VALUE", INT2FIX(CUDA_ERROR_INVALID_VALUE));
2329 |     rb_define_const(rb_cCUResult, "ERROR_OUT_OF_MEMORY", INT2FIX(CUDA_ERROR_OUT_OF_MEMORY));
2330 |     rb_define_const(rb_cCUResult, "ERROR_NOT_INITIALIZED", INT2FIX(CUDA_ERROR_NOT_INITIALIZED));
2331 |     rb_define_const(rb_cCUResult, "ERROR_DEINITIALIZED", INT2FIX(CUDA_ERROR_DEINITIALIZED));
2332 |     rb_define_const(rb_cCUResult, "ERROR_NO_DEVICE", INT2FIX(CUDA_ERROR_NO_DEVICE));
2333 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_DEVICE", INT2FIX(CUDA_ERROR_INVALID_DEVICE));
2334 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_IMAGE", INT2FIX(CUDA_ERROR_INVALID_IMAGE));
2335 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_CONTEXT", INT2FIX(CUDA_ERROR_INVALID_CONTEXT));
2336 |     rb_define_const(rb_cCUResult, "ERROR_CONTEXT_ALREADY_CURRENT", INT2FIX(CUDA_ERROR_CONTEXT_ALREADY_CURRENT));
2337 |     rb_define_const(rb_cCUResult, "ERROR_MAP_FAILED", INT2FIX(CUDA_ERROR_MAP_FAILED));
2338 |     rb_define_const(rb_cCUResult, "ERROR_UNMAP_FAILED", INT2FIX(CUDA_ERROR_UNMAP_FAILED));
2339 |     rb_define_const(rb_cCUResult, "ERROR_ARRAY_IS_MAPPED", INT2FIX(CUDA_ERROR_ARRAY_IS_MAPPED));
2340 |     rb_define_const(rb_cCUResult, "ERROR_ALREADY_MAPPED", INT2FIX(CUDA_ERROR_ALREADY_MAPPED));
2341 |     rb_define_const(rb_cCUResult, "ERROR_NO_BINARY_FOR_GPU", INT2FIX(CUDA_ERROR_NO_BINARY_FOR_GPU));
2342 |     rb_define_const(rb_cCUResult, "ERROR_ALREADY_ACQUIRED", INT2FIX(CUDA_ERROR_ALREADY_ACQUIRED));
2343 |     rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED", INT2FIX(CUDA_ERROR_NOT_MAPPED));
2344 |     rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED_AS_ARRAY", INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_ARRAY));
2345 |     rb_define_const(rb_cCUResult, "ERROR_NOT_MAPPED_AS_POINTER", INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_POINTER));
2346 |     rb_define_const(rb_cCUResult, "ERROR_ECC_UNCORRECTABLE", INT2FIX(CUDA_ERROR_ECC_UNCORRECTABLE));
2347 |     rb_define_const(rb_cCUResult, "ERROR_UNSUPPORTED_LIMIT", INT2FIX(CUDA_ERROR_UNSUPPORTED_LIMIT));
2348 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_SOURCE", INT2FIX(CUDA_ERROR_INVALID_SOURCE));
2349 |     rb_define_const(rb_cCUResult, "ERROR_FILE_NOT_FOUND", INT2FIX(CUDA_ERROR_FILE_NOT_FOUND));
2350 |     rb_define_const(rb_cCUResult, "ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", INT2FIX(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND));
2351 |     rb_define_const(rb_cCUResult, "ERROR_SHARED_OBJECT_INIT_FAILED", INT2FIX(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED));
2352 |     rb_define_const(rb_cCUResult, "ERROR_OPERATING_SYSTEM", INT2FIX(CUDA_ERROR_OPERATING_SYSTEM));
2353 |     rb_define_const(rb_cCUResult, "ERROR_INVALID_HANDLE", INT2FIX(CUDA_ERROR_INVALID_HANDLE));
2354 |     rb_define_const(rb_cCUResult, "ERROR_NOT_FOUND", INT2FIX(CUDA_ERROR_NOT_FOUND));
2355 |     rb_define_const(rb_cCUResult, "ERROR_NOT_READY", INT2FIX(CUDA_ERROR_NOT_READY));
2356 |     rb_define_const(rb_cCUResult, "ERROR_LAUNCH_FAILED", INT2FIX(CUDA_ERROR_LAUNCH_FAILED));
2357 |     rb_define_const(rb_cCUResult, "ERROR_LAUNCH_OUT_OF_RESOURCES", INT2FIX(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES));
2358 |     rb_define_const(rb_cCUResult, "ERROR_LAUNCH_TIMEOUT", INT2FIX(CUDA_ERROR_LAUNCH_TIMEOUT));
2359 |     rb_define_const(rb_cCUResult, "ERROR_LAUNCH_INCOMPATIBLE_TEXTURING" , INT2FIX(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING));
2360 |     rb_define_const(rb_cCUResult, "ERROR_UNKNOWN", INT2FIX(CUDA_ERROR_UNKNOWN));
2361 | 
2362 |     rb_eCUStandardError = rb_define_class_under(rb_mCU, "CUStandardError", rb_eStandardError);
2363 | 
2364 |     rb_eCUDeviceError               = rb_define_class_under(rb_mCU, "CUDeviceError", rb_eCUStandardError);
2365 |     rb_eCUDeviceNotInitializedError = rb_define_class_under(rb_mCU, "CUDeviceNotInitializedError", rb_eCUDeviceError);
2366 |     rb_eCUDeviceDeinitializedError  = rb_define_class_under(rb_mCU, "CUDeviceDeinitializedError", rb_eCUDeviceError);
2367 |     rb_eCUNoDeviceError             = rb_define_class_under(rb_mCU, "CUNoDeviceError", rb_eCUDeviceError);
2368 |     rb_eCUInvalidDeviceError        = rb_define_class_under(rb_mCU, "CUInvalidDeviceError", rb_eCUDeviceError);
2369 | 
2370 |     rb_eCUMapError                = rb_define_class_under(rb_mCU, "CUMapError", rb_eCUStandardError);
2371 |     rb_eCUMapFailedError          = rb_define_class_under(rb_mCU, "CUMapFailedError", rb_eCUMapError);
2372 |     rb_eCUUnMapFailedError        = rb_define_class_under(rb_mCU, "CUUnMapFailedError", rb_eCUMapError);
2373 |     rb_eCUArrayIsMappedError      = rb_define_class_under(rb_mCU, "CUArrayIsMappedError", rb_eCUMapError);
2374 |     rb_eCUAlreadyMappedError      = rb_define_class_under(rb_mCU, "CUAlreadyMappedError", rb_eCUMapError);
2375 |     rb_eCUNotMappedError          = rb_define_class_under(rb_mCU, "CUNotMappedError", rb_eCUMapError);
2376 |     rb_eCUNotMappedAsArrayError   = rb_define_class_under(rb_mCU, "CUNotMappedAsArrayError", rb_eCUMapError);
2377 |     rb_eCUNotMappedAsPointerError = rb_define_class_under(rb_mCU, "CUNotMappedAsPointerError", rb_eCUMapError);
2378 | 
2379 |     rb_eCUContextError               = rb_define_class_under(rb_mCU, "CUContextError", rb_eCUStandardError);
2380 |     rb_eCUInvalidContextError        = rb_define_class_under(rb_mCU, "CUInvalidContextError", rb_eCUContextError);
2381 |     rb_eCUContextAlreadyCurrentError = rb_define_class_under(rb_mCU, "CUContextAlreadyCurrentError", rb_eCUContextError);
2382 |     rb_eCUUnsupportedLimitError      = rb_define_class_under(rb_mCU, "CUUnsupportedLimitError", rb_eCUContextError);
2383 | 
2384 |     rb_eCULaunchError                      = rb_define_class_under(rb_mCU, "CULaunchError", rb_eCUStandardError);
2385 |     rb_eCULaunchFailedError                = rb_define_class_under(rb_mCU, "CULaunchFailedError", rb_eCULaunchError);
2386 |     rb_eCULaunchOutOfResourcesError        = rb_define_class_under(rb_mCU, "CULaunchOutOfResourcesError", rb_eCULaunchError);
2387 |     rb_eCULaunchTimeoutError               = rb_define_class_under(rb_mCU, "CULaunchTimeoutError", rb_eCULaunchError);
2388 |     rb_eCULaunchIncompatibleTexturingError = rb_define_class_under(rb_mCU, "CULaunchIncompatibleTexturingError", rb_eCULaunchError);
2389 | 
2390 |     rb_eCUParameterError     = rb_define_class_under(rb_mCU, "CUParameterError", rb_eCUStandardError);
2391 |     rb_eCUInvalidValueError  = rb_define_class_under(rb_mCU, "CUInvalidValueError", rb_eCUParameterError);
2392 |     rb_eCUInvalidHandleError = rb_define_class_under(rb_mCU, "CUInvalidHandleError", rb_eCUParameterError);
2393 | 
2394 |     rb_eCUMemoryError      = rb_define_class_under(rb_mCU, "CUMemoryError", rb_eCUStandardError);
2395 |     rb_eCUOutOfMemoryError = rb_define_class_under(rb_mCU, "CUOutOfMemoryError", rb_eCUMemoryError);
2396 | 
2397 |     rb_eCULibraryError                    = rb_define_class_under(rb_mCU, "CULibraryError", rb_eCUStandardError);
2398 |     rb_eCUSharedObjectSymbolNotFoundError = rb_define_class_under(rb_mCU, "CUSharedObjectSymbolNotFoundError", rb_eCULibraryError);
2399 |     rb_eCUSharedObjectInitFailedError     = rb_define_class_under(rb_mCU, "CUSharedObjectInitFailedError", rb_eCULibraryError);
2400 | 
2401 |     rb_eCUHardwareError         = rb_define_class_under(rb_mCU, "CUHardwareError", rb_eCUStandardError);
2402 |     rb_eCUECCUncorrectableError = rb_define_class_under(rb_mCU, "CUECCUncorrectableError", rb_eCUHardwareError);
2403 | 
2404 |     rb_eCUFileError           = rb_define_class_under(rb_mCU, "CUFileError", rb_eCUStandardError);
2405 |     rb_eCUNoBinaryForGPUError = rb_define_class_under(rb_mCU, "CUNoBinaryForGPUError", rb_eCUFileError);
2406 |     rb_eCUFileNotFoundError   = rb_define_class_under(rb_mCU, "CUFileNotFoundError", rb_eCUFileError);
2407 |     rb_eCUInvalidSourceError  = rb_define_class_under(rb_mCU, "CUInvalidSourceError", rb_eCUFileError);
2408 |     rb_eCUInvalidImageError   = rb_define_class_under(rb_mCU, "CUInvalidImageError", rb_eCUFileError);
2409 | 
2410 |     rb_eCUReferenceError         = rb_define_class_under(rb_mCU, "CUReferenceError", rb_eCUStandardError);
2411 |     rb_eCUReferenceNotFoundError = rb_define_class_under(rb_mCU, "CUReferenceNotFoundError", rb_eCUReferenceError);
2412 | 
2413 |     rb_eCUOtherError           = rb_define_class_under(rb_mCU, "CUOtherError", rb_eCUStandardError);
2414 |     rb_eCUAlreadyAcquiredError = rb_define_class_under(rb_mCU, "CUAlreadyAcquiredError", rb_eCUOtherError);
2415 |     rb_eCUNotReadyError        = rb_define_class_under(rb_mCU, "CUNotReadyError", rb_eCUOtherError);
2416 |     rb_eCUOperatingSystemError = rb_define_class_under(rb_mCU, "CUOperatingSystemError", rb_eCUOtherError);
2417 | 
2418 |     rb_eCUUnknownError = rb_define_class_under(rb_mCU, "CUUnknownError", rb_eCUStandardError);
2419 | 
2420 |     rb_error_class_by_enum = rb_hash_new();
2421 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_INITIALIZED), rb_eCUDeviceNotInitializedError);
2422 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_DEINITIALIZED)  , rb_eCUDeviceDeinitializedError);
2423 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NO_DEVICE)      , rb_eCUNoDeviceError);
2424 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_DEVICE) , rb_eCUInvalidDeviceError);
2425 | 
2426 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_MAP_FAILED)           , rb_eCUMapFailedError);
2427 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNMAP_FAILED)         , rb_eCUUnMapFailedError);
2428 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ARRAY_IS_MAPPED)      , rb_eCUArrayIsMappedError);
2429 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ALREADY_MAPPED)       , rb_eCUAlreadyMappedError);
2430 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED)           , rb_eCUNotMappedError);
2431 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_ARRAY)  , rb_eCUNotMappedAsArrayError);
2432 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_MAPPED_AS_POINTER), rb_eCUNotMappedAsPointerError);
2433 | 
2434 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_CONTEXT)        , rb_eCUInvalidContextError);
2435 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_CONTEXT_ALREADY_CURRENT), rb_eCUContextAlreadyCurrentError);
2436 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNSUPPORTED_LIMIT)      , rb_eCUUnsupportedLimitError);
2437 | 
2438 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_FAILED)                , rb_eCULaunchFailedError);
2439 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES)      , rb_eCULaunchOutOfResourcesError);
2440 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_TIMEOUT)               , rb_eCULaunchTimeoutError);
2441 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING), rb_eCULaunchIncompatibleTexturingError);
2442 | 
2443 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_VALUE)  , rb_eCUInvalidValueError);
2444 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_HANDLE) , rb_eCUInvalidHandleError);
2445 | 
2446 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_OUT_OF_MEMORY), rb_eCUOutOfMemoryError);
2447 | 
2448 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND), rb_eCUSharedObjectSymbolNotFoundError);
2449 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED)     , rb_eCUSharedObjectInitFailedError);
2450 | 
2451 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ECC_UNCORRECTABLE), rb_eCUECCUncorrectableError);
2452 | 
2453 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NO_BINARY_FOR_GPU), rb_eCUNoBinaryForGPUError);
2454 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_FILE_NOT_FOUND)   , rb_eCUFileNotFoundError);
2455 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_SOURCE)   , rb_eCUInvalidSourceError);
2456 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_INVALID_IMAGE)    , rb_eCUInvalidImageError);
2457 | 
2458 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_FOUND), rb_eCUReferenceNotFoundError);
2459 | 
2460 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_ALREADY_ACQUIRED), rb_eCUAlreadyAcquiredError);
2461 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_NOT_READY)       , rb_eCUNotReadyError);
2462 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_OPERATING_SYSTEM), rb_eCUOperatingSystemError);
2463 | 
2464 |     rb_hash_aset(rb_error_class_by_enum, INT2FIX(CUDA_ERROR_UNKNOWN), rb_eCUUnknownError);
2465 | 
2466 |     rb_cMemoryPointer = rb_define_class_under(rb_mCU, "MemoryPointer", rb_cObject);
2467 |     rb_define_alloc_func(rb_cMemoryPointer, memory_pointer_alloc);
2468 |     rb_define_method(rb_cMemoryPointer, "initialize", RUBY_METHOD_FUNC(memory_pointer_initialize), 0);
2469 | 
2470 |     rb_mIBuffer = rb_define_module_under(rb_mCU, "IBuffer");
2471 |     rb_define_singleton_method(rb_mIBuffer, "included", RUBY_METHOD_FUNC(module_included_classmethods_hook), 1);
2472 |     rb_define_method(rb_mIBuffer, "initialize", RUBY_METHOD_FUNC(ibuffer_initialize), -1);
2473 |     rb_define_method(rb_mIBuffer, "size", RUBY_METHOD_FUNC(ibuffer_size), 0);
2474 |     rb_define_method(rb_mIBuffer, "page_locked?", RUBY_METHOD_FUNC(ibuffer_is_page_locked), 0);
2475 |     rb_define_method(rb_mIBuffer, "offset", RUBY_METHOD_FUNC(ibuffer_offset), 1);
2476 |     rb_define_method(rb_mIBuffer, "[]", RUBY_METHOD_FUNC(ibuffer_element_get), 1);
2477 |     rb_define_method(rb_mIBuffer, "[]=", RUBY_METHOD_FUNC(ibuffer_element_set), 2);
2478 | 
2479 |     rb_mIBufferClassMethods = rb_define_module_under(rb_mIBuffer, "ClassMethods");
2480 |     rb_define_method(rb_mIBufferClassMethods, "element_size", RUBY_METHOD_FUNC(ibuffer_element_size), 0);
2481 | 
2482 |     rb_cMemoryBuffer = rb_define_class_under(rb_mCU, "MemoryBuffer", rb_cMemoryPointer);
2483 |     rb_include_module(rb_cMemoryBuffer, rb_mIBuffer);
2484 |     module_included_classmethods_hook(rb_mIBuffer, rb_cMemoryBuffer);
2485 |     rb_define_alloc_func(rb_cMemoryBuffer, memory_buffer_alloc);
2486 |     rb_define_singleton_method(rb_cMemoryBuffer, "element_size", RUBY_METHOD_FUNC(memory_buffer_element_size), 0);
2487 |     rb_define_method(rb_cMemoryBuffer, "initialize", RUBY_METHOD_FUNC(memory_buffer_initialize), -1);
2488 |     rb_define_method(rb_cMemoryBuffer, "size", RUBY_METHOD_FUNC(memory_buffer_size), 0);
2489 |     rb_define_method(rb_cMemoryBuffer, "page_locked?", RUBY_METHOD_FUNC(memory_buffer_is_page_locked), 0);
2490 |     rb_define_method(rb_cMemoryBuffer, "offset", RUBY_METHOD_FUNC(memory_buffer_offset), 1);
2491 |     rb_define_method(rb_cMemoryBuffer, "[]", RUBY_METHOD_FUNC(memory_buffer_element_get), 1);
2492 |     rb_define_method(rb_cMemoryBuffer, "[]=", RUBY_METHOD_FUNC(memory_buffer_element_set), 2);
2493 | 
2494 |     rb_cInt32Buffer = rb_define_class_under(rb_mCU, "Int32Buffer", rb_cMemoryBuffer);
2495 |     rb_define_alloc_func(rb_cInt32Buffer, buffer_alloc<int>);
2496 |     rb_define_const(rb_cInt32Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(int)));
2497 |     rb_define_singleton_method(rb_cInt32Buffer, "element_size", RUBY_METHOD_FUNC(static_cast<BufferElementSizeFunctionType>(&buffer_element_size<int>)), 0);
2498 |     rb_define_method(rb_cInt32Buffer, "initialize", RUBY_METHOD_FUNC(static_cast<BufferInitializeFunctionType>(&buffer_initialize<int>)) , -1);
2499 |     rb_define_method(rb_cInt32Buffer, "size", RUBY_METHOD_FUNC(static_cast<BufferSizeFunctionType>(&buffer_size<int>)), 0);
2500 |     rb_define_method(rb_cInt32Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast<BufferIsPageLocked>(&buffer_is_page_locked<int>)), 0);
2501 |     rb_define_method(rb_cInt32Buffer, "offset", RUBY_METHOD_FUNC(static_cast<BufferOffsetFunctionType>(&buffer_offset<int>)), 1);
2502 |     rb_define_method(rb_cInt32Buffer, "[]", RUBY_METHOD_FUNC(static_cast<BufferElementGetFunctionType>(&buffer_element_get<int>)), 1);
2503 |     rb_define_method(rb_cInt32Buffer, "[]=", RUBY_METHOD_FUNC(static_cast<BufferElementSetFunctionType>(&buffer_element_set<int>)), 2);
2504 | 
2505 |     rb_cInt64Buffer = rb_define_class_under(rb_mCU, "Int64Buffer", rb_cMemoryBuffer);
2506 |     rb_define_alloc_func(rb_cInt64Buffer, buffer_alloc<long>);
2507 |     rb_define_const(rb_cInt64Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(long)));
2508 |     rb_define_singleton_method(rb_cInt64Buffer, "element_size", RUBY_METHOD_FUNC(static_cast<BufferElementSizeFunctionType>(&buffer_element_size<long>)), 0);
2509 |     rb_define_method(rb_cInt64Buffer, "initialize", RUBY_METHOD_FUNC(static_cast<BufferInitializeFunctionType>(&buffer_initialize<long>)) , -1);
2510 |     rb_define_method(rb_cInt64Buffer, "size", RUBY_METHOD_FUNC(static_cast<BufferSizeFunctionType>(&buffer_size<long>)), 0);
2511 |     rb_define_method(rb_cInt64Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast<BufferIsPageLocked>(&buffer_is_page_locked<long>)), 0);
2512 |     rb_define_method(rb_cInt64Buffer, "offset", RUBY_METHOD_FUNC(static_cast<BufferOffsetFunctionType>(&buffer_offset<long>)), 1);
2513 |     rb_define_method(rb_cInt64Buffer, "[]", RUBY_METHOD_FUNC(static_cast<BufferElementGetFunctionType>(&buffer_element_get<long>)), 1);
2514 |     rb_define_method(rb_cInt64Buffer, "[]=", RUBY_METHOD_FUNC(static_cast<BufferElementSetFunctionType>(&buffer_element_set<long>)), 2);
2515 | 
2516 |     rb_cFloat32Buffer = rb_define_class_under(rb_mCU, "Float32Buffer", rb_cMemoryBuffer);
2517 |     rb_define_alloc_func(rb_cFloat32Buffer, buffer_alloc<float>);
2518 |     rb_define_const(rb_cFloat32Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(float)));
2519 |     rb_define_singleton_method(rb_cFloat32Buffer, "element_size", RUBY_METHOD_FUNC(static_cast<BufferElementSizeFunctionType>(&buffer_element_size<float>)), 0);
2520 |     rb_define_method(rb_cFloat32Buffer, "initialize", RUBY_METHOD_FUNC(static_cast<BufferInitializeFunctionType>(&buffer_initialize<float>)) , -1);
2521 |     rb_define_method(rb_cFloat32Buffer, "size", RUBY_METHOD_FUNC(static_cast<BufferSizeFunctionType>(&buffer_size<float>)), 0);
2522 |     rb_define_method(rb_cFloat32Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast<BufferIsPageLocked>(&buffer_is_page_locked<float>)), 0);
2523 |     rb_define_method(rb_cFloat32Buffer, "offset", RUBY_METHOD_FUNC(static_cast<BufferOffsetFunctionType>(&buffer_offset<float>)), 1);
2524 |     rb_define_method(rb_cFloat32Buffer, "[]", RUBY_METHOD_FUNC(static_cast<BufferElementGetFunctionType>(&buffer_element_get<float>)), 1);
2525 |     rb_define_method(rb_cFloat32Buffer, "[]=", RUBY_METHOD_FUNC(static_cast<BufferElementSetFunctionType>(&buffer_element_set<float>)), 2);
2526 | 
2527 |     rb_cFloat64Buffer = rb_define_class_under(rb_mCU, "Float64Buffer", rb_cMemoryBuffer);
2528 |     rb_define_alloc_func(rb_cFloat64Buffer, buffer_alloc<double>);
2529 |     rb_define_const(rb_cFloat64Buffer, "ELEMENT_SIZE", INT2FIX(sizeof(double)));
2530 |     rb_define_method(rb_cFloat64Buffer, "initialize", RUBY_METHOD_FUNC(static_cast<BufferInitializeFunctionType>(&buffer_initialize<double>)) , -1);
2531 |     rb_define_singleton_method(rb_cFloat64Buffer, "element_size", RUBY_METHOD_FUNC(static_cast<BufferElementSizeFunctionType>(&buffer_element_size<double>)), 0);
2532 |     rb_define_method(rb_cFloat64Buffer, "size", RUBY_METHOD_FUNC(static_cast<BufferSizeFunctionType>(&buffer_size<double>)), 0);
2533 |     rb_define_method(rb_cFloat64Buffer, "page_locked?", RUBY_METHOD_FUNC(static_cast<BufferIsPageLocked>(&buffer_is_page_locked<double>)), 0);
2534 |     rb_define_method(rb_cFloat64Buffer, "offset", RUBY_METHOD_FUNC(static_cast<BufferOffsetFunctionType>(&buffer_offset<double>)), 1);
2535 |     rb_define_method(rb_cFloat64Buffer, "[]", RUBY_METHOD_FUNC(static_cast<BufferElementGetFunctionType>(&buffer_element_get<double>)), 1);
2536 |     rb_define_method(rb_cFloat64Buffer, "[]=", RUBY_METHOD_FUNC(static_cast<BufferElementSetFunctionType>(&buffer_element_set<double>)), 2);
2537 | 
2538 |     rb_define_module_function(rb_mCU, "memcpy_htod", RUBY_METHOD_FUNC(memcpy_htod), 3);
2539 |     rb_define_module_function(rb_mCU, "memcpy_dtoh", RUBY_METHOD_FUNC(memcpy_dtoh), 3);
2540 |     rb_define_module_function(rb_mCU, "memcpy_dtod", RUBY_METHOD_FUNC(memcpy_dtod), 3);
2541 |     rb_define_module_function(rb_mCU, "memcpy_htod_async", RUBY_METHOD_FUNC(memcpy_htod_async), 4);
2542 |     rb_define_module_function(rb_mCU, "memcpy_dtoh_async", RUBY_METHOD_FUNC(memcpy_dtoh_async), 4);
2543 |     rb_define_module_function(rb_mCU, "memcpy_dtod_async", RUBY_METHOD_FUNC(memcpy_dtod_async), 4);
2544 |     rb_define_module_function(rb_mCU, "mem_get_info", RUBY_METHOD_FUNC(mem_get_info), 0);
2545 | 
2546 |     rb_define_module_function(rb_mCU, "driver_get_version", RUBY_METHOD_FUNC(driver_get_version), 0);
2547 | 
2548 |     CUresult status = cuInit(0);
2549 |     if (status != CUDA_SUCCESS) {
2550 |         RAISE_CU_STD_ERROR(status, "Failed to initialize the CUDA driver API.");
2551 |     }
2552 | }
2553 | 
2554 | } // namespace
2555 | } // namespace
2556 | 


--------------------------------------------------------------------------------