├── README.md
├── gpuSparse.m
├── private
│   ├── coo2csr.cu
│   ├── coosortByRow.cu
│   ├── csr2coo.cu
│   ├── csr2csc.cu
│   ├── csr2csc_cpu.cu
│   ├── csrgeam.cu
│   ├── csrmm.cu
│   ├── csrmv.cu
│   ├── csrsort.cu
│   ├── mex_all.m
│   ├── mxShowCriticalErrorMessage.h
│   └── wrappers_to_cuda_11.h
└── test_gpuSparse.m
/README.md:
--------------------------------------------------------------------------------
1 | # gpuSparse
2 |
3 | Matlab mex wrappers to NVIDIA cuSPARSE (https://developer.nvidia.com/cusparse).
4 |
5 |
6 | Uses int32 and single precision to save memory (Matlab sparse uses int64 and double).
7 |
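As a rough back-of-envelope comparison, the per-nonzero storage (ignoring the row/column pointer arrays) is roughly halved for real values:

```matlab
% approximate bytes per stored nonzero (real values, layouts as described above)
gpuSparse_bytes = 4 + 4;   % int32 column index + single value  ->  8 bytes
matlab_bytes    = 8 + 8;   % int64 row index    + double value  -> 16 bytes
```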
8 |
9 | ## Installation
10 |
11 |
12 | 1. Save in a folder called @gpuSparse on the Matlab path
13 |
14 | 2. ```A = gpuSparse('recompile')``` to trigger compilation of mex
15 |
16 | 3. Recommended: CUDA-11 for much faster transpose-multiply
17 |
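## Usage

A minimal sketch of the constructor and multiplies (illustrative values; the constructor syntax follows the help text in gpuSparse.m):

```matlab
i = [1 2 2 3];                    % row indices
j = [1 1 3 2];                    % column indices
v = single([4 5 6 7]);            % values (stored as single)
A = gpuSparse(i,j,v,3,3);         % 3 x 3 sparse matrix on the GPU

x = gpuArray(single(rand(3,1)));
y = A  * x;                       % multiply
z = A' * y;                       % (conjugate) transpose multiply
B = gather(A);                    % back to a Matlab sparse matrix on the CPU
```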
18 | ## Timings
19 |
20 | Due to the memory layout (row- vs column-major), multiply and transpose-multiply differ in performance.
21 |
22 | size(A) = 221,401 x 213,331
23 | nnz(A) = 23,609,791 (0.05%)
24 | AT = precomputed transpose of A
25 |
26 | CPU sparse
27 | A*x (sparse) : Elapsed time is 1.370207 seconds.
28 | AT*y (sparse) : Elapsed time is 1.347447 seconds.
29 | A'*y (sparse) : Elapsed time is 0.267259 seconds.
30 |
31 | GPU sparse
32 | A*x (gpuArray) : Elapsed time is 0.137195 seconds.
33 | AT*y (gpuArray) : Elapsed time is 0.106331 seconds.
34 | A'*y (gpuArray) : Elapsed time is 0.232057 seconds. (CUDA 11)
35 | A'*y (gpuArray) : Elapsed time is 16.733638 seconds. (CUDA < 11)
36 |
37 | GPU gpuSparse
38 | A*x (gpuSparse): Elapsed time is 0.068451 seconds.
39 | AT*y (gpuSparse): Elapsed time is 0.063651 seconds.
40 | A'*y (gpuSparse): Elapsed time is 0.059236 seconds. (CUDA 11)
41 | A'*y (gpuSparse): Elapsed time is 3.094271 seconds. (CUDA < 11)
42 |
43 |
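If A'*y is on the critical path and CUDA 11 is not available, the transpose can be precomputed once with full_transpose and reused (this is what the AT rows above measure):

```matlab
AT = full_transpose(A);   % one-off explicit transpose (A, x as in the Usage sketch above)
for iter = 1:10           % illustrative iteration count, e.g. inside an iterative solver
    y = A  * x;           % forward multiply
    x = AT * y;           % reuse AT instead of the slower A'*y
end
```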
--------------------------------------------------------------------------------
/gpuSparse.m:
--------------------------------------------------------------------------------
1 | classdef gpuSparse
2 | %%
3 | % Sparse GPU array class (mex wrappers to cuSPARSE)
4 | % using int32 indices and single precision values.
5 | %
6 | % Usage: A = gpuSparse(row,col,val,nrows,ncols,nzmax)
7 | %
8 | % To recompile mex call gpuSparse('recompile')
9 | %
10 | % The nzmax argument can be used to check sufficient
11 | % memory: gpuSparse([],[],[],nrows,ncols,nzmax)
12 | %
13 | %%
14 | properties (SetAccess = private) %immutable)
15 |
16 | nrows(1,1) int32 % number of rows
17 | ncols(1,1) int32 % number of columns
18 |
19 | end
20 |
21 | properties (SetAccess = private, Hidden = true)
22 |
23 | row(:,1) gpuArray % int32 row index (CSR format)
24 | col(:,1) gpuArray % int32 column index
25 | val(:,1) gpuArray % single precision values
26 | trans(1,1) int32 % lazy transpose flag (passed to cuSPARSE)
27 | % 0 = CUSPARSE_OPERATION_NON_TRANSPOSE
28 | % 1 = CUSPARSE_OPERATION_TRANSPOSE
29 | % 2 = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
30 |
31 | end
32 |
33 | %%
34 | methods
35 |
36 | %% constructor: same syntax as matlab's sparse
37 | function A = gpuSparse(row,col,val,nrows,ncols,nzmax)
38 |
39 | % empty gpuSparse matrix
40 | if nargin==0
41 | row = []; col = []; val = [];
42 | end
43 |
44 | % expecting a matrix, return gpuSparse ("row" is the first argument)
45 | if nargin==1
46 | if isa(row,'gpuSparse'); A = row; return; end % return unchanged
47 | if isequal(row,'recompile'); mex_all; return; end % recompile mex
48 | if ~isnumeric(row) && ~islogical(row); error('Cannot convert ''%s'' to gpuSparse.',class(row)); end
49 | if ~ismatrix(row); error('Cannot convert ND array to gpuSparse.'); end
50 | [nrows ncols] = size(row);
51 | [row col val] = find(row); % if sparse, could grab the CSR vectors directly but needs mex = hassle
52 | end
53 |
54 | % empty m x n matrix
55 | if nargin==2
56 | nrows = row; ncols = col;
57 | row = []; col = []; val = [];
58 | end
59 |
60 | % catch illegal no. arguments
61 | if nargin==4 || nargin>6
62 | error('Wrong number of arguments.');
63 | end
64 |
65 | % validate argument types
66 | validateattributes(row,{'numeric','gpuArray'},{'integer'},'','row');
67 | validateattributes(col,{'numeric','gpuArray'},{'integer'},'','col');
68 | validateattributes(val,{'numeric','gpuArray','logical'},{},'','val');
69 |
70 | % check vector lengths
71 | row = reshape(row,[],1);
72 | col = reshape(col,[],1);
73 | val = reshape(val,[],1);
74 | if numel(row)~=numel(col)
75 | error('Vectors must be the same length (row=%i col=%i).',numel(row),numel(col));
76 | end
77 | if numel(val)~=numel(row)
78 | if numel(val)==1
79 | val = repmat(val,numel(row),1);
80 | else
81 | error('Vectors must be the same length (row=%i val=%i).',numel(row),numel(val));
82 | end
83 | end
84 |
85 | % check bounds of indices
86 | if numel(row) > 0
87 | A.nrows = gather(max(row));
88 | if min(row)<1 || A.nrows==intmax('int32')
89 | error('row indices must be between 1 and %i.',intmax('int32')-1);
90 | end
91 | A.ncols = gather(max(col));
92 | if min(col)<1 || A.ncols==intmax('int32')
93 | error('col indices must be between 1 and %i.',intmax('int32')-1);
94 | end
95 | end
96 |
97 | % check and apply user-supplied matrix dims
98 | if exist('nrows','var')
99 | nrows = gather(nrows);
100 | validateattributes(nrows,{'numeric'},{'scalar','integer','>=',A.nrows,'<',intmax('int32')},'','nrows');
101 | A.nrows = nrows;
102 | end
103 | if exist('ncols','var')
104 | ncols = gather(ncols);
105 | validateattributes(ncols,{'numeric'},{'scalar','integer','>=',A.ncols,'<',intmax('int32')},'','ncols');
106 | A.ncols = ncols;
107 | end
108 |
109 | % simple memory check - needs work
110 | if ~exist('nzmax','var')
111 | nzmax = numel(val);
112 | else
113 | nzmax = gather(nzmax);
114 | validateattributes(nzmax,{'numeric'},{'scalar','integer','>=',numel(val)},'','nzmax');
115 | end
116 | RequiredMemory = 4*double(A.nrows+1)/1E9;
117 | RequiredMemory = RequiredMemory+4*double(nzmax)/1E9;
118 | RequiredMemory = RequiredMemory+4*double(nzmax)/1E9;
119 | AvailableMemory = getfield(gpuDevice(),'AvailableMemory') / 1E9;
120 | if RequiredMemory > AvailableMemory
121 | error('Not enough memory (%.1fGb required, %.1fGb available).',RequiredMemory,AvailableMemory);
122 | end
123 |
124 | % cast to required class
125 | row = int32(row);
126 | col = int32(col);
127 | val = single(val);
128 |
129 | % sort row and col for COO to CSR conversion (MATLAB version)
130 | %[B I] = sortrows([row col]);
131 | %A.row = B(:,1);
132 | %A.col = B(:,2);
133 | %A.val = val(I);
134 | %clear B I row col val
135 |
136 | % sort row and col for COO to CSR conversion (CUDA version)
137 | try
138 | [A.row A.col A.val] = coosortByRow(row,col,val,A.nrows,A.ncols);
139 | catch ME
140 | error('%s Try gpuSparse(''recompile'') to recompile mex.',ME.message);
141 | end
142 |
143 | % convert from COO to CSR
144 | A.row = coo2csr(A.row,A.nrows);
145 |
146 | end
147 |
148 | %% enforce some class properties - inexpensive checks only
149 | function A = set.row(A,row)
150 | if ~iscolumn(row) || ~isequal(classUnderlying(row),'int32')
151 | error('Property row must be a column vector of int32s.')
152 | end
153 | A.row = row;
154 | end
155 | function A = set.col(A,col)
156 | if ~iscolumn(col) || ~isequal(classUnderlying(col),'int32')
157 | error('Property col must be a column vector of int32s.')
158 | end
159 | A.col = col;
160 | end
161 | function A = set.val(A,val)
162 | if ~iscolumn(val) || ~isequal(classUnderlying(val),'single')
163 | error('Property val must be a column vector of singles.')
164 | end
165 | A.val = val;
166 | end
167 | function A = set.trans(A,trans)
168 | if trans~=0 && trans~=1 && trans~=2
169 | error('Property trans must be 0, 1 or 2.')
170 | end
171 | if isreal(A) && trans==2
172 | error('Real matrix trans flag must be 0 or 1');
173 | end
174 | A.trans = trans;
175 | end
176 |
177 | %% validation - helpful for testing
178 | function validate(A)
179 |
180 | message = 'Validation failure.';
181 |
182 | % fast checks
183 | if ~isa(A.nrows,'int32'); error(message); end
184 | if ~isa(A.ncols,'int32'); error(message); end
185 | if ~isa(A.trans,'int32'); error(message); end
186 | if ~isa(A.row,'gpuArray'); error(message); end
187 | if ~isa(A.col,'gpuArray'); error(message); end
188 | if ~isa(A.val,'gpuArray'); error(message); end
189 | if ~isequal(classUnderlying(A.row),'int32'); error(message); end
190 | if ~isequal(classUnderlying(A.col),'int32'); error(message); end
191 | if ~isequal(classUnderlying(A.val),'single'); error(message); end
192 | if A.nrows < 0; error(message); end
193 | if A.ncols < 0; error(message); end
194 | if A.nrows == intmax('int32'); error(message); end
195 | if A.ncols == intmax('int32'); error(message); end
196 | if ~iscolumn(A.row); error(message); end
197 | if ~iscolumn(A.col); error(message); end
198 | if ~iscolumn(A.val); error(message); end
199 | if numel(A.col) ~= numel(A.val); error(message); end
200 | if numel(A.row) ~= A.nrows+1; error(message); end
201 | if A.row(1) ~= 1; error(message); end
202 | if A.row(end) ~= numel(A.val)+1; error(message); end
203 | if A.trans~=0 && A.trans~=1 && A.trans~=2; error(message); end
204 | if isreal(A) && A.trans==2; error(message); end
205 |
206 | % slow checks
207 | if numel(A.val) > 0
208 | if min(A.col) < 1; error(message); end
209 | if max(A.col) > A.ncols; error(message); end
210 | rowcol = gather([csr2coo(A.row,A.nrows) A.col]);
211 | if ~issorted(rowcol,'rows'); error(message); end
212 | end
213 |
214 | end
215 |
216 | %% overloaded functions
217 |
218 | % isreal
219 | function retval = isreal(A)
220 | retval = isreal(A.val);
221 | end
222 |
223 | % real
224 | function A = real(A)
225 | A.val = real(A.val);
226 | if A.trans==2; A.trans = 1; end
227 | A = drop_zeros(A);
228 | end
229 |
230 | % imag
231 | function A = imag(A)
232 | A.val = imag(A.val);
233 | if A.trans==2; A.trans = 1; end
234 | A = drop_zeros(A);
235 | end
236 |
237 | % abs
238 | function A = abs(A)
239 | A.val = abs(A.val);
240 | if A.trans==2; A.trans = 1; end
241 | end
242 |
243 | % angle
244 | function A = angle(A)
245 | A.val = angle(A.val);
246 | if A.trans==2; A.trans = 1; end
247 | A = drop_zeros(A);
248 | end
249 |
250 | % conj
251 | function A = conj(A)
252 | A.val = conj(A.val);
253 | end
254 |
255 | % sign
256 | function A = sign(A)
257 | A.val = sign(A.val);
258 | if A.trans==2; A.trans = 1; end
259 | end
260 |
261 | % complex
262 | function A = complex(A)
263 | A.val = complex(A.val);
264 | end
265 |
266 | % classUnderlying
267 | function str = classUnderlying(A)
268 | str = classUnderlying(A.val);
269 | end
270 |
271 | % gt (only support scalar)
272 | function A = gt(A,tol);
273 | if ~isscalar(tol)
274 | error('Non-scalar argument not supported.');
275 | end
276 | A.val = cast(A.val > tol,classUnderlying(A));
277 | if A.trans==2; A.trans = 1; end
278 | A = drop_zeros(A);
279 | end
280 |
281 | % lt (only support scalar)
282 | function A = lt(A,tol);
283 | if ~isscalar(tol)
284 | error('Non-scalar argument not supported.');
285 | end
286 | A.val = cast(A.val < tol,classUnderlying(A));
287 | if A.trans==2; A.trans = 1; end
288 | A = drop_zeros(A);
289 | end
290 |
291 | % eq (only support scalar)
292 | function A = eq(A,tol);
293 | if ~isscalar(tol)
294 | error('Non-scalar argument not supported.');
295 | end
296 | A.val = cast(A.val == tol,classUnderlying(A));
297 | if A.trans==2; A.trans = 1; end
298 | A = drop_zeros(A);
299 | end
300 |
301 | % nnz
302 | function retval = nnz(A)
303 | retval = nnz(A.val);
304 | end
305 |
306 | % length
307 | function retval = length(A)
308 | retval = max(size(A));
309 | end
310 |
311 | % nzmax
312 | function retval = nzmax(A)
313 | retval = numel(A.val);
314 | end
315 |
316 | % mean: only A and DIM args are supported
317 | function retval = mean(A,DIM)
318 | if nargin==1; DIM = 1; end
319 | retval = sum(A,DIM) / size(A,DIM);
320 | end
321 |
322 | % nonzeros
323 | function val = nonzeros(A)
324 | val = nonzeros(A.val);
325 | if A.trans==2
326 | val = conj(val);
327 | end
328 | end
329 |
330 | % sum: only A and DIM args are supported
331 | function retval = sum(A,DIM)
332 | if nargin==1
333 | DIM = 1;
334 | else
335 | validateattributes(DIM,{'numeric'},{'integer','positive'},'','DIM')
336 | end
337 | if numel(A)==0
338 | retval = sum(zeros(size(A)),DIM);
339 | retval = gpuSparse(retval);
340 | else
341 | switch DIM
342 | case 1; retval = (A' * ones(size(A,1),1,'like',A.val))';
343 | case 2; retval = A * ones(size(A,2),1,'like',A.val);
344 | otherwise; retval = A;
345 | end
346 | end
347 | end
348 |
349 | % norm: support same types as sparse
350 | function retval = norm(A,p);
351 | if nargin<2; p = 2; end
352 | if isvector(A)
353 | retval = norm(A.val,p);
354 | else
355 | if isequal(p,2)
356 | error('gpuSparse norm(A,2) is not supported.');
357 | elseif isequal(p,1)
358 | retval = max(sum(abs(A),1));
359 | elseif isequal(p,Inf)
360 | retval = max(sum(abs(A),2));
361 | elseif isequal(p,'fro');
362 | retval = norm(A.val);
363 | else
364 | error('The only matrix norms supported are 1, 2, inf, and ''fro''.');
365 | end
366 | end
367 | end
368 |
369 | % max: support for max(A,[],2) only
370 | function retval = max(A,Y,DIM);
371 | if nargin ~= 3 || ~isempty(Y) || ~isequal(DIM,2)
372 | error('Only 3 argument form supported: max(A,[],2).');
373 | end
374 | if A.trans
375 | error('Transpose max not supported - try full_transpose(A).')
376 | end
377 |
378 | % do it on CPU to reduce transfer overhead
379 | row = gather(A.row);
380 | val = gather(A.val);
381 | retval = zeros(A.nrows,1,'like',val);
382 |
383 | for j = 1:A.nrows
384 | k = row(j):row(j+1)-1;
385 | if ~isempty(k)
386 | retval(j) = max(val(k));
387 | end
388 | end
389 | end
390 |
391 | % size
392 | function varargout = size(A,DIM)
393 | if A.trans==0
394 | m = double(A.nrows);
395 | n = double(A.ncols);
396 | else
397 | n = double(A.nrows);
398 | m = double(A.ncols);
399 | end
400 | if nargin>1
401 | if nargout>1
402 | error('too many output arguments.');
403 | end
404 | if ~isscalar(DIM) || DIM<=0 || mod(DIM,1)
405 | error('Dimension argument must be a positive integer scalar.')
406 | elseif DIM==1
407 | varargout{1} = m;
408 | elseif DIM==2
409 | varargout{1} = n;
410 | else
411 | varargout{1} = 1;
412 | end
413 | else
414 | if nargout==0 || nargout==1
415 | varargout{1} = [m n];
416 | else
417 | varargout{1} = m;
418 | varargout{2} = n;
419 | for k = 3:nargout
420 | varargout{k} = 1;
421 | end
422 | end
423 | end
424 | end
425 |
426 | % find: returns indices on the GPU (not efficient, mainly for debugging)
427 | function varargout = find(A)
428 | if nargin>1; error('only 1 input argument supported'); end
429 | if nargout>3; error('too many output arguments'); end
430 |
431 | % COO format on GPU
432 | i = csr2coo(A.row,A.nrows);
433 | j = A.col;
434 | v = A.val;
435 |
436 | % remove explicit zeros
437 | nz = (v ~= 0);
438 | i = i(nz);
439 | j = j(nz);
440 | v = v(nz);
441 |
442 | % MATLAB style, double precision, sorted columns
443 | if A.trans
444 | [i j] = deal(j,i);
445 | else
446 | [~,k] = sortrows([j i]);
447 | i = i(k);
448 | j = j(k);
449 | end
450 | i = double(i);
451 | j = double(j);
452 |
453 | if nargout==0 || nargout==1
454 | varargout{1} = sub2ind(size(A),i,j);
455 | else
456 | varargout{1} = i;
457 | varargout{2} = j;
458 | end
459 | if nargout==3
460 | if A.trans==0; varargout{3} = v(k); end
461 | if A.trans==1; varargout{3} = v; end
462 | if A.trans==2; varargout{3} = conj(v); end
463 | end
464 | end
465 |
466 | % add: C = A+B
467 | function C = plus(A,B)
468 | C = geam(A,B,1,1);
469 | end
470 |
471 | % minus: C = A-B
472 | function C = minus(A,B)
473 | C = geam(A,B,1,-1);
474 | end
475 |
476 | % csrgeam: C = a*A + b*B
477 | function C = geam(A,B,a,b)
478 | A = gpuSparse(A);
479 | B = gpuSparse(B);
480 | if ~isequal(size(A),size(B))
481 | error('Matrices must be the same size.')
482 | end
483 | if ~isreal(A) || ~isreal(B)
484 | error('Complex addition not supported at the moment.')
485 | end
486 | if A.trans ~= B.trans
487 | error('Matrix addition with lazy transpose not fully supported.')
488 | end
489 | validateattributes(a,{'numeric'},{'real','scalar','finite'},'','a');
490 | validateattributes(b,{'numeric'},{'real','scalar','finite'},'','b');
491 | if A.trans
492 | [n m] = size(A);
493 | else
494 | [m n] = size(A);
495 | end
496 | C = gpuSparse(m,n);
497 | C.trans = A.trans;
498 | [C.row C.col C.val] = csrgeam(A.row,A.col,A.val,m,n,B.row,B.col,B.val,a,b);
499 | end
500 |
501 | % mtimes: A*x (or x*A for scalar x)
502 | function y = mtimes(A,x)
503 | if isa(x,'gpuSparse') && ~isa(A,'gpuSparse')
504 | [A x] = deal(x,A);
505 | end
506 | if ~isnumeric(x) && islogical(x)
507 | error('Argument x must be numeric (%s not supported).',class(x))
508 | elseif isscalar(x) && ~iscolumn(A)
509 | y = A;
510 | y.val = y.val * x;
511 | elseif isvector(x)
512 | if isreal(A)
513 | y = csrmv(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,x);
514 | else
515 | y = csrmv(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,complex(x));
516 | end
517 | elseif ismatrix(x)
518 | if isreal(A)
519 | y = csrmm(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,x);
520 | else
521 | y = csrmm(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,complex(x));
522 | end
523 | end
524 | end
525 |
526 | % times: A.*x or x.*A (scalar x only)
527 | function A = times(A,x)
528 | if isa(x,'gpuSparse') && ~isa(A,'gpuSparse')
529 | [A x] = deal(x,A);
530 | end
531 | if ~isnumeric(x) && ~islogical(x) && ~isempty(x)
532 | error('Argument x must be numeric (%s not supported).',class(x))
533 | elseif isscalar(x) && isfinite(x)
534 | A.val = A.val .* x;
535 | else
536 | error('Multiplication only supported for finite scalars.')
537 | end
538 | end
539 |
540 | % divide: A./x
541 | function A = rdivide(A,x)
542 | if isa(x,'gpuSparse')
543 | error('Division by gpuSparse array not supported.');
544 | end
545 | A = times(A,1./x);
546 | end
547 |
548 | % divide: A/x (scalar x only)
549 | function A = mrdivide(A,x)
550 | A = A./x;
551 | end
552 |
553 | % power: A.^x
554 | function A = power(A,x)
555 | if isa(x,'gpuSparse') || ~isscalar(x)
556 | error('Power A.^x only supported for gpuSparse A and scalar x.');
557 | end
558 | A.val = A.val.^x;
559 | end
560 |
561 | % full transpose: A.'
562 | function AT = full_transpose(A)
563 | if A.trans
564 | AT = A;
565 | AT.trans = 0;
566 | if ~isreal(A) && A.trans==2
567 | AT.val = conj(AT.val);
568 | end
569 | else
570 | [m n] = size(A);
571 | AT = gpuSparse([],[],[],n,m,nnz(A));
572 |
573 | if nnz(A) % cuSPARSE breaks if nnz==0 so avoid call
574 | if 1 % older cuSPARSE used excessive memory - seems OK now
575 | [AT.col AT.row AT.val] = csr2csc(A.row,A.col,A.val,m,n);
576 | else % cpu version
577 | row = gather(A.row);
578 | col = gather(A.col);
579 | val = gather(A.val);
580 | [col row val] = csr2csc_cpu(row,col,val,m,n);
581 | AT.col = gpuArray(col);
582 | AT.row = gpuArray(row);
583 | AT.val = gpuArray(val);
584 | end
585 | end
586 | end
587 | end
588 |
589 | % full ctranspose: A'
590 | function AT = full_ctranspose(A)
591 | if A.trans
592 | AT = A;
593 | AT.trans = 0;
594 | else
595 | AT = full_transpose(A);
596 | end
597 | if ~isreal(A) && A.trans~=2
598 | AT.val = conj(AT.val);
599 | end
600 | end
601 |
602 | % lazy transpose (flag): A.'
603 | function AT = transpose(A)
604 | AT = A; % lazy copy
605 | switch A.trans
606 | case 0; AT.trans = 1;
607 | case 1; AT.trans = 0;
608 | case 2; AT.trans = 0; AT.val = conj(AT.val);
609 | end
610 | end
611 |
612 | % lazy transpose (flag): A'
613 | function AT = ctranspose(A)
614 | AT = A; % lazy copy
615 | switch A.trans
616 | case 0; if isreal(A); AT.trans = 1; else; AT.trans = 2; end
617 | case 1; AT.trans = 0; if ~isreal(A); AT.val = conj(AT.val); end
618 | case 2; AT.trans = 0;
619 | end
620 | end
621 |
622 | % remove zeros from sparse matrix
623 | function A = drop_zeros(A,tol)
624 | if nargin<2
625 | nz = (A.val ~= 0);
626 | else
627 | validateattributes(tol,{'numeric'},{'nonnegative','scalar'},'','tol');
628 | nz = abs(A.val) >= tol;
629 | end
630 | if ~all(nz)
631 | A.row = csr2coo(A.row,A.nrows);
632 | A.row = A.row(nz);
633 | A.row = coo2csr(A.row,A.nrows);
634 | A.col = A.col(nz);
635 | A.val = A.val(nz);
636 | end
637 | end
638 |
639 | % sparse: returns sparse matrix on GPU
640 | function A_sp = sparse(A)
641 | [m n] = size(A);
642 | i = csr2coo(A.row,A.nrows);
643 | j = A.col;
644 | v = double(A.val);
645 | switch A.trans
646 | % int32 indices ok (2020a)
647 | case 0; A_sp = sparse(i,j,v,m,n);
648 | case 1; A_sp = sparse(j,i,v,m,n);
649 | case 2; A_sp = sparse(j,i,conj(v),m,n);
650 | end
651 | end
652 |
653 | % gather: returns sparse matrix on CPU - gather(sparse(A)) is faster but memory intensive
654 | function A_sp = gather(A)
655 | [m n] = size(A);
656 | i = gather(csr2coo(A.row,A.nrows));
657 | j = gather(A.col);
658 | v = gather(double(A.val)); % double for sparse
659 | switch A.trans
660 | % sparse int32 indices ok (2020a)
661 | case 0; A_sp = sparse(i,j,v,m,n);
662 | case 1; A_sp = sparse(j,i,v,m,n);
663 | case 2; A_sp = sparse(j,i,conj(v),m,n);
664 | end
665 | end
666 |
667 | % full: returns full matrix on CPU (not efficient, mainly for debugging)
668 | function A_f = full(A)
669 | i = gather(csr2coo(A.row,A.nrows));
670 | j = gather(A.col);
671 | v = gather(A.val);
672 | switch A.trans
673 | % sparse int32 indices ok (2020a)
674 | case 0; k = sub2ind(size(A),i,j);
675 | case 1; k = sub2ind(size(A),j,i);
676 | case 2; k = sub2ind(size(A),j,i); v = conj(v);
677 | end
678 | A_f = zeros(size(A),'like',v);
679 | A_f(k) = v;
680 | end
681 |
682 | % numel - should it be 1 object or prod(size(A)) elements?
683 | function retval = numel(A)
684 | retval = prod(size(A));
685 | end
686 |
687 | % cat
688 | function C = cat(dim,A,B)
689 | switch dim
690 | case 1; C = vertcat(A,B);
691 | case 2; C = horzcat(A,B);
692 | otherwise; error('Concatenation only supported for dim=1 or 2.');
693 | end
694 | end
695 |
696 | % vertcat
697 | function C = vertcat(A,B)
698 | if ~isa(B,'gpuSparse')
699 | error('Concatenation only supported for gpuSparse.');
700 | end
701 | if A.trans || B.trans
702 | error('Concatenation not supported with transpose.');
703 | end
704 | if size(A,2)~=size(B,2)
705 | error('Concatenation requires number of cols be equal.');
706 | end
707 | C = gpuSparse(size(A,1)+size(B,1),size(A,2));
708 | C.row = [A.row;B.row(2:end)+numel(A.val)];
709 | C.col = [A.col;B.col];
710 | C.val = [A.val;B.val];
711 | end
712 |
713 | % horzcat - possible to avoid csr2coo calls?
714 | function C = horzcat(A,B)
715 | if ~isa(B,'gpuSparse')
716 | error('Concatenation only supported for gpuSparse.');
717 | end
718 | if A.trans || B.trans
719 | error('Concatenation not supported with transpose.');
720 | end
721 | if size(A,1)~=size(B,1)
722 | error('Concatenation requires number of rows be equal.');
723 | end
724 | i = [csr2coo(A.row,A.nrows);csr2coo(B.row,B.nrows)];
725 | j = [A.col;B.col+size(A,2)];
726 | v = [A.val;B.val];
727 | C = gpuSparse(i,j,v,size(A,1),size(A,2)+size(B,2));
728 | end
729 |
730 | % Mathworks suggested this to help fix . indexing
731 | function retval = numArgumentsFromSubscript(A, s, ic)
732 | retval = builtin('numArgumentsFromSubscript', A, s, ic);
733 | end
734 |
735 | % the following are hard - don't implement
736 | function retval = subsref(A,s)
737 | if isequal(s.type,'.')
738 | retval = A.(s.subs);
739 | else
740 | error('subsref not implemented.');
741 | end
742 | end
743 | function retval = subsasgn(A,s,b)
744 | error('subsasgn not implemented.');
745 | end
746 | function A = reshape(A,m,n)
747 | error('reshape not implemented.');
748 | end
749 | end
750 | end
751 |
--------------------------------------------------------------------------------
/private/coo2csr.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE format converter (coo2csr).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | // MATLAB related
20 | #include "mex.h"
21 | #include "gpu/mxGPUArray.h"
22 | #include "mxShowCriticalErrorMessage.h"
23 |
24 | // Input Arguments
25 | #define ROW prhs[0]
26 | #define NROWS prhs[1]
27 |
28 | // Output Arguments
29 | #define ROW_CSR plhs[0]
30 |
31 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
32 | {
33 | // Checks
34 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
35 | if (nrhs != 2) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
36 |
37 | // Initialize the MathWorks GPU API
38 | mxInitGPU();
39 |
40 | // Create Matlab pointers on the GPU
41 | mxGPUArray const *row = mxGPUCreateFromMxArray(ROW);
42 |
43 | // Checks - note rows must be in COO (uncompressed) format
44 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
45 | if (mxGPUGetClassID(row) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW argument is not int32");
46 |
47 | mwSize nrows = mxGetScalar(NROWS);
48 | mwSize nnz = mxGPUGetNumberOfElements(row);
49 |
50 | // Create space for output vector
51 | const mwSize ndim = 1;
52 | mwSize dims[ndim] = {nrows+1};
53 | mxGPUArray *row_csr = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
54 | if (row_csr==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
55 |
56 | // Get handle to the CUBLAS context
57 | cublasHandle_t cublasHandle = 0;
58 | cublasStatus_t cublasStatus;
59 | cublasStatus = cublasCreate(&cublasHandle);
60 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
61 |
62 | // Get handle to the CUSPARSE context
63 | cusparseHandle_t cusparseHandle = 0;
64 | cusparseStatus_t cusparseStatus;
65 | cusparseStatus = cusparseCreate(&cusparseHandle);
66 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
67 | cusparseMatDescr_t descr = 0;
68 | cusparseStatus = cusparseCreateMatDescr(&descr);
69 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
70 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
71 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
72 |
73 | // Convert from matlab pointers to native pointers
74 | const int * const d_row = (int*)mxGPUGetDataReadOnly(row);
75 | int *d_row_csr = (int*)mxGPUGetData(row_csr);
76 | char message[128] = {'\0'};
77 | int *buffer = NULL;
78 |
79 | // Call coo2csr - returns uninitialized when nnz==0 so need to handle separately
80 | if (nnz == 0)
81 | {
82 | buffer = (int *)mxMalloc((nrows+1)*sizeof(int));
83 | if (buffer == NULL) mxShowCriticalErrorMessage("mxMalloc failed");
84 | for (int j=0; j<=nrows; j++) buffer[j] = CUSPARSE_INDEX_BASE_ONE;
85 | cudaError_t cudaStatus = cudaMemcpy(d_row_csr, buffer, (nrows+1)*sizeof(int), cudaMemcpyHostToDevice);
86 | mxFree(buffer);
87 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus);
88 | }
89 | else
90 | {
91 | cusparseStatus = cusparseXcoo2csr(cusparseHandle, d_row, nnz, nrows, d_row_csr, CUSPARSE_INDEX_BASE_ONE);
92 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoo2csr failed",cusparseStatus);
93 | }
94 |
95 | // Return result
96 | ROW_CSR = mxGPUCreateMxArrayOnGPU(row_csr);
97 |
98 | // Clean up
99 | cusparseDestroyMatDescr(descr);
100 | cusparseDestroy(cusparseHandle);
101 | cublasDestroy(cublasHandle);
102 | mxGPUDestroyGPUArray(row);
103 | mxGPUDestroyGPUArray(row_csr);
104 |
105 | return;
106 | }
107 |
--------------------------------------------------------------------------------
/private/coosortByRow.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE coo sort by row (coosortByRow).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | // MATLAB related
20 | #include "mex.h"
21 | #include "gpu/mxGPUArray.h"
22 | #include "mxShowCriticalErrorMessage.h"
23 |
24 | // Input Arguments
25 | #define ROW prhs[0]
26 | #define COL prhs[1]
27 | #define VAL prhs[2]
28 | #define NROWS prhs[3]
29 | #define NCOLS prhs[4]
30 |
31 | // Output Arguments
32 | #define ROW_SORT plhs[0]
33 | #define COL_SORT plhs[1]
34 | #define VAL_SORT plhs[2]
35 |
36 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
37 | {
38 | // Checks
39 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
40 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
41 |
42 | // Initialize the MathWorks GPU API
43 | mxInitGPU();
44 |
45 | // Create Matlab pointers on the GPU
46 | mxGPUArray const *row = mxGPUCreateFromMxArray(ROW);
47 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL);
48 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL);
49 |
50 | // Checks - note vectors must be in COO (uncompressed) format
51 | int nnz = mxGPUGetNumberOfElements(val);
52 | if (mxGPUGetNumberOfElements(row) != nnz) mxShowCriticalErrorMessage("ROW and VAL argument length mismatch");
53 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL and VAL argument length mismatch");
54 |
55 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
56 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
57 |
58 | if (mxGPUGetClassID(row) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW argument is not int32");
59 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
60 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
61 |
62 | int nrows = (int)mxGetScalar(NROWS);
63 | int ncols = (int)mxGetScalar(NCOLS);
64 |
65 | // Create space for output vectors
66 | const mwSize ndim = 1;
67 | mwSize dims[ndim];
68 |
69 | dims[0] = nnz;
70 | mxGPUArray *row_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
71 | if (row_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
72 |
73 | mxGPUArray *col_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
74 | if (col_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
75 |
76 | mxComplexity ccx = mxGPUGetComplexity(val);
77 | mxGPUArray *val_sort = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES);
78 | if (val_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
79 |
80 | // Get handle to the CUBLAS context
81 | cublasHandle_t cublasHandle = 0;
82 | cublasStatus_t cublasStatus;
83 | cublasStatus = cublasCreate(&cublasHandle);
84 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
85 |
86 | // Get handle to the CUSPARSE context
87 | cudaError_t cudaStatus;
88 | cusparseStatus_t cusparseStatus;
89 | cusparseHandle_t cusparseHandle = 0;
90 | cusparseStatus = cusparseCreate(&cusparseHandle);
91 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
92 | cusparseMatDescr_t descr = 0;
93 | cusparseStatus = cusparseCreateMatDescr(&descr);
94 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
95 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
96 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
97 |
98 | // Convert from matlab pointers to native pointers
99 | const int * const d_row = (int*)mxGPUGetDataReadOnly(row);
100 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col);
101 | int *d_col_sort = (int*)mxGPUGetData(col_sort);
102 | int *d_row_sort = (int*)mxGPUGetData(row_sort);
103 |
104 | // Since sort is in-place, copy the read-only vectors to the read-write ones
105 | cudaStatus = cudaMemcpy((void *)d_row_sort, d_row, nnz*sizeof(int), cudaMemcpyDeviceToDevice);
106 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus);
107 |
108 | cudaStatus = cudaMemcpy((void *)d_col_sort, d_col, nnz*sizeof(int), cudaMemcpyDeviceToDevice);
109 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus);
110 |
111 | if (ccx == mxREAL)
112 | {
113 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val);
114 | float *d_val_sort = (float*)mxGPUGetData(val_sort);
115 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice);
116 | }
117 | else
118 | {
119 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
120 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort);
121 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(cuFloatComplex), cudaMemcpyDeviceToDevice);
122 | }
123 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus);
124 |
125 | // Sort by rows
126 | int *P = NULL;
127 | void *pBuffer = NULL;
128 | size_t pBufferSizeInBytes = 0;
129 |
130 | if (nnz > 0)
131 | {
132 | // step 1: allocate buffer
133 | cusparseStatus = cusparseXcoosort_bufferSizeExt(cusparseHandle, nrows, ncols, nnz, d_row, d_col, &pBufferSizeInBytes);
134 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoosort_bufferSizeExt failed",cusparseStatus);
135 |
136 | cudaStatus = cudaMalloc( &pBuffer, sizeof(char)*pBufferSizeInBytes);
137 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus);
138 |
139 | // step 2: setup permutation vector P to identity
140 | cudaStatus = cudaMalloc( &P, sizeof(int)*nnz);
141 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus);
142 |
143 | cusparseStatus = cusparseCreateIdentityPermutation(cusparseHandle, nnz, P);
144 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseCreateIdentityPermutation failed",cusparseStatus);
145 |
146 | // step 3: sort COO format by Row
147 | cusparseStatus = cusparseXcoosortByRow(cusparseHandle, nrows, ncols, nnz, d_row_sort, d_col_sort, P, pBuffer);
148 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoosortByRow failed",cusparseStatus);
149 |
150 | // step 4: gather sorted cooVals
151 | if (ccx == mxREAL)
152 | {
153 | float *d_val = (float*)mxGPUGetDataReadOnly(val);
154 | float *d_val_sort = (float*)mxGPUGetData(val_sort);
155 | #if CUDART_VERSION >= 11000
156 | cusparseHandle_t handle = NULL;
157 | cusparseDnVecDescr_t vec_values;
158 | cusparseSpVecDescr_t vec_permutation;
159 | cusparseCreate(&handle);
160 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_R_32F);
161 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); // MUST USE BASE_ZERO
162 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation);
163 | cusparseDestroyDnVec(vec_values);
164 | cusparseDestroySpVec(vec_permutation);
165 | cusparseDestroy(handle);
166 | #else
167 | cusparseStatus = cusparseSgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO
168 | #endif
169 | }
170 | else
171 | {
172 | cuFloatComplex *d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
173 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort);
174 | #if CUDART_VERSION >= 11000
175 | cusparseHandle_t handle = NULL;
176 | cusparseDnVecDescr_t vec_values;
177 | cusparseSpVecDescr_t vec_permutation;
178 | cusparseCreate(&handle);
179 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_C_32F);
180 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F); // MUST USE BASE_ZERO
181 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation);
182 | cusparseDestroyDnVec(vec_values);
183 | cusparseDestroySpVec(vec_permutation);
184 | cusparseDestroy(handle);
185 | #else
186 | cusparseStatus = cusparseCgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO
187 | #endif
188 | }
189 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseSgthr or cusparseCgthr failed",cusparseStatus);
190 |
191 | }
192 |
193 | // Return result
194 | ROW_SORT = mxGPUCreateMxArrayOnGPU(row_sort);
195 | COL_SORT = mxGPUCreateMxArrayOnGPU(col_sort);
196 | VAL_SORT = mxGPUCreateMxArrayOnGPU(val_sort);
197 |
198 | // Make sure operations are finished before deleting
199 | //cudaDeviceSynchronize();
200 |
201 | // Clean up
202 | cusparseDestroyMatDescr(descr);
203 | cusparseDestroy(cusparseHandle);
204 | cublasDestroy(cublasHandle);
205 | mxGPUDestroyGPUArray(row);
206 | mxGPUDestroyGPUArray(row_sort);
207 | mxGPUDestroyGPUArray(col);
208 | mxGPUDestroyGPUArray(col_sort);
209 | mxGPUDestroyGPUArray(val);
210 | mxGPUDestroyGPUArray(val_sort);
211 | if (pBuffer) cudaFree(pBuffer);
212 | if (P) cudaFree(P);
213 |
214 | return;
215 | }
216 |
--------------------------------------------------------------------------------
/private/csr2coo.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE format converter (csr2coo).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | // MATLAB related
20 | #include "mex.h"
21 | #include "gpu/mxGPUArray.h"
22 | #include "mxShowCriticalErrorMessage.h"
23 |
24 | // Input Arguments
25 | #define ROW_CSR prhs[0]
26 | #define NROWS prhs[1]
27 |
28 | // Output Arguments
29 | #define ROW plhs[0]
30 |
31 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
32 | {
33 | // Checks
34 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
35 | if (nrhs != 2) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
36 |
37 | // Initialize the MathWorks GPU API
38 | mxInitGPU();
39 |
40 | // Create Matlab pointers on the GPU
41 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR);
42 |
43 | // Checks - note rows must be in CSR format
44 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
45 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
46 | mwSize nrows = mxGetScalar(NROWS);
47 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument is wrong size",mxGPUGetNumberOfElements(row_csr));
48 |
49 | // Get handle to the CUBLAS context
50 | cublasHandle_t cublasHandle = 0;
51 | cublasStatus_t cublasStatus;
52 | cublasStatus = cublasCreate(&cublasHandle);
53 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
54 |
55 | // Get handle to the CUSPARSE context
56 | cusparseStatus_t status;
57 | cusparseHandle_t cusparseHandle = 0;
58 | cusparseStatus_t cusparseStatus;
59 | cusparseStatus = cusparseCreate(&cusparseHandle);
60 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
61 | cusparseMatDescr_t descr = 0;
62 | cusparseStatus = cusparseCreateMatDescr(&descr);
63 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
64 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
65 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
66 |
67 | // Convert from matlab pointers to native pointers
68 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr);
69 |
70 | // Now we can access the arrays, we can do some checks
71 | int base;
72 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
73 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
74 |
75 | int nnz;
76 | cudaMemcpy(&nnz, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
77 | nnz -= CUSPARSE_INDEX_BASE_ONE;
78 | if (nnz < 0) mxShowCriticalErrorMessage("ROW_CSR returned negative nnz");
79 |
80 | // Create space for output vector
81 | const mwSize ndim = 1;
82 | mwSize dims[ndim] = {(mwSize)nnz}; // we checked that nnz is >=0 so cast is safe
83 | mxGPUArray *row = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
84 | if (row==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
85 |
86 | // Convert from matlab pointers to native pointers
87 | int *d_row = (int*)mxGPUGetData(row);
88 |
89 | // Call csr2coo
90 | status = cusparseXcsr2coo(cusparseHandle, d_row_csr, nnz, nrows, d_row, CUSPARSE_INDEX_BASE_ONE);
91 |
92 | if (status == CUSPARSE_STATUS_SUCCESS)
93 | {
94 | // Return result
95 | ROW = mxGPUCreateMxArrayOnGPU(row);
96 |
97 | // Make sure operations are finished before deleting
98 | //cudaDeviceSynchronize();
99 | }
100 |
101 | // Clean up
102 | cusparseDestroyMatDescr(descr);
103 | cusparseDestroy(cusparseHandle);
104 | cublasDestroy(cublasHandle);
105 | mxGPUDestroyGPUArray(row);
106 | mxGPUDestroyGPUArray(row_csr);
107 |
108 | // Failure
109 | if (status != CUSPARSE_STATUS_SUCCESS)
110 | {
111 | mxShowCriticalErrorMessage("Operation cusparseXcsr2coo failed",status);
112 | }
113 |
114 | return;
115 | }
116 |
--------------------------------------------------------------------------------
/private/csr2csc.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE format converter (csr2csc) to do transpose.
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | #if CUDART_VERSION >= 11000
20 | #include "wrappers_to_cuda_11.h"
21 | #endif
22 |
23 | // MATLAB related
24 | #include "mex.h"
25 | #include "gpu/mxGPUArray.h"
26 | #include "mxShowCriticalErrorMessage.h"
27 |
28 | // Input Arguments
29 | #define ROW_CSR prhs[0] // CSR format
30 | #define COL prhs[1]
31 | #define VAL prhs[2]
32 | #define NROWS prhs[3]
33 | #define NCOLS prhs[4]
34 |
35 | // Output Arguments
36 | #define ROW plhs[0]
37 | #define COL_CSC plhs[1] // CSC format
38 | #define VAL_CSC plhs[2]
39 |
40 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
41 | {
42 | // Checks
43 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
44 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
45 |
46 | // Initialize the MathWorks GPU API
47 | mxInitGPU();
48 |
49 | // Create Matlab pointers on the GPU
50 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR);
51 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL);
52 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL);
53 |
54 | // Checks - note rows must be in CSR format
55 | int nnz = mxGPUGetNumberOfElements(val);
56 |
57 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
58 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
59 |
60 | int nrows = (int)mxGetScalar(NROWS);
61 | int ncols = (int)mxGetScalar(NCOLS);
62 |
63 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
64 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
65 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
66 |
67 | // Create space for output vectors
68 | const mwSize ndim = 1;
69 | mwSize dims[ndim];
70 |
71 | dims[0] = ncols+1;
72 | mxGPUArray *col_csc = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
73 | if (col_csc==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
74 |
75 | dims[0] = nnz;
76 | mxGPUArray *row = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
77 | if (row==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
78 |
79 | mxComplexity ccx = mxGPUGetComplexity(val);
80 | mxGPUArray *val_csc = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES);
81 | if (val_csc==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
82 |
83 | // Get handle to the CUBLAS context
84 | cublasHandle_t cublasHandle = 0;
85 | cublasStatus_t cublasStatus;
86 | cublasStatus = cublasCreate(&cublasHandle);
87 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
88 |
89 | // Get handle to the CUSPARSE context
90 | cusparseHandle_t cusparseHandle = 0;
91 | cusparseStatus_t cusparseStatus;
92 | cusparseStatus = cusparseCreate(&cusparseHandle);
93 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
94 | cusparseMatDescr_t descr = 0;
95 | cusparseStatus = cusparseCreateMatDescr(&descr);
96 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
97 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
98 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
99 |
100 | // Convert from matlab pointers to native pointers
101 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr);
102 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col);
103 |
104 | int *d_row = (int*)mxGPUGetData(row);
105 | int *d_col_csc = (int*)mxGPUGetData(col_csc);
106 |
107 | // Now we can access row_csr[] array
108 | int base;
109 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
110 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
111 |
112 | int nnz_check;
113 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
114 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
115 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check);
116 |
117 | // Convert from CSR to CSC
118 | cusparseStatus_t status;
119 |
120 | if (ccx == mxREAL)
121 | {
122 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val);
123 | float *d_val_csc = (float*)mxGPUGetData(val_csc);
124 | #if CUDART_VERSION >= 11000
125 | status = cusparseXcsr2csc_wrapper(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE);
126 | #else
127 | status = cusparseScsr2csc(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE);
128 | #endif
129 | }
130 | else
131 | {
132 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
133 | cuFloatComplex *d_val_csc = (cuFloatComplex*)mxGPUGetData(val_csc);
134 | #if CUDART_VERSION >= 11000
135 | status = cusparseXcsr2csc_wrapper(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE);
136 | #else
137 | status = cusparseCcsr2csc(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE);
138 | #endif
139 | }
140 |
141 | if (status == CUSPARSE_STATUS_SUCCESS)
142 | {
143 | // Return result
144 | ROW = mxGPUCreateMxArrayOnGPU(row);
145 | COL_CSC = mxGPUCreateMxArrayOnGPU(col_csc);
146 | VAL_CSC = mxGPUCreateMxArrayOnGPU(val_csc);
147 |
148 | // Make sure operations are finished before deleting
149 | //cudaDeviceSynchronize();
150 | }
151 |
152 | // Clean up
153 | cusparseDestroyMatDescr(descr);
154 | cusparseDestroy(cusparseHandle);
155 | cublasDestroy(cublasHandle);
156 | mxGPUDestroyGPUArray(val);
157 | mxGPUDestroyGPUArray(col);
158 | mxGPUDestroyGPUArray(row_csr);
159 | mxGPUDestroyGPUArray(val_csc);
160 | mxGPUDestroyGPUArray(col_csc);
161 | mxGPUDestroyGPUArray(row);
162 |
163 | // Failure
164 | if (status != CUSPARSE_STATUS_SUCCESS)
165 | {
166 | mxShowCriticalErrorMessage("Operation cusparseScsr2csc or cusparseCcsr2csc failed",status);
167 | }
168 |
169 | return;
170 | }
171 |
172 |
173 |
--------------------------------------------------------------------------------
/private/csr2csc_cpu.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to C code format converter (csr2csc) to do transpose.
3 | //
4 | // Inspired by:
5 | // http://www.dgate.org/~brg/files/dis/smvm/frontend/matrix_io.c
6 | //
7 | template <typename T> // template val_real to accept float or mxComplex for MX_HAS_INTERLEAVED_COMPLEX
8 | void csr2csc(const int nrows, const int ncols, const int *row_csr, const int *col, const T *val_real, const float *val_imag,
9 | int *row, int *col_csc, T *val_csc_real, float *val_csc_imag)
10 | {
11 | int i, j, k, l;
12 |
13 | // Base index (0 or 1) and number of nonzeros
14 | const int base = row_csr[0];
15 | const int nnz = row_csr[nrows]-base;
16 |
17 | // Determine column lengths
18 | for (i=0; i<=ncols; i++) col_csc[i] = 0;
19 | for (i=0; i<nnz; i++) col_csc[col[i]-base+1]++;
20 |
21 | // Cumulative sum gives the starting position of each column (0-based)
22 | for (i=0; i<ncols; i++) col_csc[i+1] += col_csc[i];
23 |
24 | // Fill in the output arrays
25 | for (i=0; i<nrows; i++)
26 | {
27 | for (j=row_csr[i]-base; j<row_csr[i+1]-base; j++)
28 | {
29 | k = col[j]-base;
30 | l = col_csc[k]++;
31 | row[l] = i+base;
32 | val_csc_real[l] = val_real[j];
33 | if (val_imag) val_csc_imag[l] = val_imag[j];
34 | }
35 | }
36 | for (i=ncols; i>0; i--) col_csc[i] = col_csc[i-1]+base;
37 |
38 | col_csc[0] = base;
39 | }
40 |
41 | // includes, system
42 | #include <stdio.h>
43 | #include <stdlib.h>
44 | #include <string.h>
45 |
46 | // MATLAB related
47 | #include "mex.h"
48 | #include "mxShowCriticalErrorMessage.h"
49 |
50 | // Input Arguments
51 | #define ROW_CSR prhs[0] // CSR format
52 | #define COL prhs[1]
53 | #define VAL prhs[2]
54 | #define NROWS prhs[3]
55 | #define NCOLS prhs[4]
56 |
57 | // Output Arguments
58 | #define ROW plhs[0]
59 | #define COL_CSC plhs[1] // CSC format
60 | #define VAL_CSC plhs[2]
61 |
62 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
63 | {
64 | // Checks
65 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
66 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
67 |
68 | // Checks - note rows must be in CSR format
69 | int nnz = mxGetNumberOfElements(VAL);
70 |
71 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
72 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
73 |
74 | int nrows = (int)mxGetScalar(NROWS);
75 | int ncols = (int)mxGetScalar(NCOLS);
76 |
77 | if (mxGetClassID(ROW_CSR) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
78 | if (mxGetClassID(COL) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
79 | if (mxGetClassID(VAL) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
80 |
81 | // Create space for output vectors
82 | const mwSize ndim = 1;
83 | mwSize dims[ndim];
84 |
85 | dims[0] = ncols+1;
86 | COL_CSC = mxCreateUninitNumericArray(ndim, dims, mxINT32_CLASS, mxREAL);
87 | if (COL_CSC==NULL) mxShowCriticalErrorMessage("mxCreateUninitNumericArray failed");
88 |
89 | dims[0] = nnz;
90 | ROW = mxCreateUninitNumericArray(ndim, dims, mxINT32_CLASS, mxREAL);
91 | if (ROW==NULL) mxShowCriticalErrorMessage("mxCreateUninitNumericArray failed");
92 |
93 | mxComplexity ccx = mxIsComplex(VAL) ? mxCOMPLEX : mxREAL;
94 | VAL_CSC = mxCreateUninitNumericArray(ndim, dims, mxSINGLE_CLASS, ccx);
95 | if (VAL_CSC==NULL) mxShowCriticalErrorMessage("mxCreateUninitNumericArray failed");
96 |
97 | // Pointers to the raw data
98 | const int * const row_csr = (int *)mxGetData(ROW_CSR);
99 | const int * const col = (int *)mxGetData(COL);
100 | void *val_real = mxGetData(VAL);
101 | #if MX_HAS_INTERLEAVED_COMPLEX
102 | void *val_imag = NULL;
103 | #else
104 | void *val_imag = mxGetImagData(VAL);
105 | #endif
106 |
107 | int *row = (int *)mxGetData(ROW);
108 | int *col_csc = (int *)mxGetData(COL_CSC);
109 | void *val_csc_real = mxGetData(VAL_CSC);
110 | #if MX_HAS_INTERLEAVED_COMPLEX
111 | void *val_csc_imag = NULL;
112 | #else
113 | void *val_csc_imag = mxGetImagData(VAL_CSC);
114 | #endif
115 |
116 | // Now we can access the arrays, we can do some checks
117 | const int base = row_csr[0];
118 | if (base != 1) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
119 |
120 | int nnz_check = row_csr[nrows];
121 | nnz_check -= 1;
122 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check);
123 |
124 | // Convert from CSR to CSC
125 | #if MX_HAS_INTERLEAVED_COMPLEX
126 | if(ccx == mxCOMPLEX)
127 | csr2csc(nrows, ncols, row_csr, col, (mxComplexSingle*)val_real, (float*)val_imag, row, col_csc, (mxComplexSingle*)val_csc_real, (float*)val_csc_imag);
128 | else
129 | csr2csc(nrows, ncols, row_csr, col, (float*)val_real, (float*)val_imag, row, col_csc, (float*)val_csc_real, (float*)val_csc_imag);
130 | #else
131 | csr2csc(nrows, ncols, row_csr, col, (float*)val_real, (float*)val_imag, row, col_csc, (float*)val_csc_real, (float*)val_csc_imag);
132 | #endif
133 |
134 |
135 | return;
136 | }
137 |
138 |
--------------------------------------------------------------------------------
/private/csrgeam.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE matrix-matrix addition (csrgeam).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | #if CUDART_VERSION >= 11000
20 | #include "wrappers_to_cuda_11.h"
21 | #endif
22 |
23 | // MATLAB related
24 | #include "mex.h"
25 | #include "gpu/mxGPUArray.h"
26 | #include "mxShowCriticalErrorMessage.h"
27 |
28 | // Input Arguments
29 | #define A_ROW_CSR prhs[0] // this in CSR format (returned from coo2csr.cu)
30 | #define A_COL prhs[1]
31 | #define A_VAL prhs[2]
32 | #define NROWS prhs[3]
33 | #define NCOLS prhs[4]
34 | #define B_ROW_CSR prhs[5] // this in CSR format (returned from coo2csr.cu)
35 | #define B_COL prhs[6]
36 | #define B_VAL prhs[7]
37 | #define ALPHA prhs[8] // scalar: C = ALPHA*A + BETA*B
38 | #define BETA prhs[9] // scalar: C = ALPHA*A + BETA*B
39 |
40 | // Output Arguments
41 | #define C_ROW_CSR plhs[0] // this in CSR format (returned from coo2csr.cu)
42 | #define C_COL plhs[1]
43 | #define C_VAL plhs[2]
44 |
45 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
46 | {
47 | // Checks
48 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
49 | if (nrhs != 10) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
50 |
51 | if(!mxIsGPUArray(A_ROW_CSR)) mxShowCriticalErrorMessage("A_ROW_CSR argument is not on GPU");
52 | if(!mxIsGPUArray(A_COL)) mxShowCriticalErrorMessage("A_COL argument is not on GPU");
53 | if(!mxIsGPUArray(A_VAL)) mxShowCriticalErrorMessage("A_VAL argument is not on GPU");
54 |
55 | if (!mxIsScalar(ALPHA)) mxShowCriticalErrorMessage("ALPHA argument must be a scalar");
56 | if (!mxIsScalar(BETA)) mxShowCriticalErrorMessage("BETA argument must be a scalar");
57 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
58 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
59 |
60 | // Initialize the MathWorks GPU API
61 | mxInitGPU();
62 |
63 | // Create Matlab pointers on the GPU
64 | mxGPUArray const *a_row_csr = mxGPUCreateFromMxArray(A_ROW_CSR);
65 | mxGPUArray const *a_col = mxGPUCreateFromMxArray(A_COL);
66 | mxGPUArray const *a_val = mxGPUCreateFromMxArray(A_VAL);
67 | mxGPUArray const *b_row_csr = mxGPUCreateFromMxArray(B_ROW_CSR);
68 | mxGPUArray const *b_col = mxGPUCreateFromMxArray(B_COL);
69 | mxGPUArray const *b_val = mxGPUCreateFromMxArray(B_VAL);
70 |
71 | // Check sizes - note rows are in CSR (compressed row) format
72 | int a_nnz = mxGPUGetNumberOfElements(a_val);
73 | int b_nnz = mxGPUGetNumberOfElements(b_val);
74 |
75 | mwSize nrows = mxGetScalar(NROWS);
76 | mwSize ncols = mxGetScalar(NCOLS);
77 |
78 | if (mxGPUGetNumberOfElements(a_row_csr) != nrows+1) mxShowCriticalErrorMessage("A_ROW_CSR argument wrong size",mxGPUGetNumberOfElements(a_row_csr));
79 | if (mxGPUGetNumberOfElements(a_col) != a_nnz) mxShowCriticalErrorMessage("A_COL argument wrong size",mxGPUGetNumberOfElements(a_col));
80 |
81 | if (mxGPUGetNumberOfElements(b_row_csr) != nrows+1) mxShowCriticalErrorMessage("B_ROW_CSR argument wrong size",mxGPUGetNumberOfElements(b_row_csr));
82 | if (mxGPUGetNumberOfElements(b_col) != b_nnz) mxShowCriticalErrorMessage("B_COL argument wrong size",mxGPUGetNumberOfElements(b_col));
83 |
84 | if (mxGPUGetClassID(a_row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("A_ROW_CSR argument is not int32");
85 | if (mxGPUGetClassID(a_col) != mxINT32_CLASS) mxShowCriticalErrorMessage("A_COL argument is not int32");
86 | if (mxGPUGetClassID(a_val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("A_VAL argument is not single");
87 |
88 | if (mxGPUGetClassID(b_row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("B_ROW_CSR argument is not int32");
89 | if (mxGPUGetClassID(b_col) != mxINT32_CLASS) mxShowCriticalErrorMessage("B_COL argument is not int32");
90 | if (mxGPUGetClassID(b_val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("B_VAL argument is not single");
91 |
92 | // Allocate space for output row vector
93 | const mwSize ndim = 1;
94 | mwSize dims[ndim] = {nrows+1};
95 | mxGPUArray *c_row_csr = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE);
96 | if (c_row_csr==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
97 |
98 | // Get handle to the CUBLAS context
99 | cublasHandle_t cublasHandle = 0;
100 | cublasStatus_t cublasStatus;
101 | cublasStatus = cublasCreate(&cublasHandle);
102 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
103 |
104 | // Get handle to the CUSPARSE context
105 | cusparseHandle_t cusparseHandle = 0;
106 | cusparseStatus_t cusparseStatus;
107 | cusparseStatus = cusparseCreate(&cusparseHandle);
108 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
109 | cusparseMatDescr_t descr = 0;
110 | cusparseStatus = cusparseCreateMatDescr(&descr);
111 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
112 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
113 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
114 |
115 | // Convert from matlab pointers to native pointers
116 | const int* const d_a_col = (int*)mxGPUGetDataReadOnly(a_col);
117 | const int* const d_b_col = (int*)mxGPUGetDataReadOnly(b_col);
118 |
119 | const float* const d_a_val = (float*)mxGPUGetDataReadOnly(a_val);
120 | const float* const d_b_val = (float*)mxGPUGetDataReadOnly(b_val);
121 |
122 | const int* const d_a_row_csr = (int*)mxGPUGetDataReadOnly(a_row_csr);
123 | const int* const d_b_row_csr = (int*)mxGPUGetDataReadOnly(b_row_csr);
124 |
125 | int *d_c_col = NULL;
126 | float *d_c_val = NULL;
127 | int *d_c_row_csr = (int*)mxGPUGetData(c_row_csr);
128 |
129 | const float alpha = (float)mxGetScalar(ALPHA);
130 | const float beta = (float)mxGetScalar(BETA);
131 |
132 | // Now that we can access the arrays, we can do some checks
133 | int base;
134 | cudaMemcpy(&base, d_a_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
135 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("A_ROW_CSR not using 1-based indexing");
136 |
137 | int nnz_check;
138 | cudaMemcpy(&nnz_check, d_a_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
139 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
140 | if (nnz_check != a_nnz) mxShowCriticalErrorMessage("A_ROW_CSR argument last element != nnz",nnz_check);
141 |
142 | cudaMemcpy(&base, d_b_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
143 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("B_ROW_CSR not using 1-based indexing");
144 |
145 | cudaMemcpy(&nnz_check, d_b_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
146 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
147 | if (nnz_check != b_nnz) mxShowCriticalErrorMessage("B_ROW_CSR argument last element != nnz",nnz_check);
148 |
149 | // Get sparsity pattern and nnz of output matrix
150 | int c_nnz;
151 | int *nnzTotalDevHostPtr = &c_nnz;
152 | cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST);
153 |
154 | char *buffer = NULL;
155 | size_t bufferSizeInBytes;
156 |
157 | #if CUDART_VERSION >= 11000
158 | cusparseScsrgeam2_bufferSizeExt(cusparseHandle, nrows, ncols,
159 | &alpha,
160 | descr, a_nnz, d_a_val, d_a_row_csr, d_a_col,
161 | &beta,
162 | descr, b_nnz, d_b_val, d_b_row_csr, d_b_col,
163 | descr, d_c_val, d_c_row_csr, d_c_col,
164 | &bufferSizeInBytes);
165 |
166 | cudaError_t status0 = cudaMalloc((void**)&buffer, sizeof(char)*bufferSizeInBytes);
167 | if (status0 != cudaSuccess)
168 | {
169 | mxShowCriticalErrorMessage("Operation cudaMalloc failed",status0);
170 | }
171 |
172 | cusparseStatus_t status1 =
173 | cusparseXcsrgeam2Nnz(cusparseHandle, nrows, ncols,
174 | descr, a_nnz, d_a_row_csr, d_a_col,
175 | descr, b_nnz, d_b_row_csr, d_b_col,
176 | descr, d_c_row_csr, nnzTotalDevHostPtr, buffer);
177 | #else
178 | cusparseStatus_t status1 =
179 | cusparseXcsrgeamNnz(cusparseHandle, nrows, ncols,
180 | descr, a_nnz, d_a_row_csr, d_a_col,
181 | descr, b_nnz, d_b_row_csr, d_b_col,
182 | descr, d_c_row_csr, nnzTotalDevHostPtr);
183 | #endif
184 |
185 | // Failure
186 | if (status1 != CUSPARSE_STATUS_SUCCESS)
187 | {
188 | mxShowCriticalErrorMessage("Operation cusparseXcsrgeamNnz failed",status1);
189 | }
190 |
191 | if (NULL != nnzTotalDevHostPtr)
192 | {
193 | c_nnz = *nnzTotalDevHostPtr;
194 | }
195 | else
196 | {
197 | int baseC = CUSPARSE_INDEX_BASE_ONE;
198 | cudaMemcpy(&c_nnz, d_c_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
199 | cudaMemcpy(&baseC, d_c_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
200 | c_nnz -= baseC;
201 | }
202 |
203 | // Allocate space for output vectors
204 | dims[0] = {(mwSize)c_nnz};
205 | mxGPUArray *c_col = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE);
206 | if (c_col==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
207 |
208 | mxGPUArray *c_val = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE);
209 | if (c_val==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
210 |
211 | // Convert from matlab pointers to native pointers
212 | d_c_col = (int*)mxGPUGetData(c_col);
213 | d_c_val = (float*)mxGPUGetData(c_val);
214 |
215 | // Addition here
216 | #if CUDART_VERSION >= 11000
217 | cusparseStatus_t status2 =
218 | cusparseScsrgeam2(cusparseHandle, nrows, ncols,
219 | &alpha,
220 | descr, a_nnz,
221 | d_a_val, d_a_row_csr, d_a_col,
222 | &beta,
223 | descr, b_nnz,
224 | d_b_val, d_b_row_csr, d_b_col,
225 | descr,
226 | d_c_val, d_c_row_csr, d_c_col, buffer);
227 | #else
228 | cusparseStatus_t status2 =
229 | cusparseScsrgeam(cusparseHandle, nrows, ncols,
230 | &alpha,
231 | descr, a_nnz,
232 | d_a_val, d_a_row_csr, d_a_col,
233 | &beta,
234 | descr, b_nnz,
235 | d_b_val, d_b_row_csr, d_b_col,
236 | descr,
237 | d_c_val, d_c_row_csr, d_c_col);
238 | #endif
239 |
240 | if (status2 == CUSPARSE_STATUS_SUCCESS)
241 | {
242 | // Return results
243 | C_ROW_CSR = mxGPUCreateMxArrayOnGPU(c_row_csr);
244 | C_COL = mxGPUCreateMxArrayOnGPU(c_col);
245 | C_VAL = mxGPUCreateMxArrayOnGPU(c_val);
246 |
247 | // Make sure operations are finished before deleting
248 | //cudaDeviceSynchronize();
249 | }
250 |
251 | // Clean up
252 | cusparseDestroyMatDescr(descr);
253 | cusparseDestroy(cusparseHandle);
254 | cublasDestroy(cublasHandle);
255 | if(buffer) cudaFree(buffer);
256 | mxGPUDestroyGPUArray(a_row_csr);
257 | mxGPUDestroyGPUArray(a_col);
258 | mxGPUDestroyGPUArray(a_val);
259 | mxGPUDestroyGPUArray(b_row_csr);
260 | mxGPUDestroyGPUArray(b_col);
261 | mxGPUDestroyGPUArray(b_val);
262 | mxGPUDestroyGPUArray(c_row_csr);
263 | mxGPUDestroyGPUArray(c_col);
264 | mxGPUDestroyGPUArray(c_val);
265 |
266 | // Failure
267 | if (status2 != CUSPARSE_STATUS_SUCCESS)
268 | {
269 | mxShowCriticalErrorMessage("Operation cusparseScsrgeam failed",status2);
270 | }
271 |
272 | return;
273 | }
274 |
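The prhs/plhs defines at the top of this file fix the mex calling convention: ten inputs (1-based CSR offsets, column indices and values for A and B, the matrix size, and the two scalars) and three CSR outputs for C = ALPHA*A + BETA*B. The sketch below is a made-up illustration of driving it by hand; since this mex lives in the class's private folder it is normally reached through gpuSparse methods rather than called directly.

```matlab
% Hypothetical direct call to the csrgeam mex (illustration only).
% Assumes A and B are tiny 4x5 matrices already held as 1-based CSR data on the GPU.
nrows = 4; ncols = 5;
arow = gpuArray(int32([1 2 3 4 5]));    % CSR row offsets, length nrows+1, 1-based
acol = gpuArray(int32([1 2 3 4]));      % column indices (int32)
aval = gpuArray(single([1 1 1 1]));     % values (single)
brow = arow; bcol = acol;
bval = gpuArray(single([10 20 30 40]));

% C = 2*A + 3*B, returned in the same 1-based CSR form
[crow, ccol, cval] = csrgeam(arow, acol, aval, nrows, ncols, brow, bcol, bval, 2, 3);
```

The row offsets must start at 1 and end at nnz+1, which is exactly what the base-index and nnz checks above enforce.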
--------------------------------------------------------------------------------
/private/csrmm.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE matrix-matrix multiply (csrmm).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | #if CUDART_VERSION >= 11000
20 | #include "wrappers_to_cuda_11.h"
21 | #endif
22 |
23 | // MATLAB related
24 | #include "mex.h"
25 | #include "gpu/mxGPUArray.h"
26 | #include "mxShowCriticalErrorMessage.h"
27 |
28 | // Input Arguments
29 | #define ROW_CSR prhs[0] // this is in CSR format (returned from coo2csr.cu)
30 | #define COL prhs[1]
31 | #define VAL prhs[2]
32 | #define NROWS prhs[3]
33 | #define NCOLS prhs[4]
34 | #define TRANS prhs[5]
35 | #define B prhs[6] // dense matrix
36 |
37 | // Output Arguments
38 | #define C plhs[0] // C = alpha * op(A) * B + beta * C (sparse A, dense B)
39 |
40 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
41 | {
42 | // Checks
43 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
44 | if (nrhs != 7) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
45 |
46 | if(!mxIsGPUArray(ROW_CSR)) mxShowCriticalErrorMessage("ROW_CSR argument is not on GPU");
47 | if(!mxIsGPUArray(COL)) mxShowCriticalErrorMessage("COL argument is not on GPU");
48 | if(!mxIsGPUArray(VAL)) mxShowCriticalErrorMessage("VAL argument is not on GPU");
49 | if(!mxIsGPUArray(B)) mxShowCriticalErrorMessage("B argument is not on GPU");
50 |
51 | // Initialize the MathWorks GPU API
52 | mxInitGPU();
53 |
54 | // Create Matlab pointers on the GPU
55 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR);
56 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL);
57 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL);
58 | mxGPUArray const *b = mxGPUCreateFromMxArray(B);
59 |
60 | // Check sizes of A - note rows are in CSR (compressed row) format
61 | mwSize nnz = mxGPUGetNumberOfElements(val);
62 |
63 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
64 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
65 | if (!mxIsScalar(TRANS)) mxShowCriticalErrorMessage("TRANS argument must be a scalar");
66 |
67 | mwSize m = mxGetScalar(NROWS);
68 | mwSize k = mxGetScalar(NCOLS);
69 |
70 | if (mxGPUGetNumberOfElements(row_csr) != m+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size",mxGPUGetNumberOfElements(row_csr));
71 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL argument wrong size",mxGPUGetNumberOfElements(col));
72 |
73 | // Check sizes of B
74 | if (mxGPUGetNumberOfDimensions(b) > 2) mxShowCriticalErrorMessage("B has too many dimensions",mxGPUGetNumberOfDimensions(b));
75 |
76 | mwSize *bdims = (mwSize*)mxGPUGetDimensions(b); // dims always has >= 2 elements
77 | mwSize ldb = bdims[0]; // leading dimension of B
78 | mwSize n = bdims[1];
79 |
80 | cusparseOperation_t trans = (cusparseOperation_t)mxGetScalar(TRANS);
81 | if (trans == CUSPARSE_OPERATION_NON_TRANSPOSE)
82 | {
83 | if (ldb != k) mxShowCriticalErrorMessage("B argument wrong size for multiply",ldb);
84 | }
85 | else
86 | {
87 | if (ldb != m) mxShowCriticalErrorMessage("B argument wrong size for transpose multiply",ldb);
88 | }
89 |
90 | // Check types
91 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
92 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
93 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
94 | if (mxGPUGetClassID(b) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("B argument is not single");
95 |
96 | // Check real/complex - mixed is not supported except special case (real A / complex B)
97 | mxComplexity cca = mxGPUGetComplexity(val);
98 | mxComplexity ccb = mxGPUGetComplexity(b);
99 | mxComplexity ccc = (ccb==mxCOMPLEX || cca==mxCOMPLEX) ? mxCOMPLEX : mxREAL;
100 | if(ccb==mxREAL && cca==mxCOMPLEX) mxShowCriticalErrorMessage("Complex sparse matrix and real dense matrix not supported");
101 |
102 | // Create space for output vectors
103 | const mwSize ndim = 2;
104 | mwSize cdims[ndim] = {trans == CUSPARSE_OPERATION_NON_TRANSPOSE ? m : k, n};
105 | mxClassID cid = mxGPUGetClassID(b); // same class as B matrix
106 | int ldc = cdims[0]; // leading dimension of C
107 | mxGPUArray *c;
108 |
109 | // Get handle to the CUBLAS context
110 | cublasHandle_t cublasHandle = 0;
111 | cublasStatus_t cublasStatus;
112 | cublasStatus = cublasCreate(&cublasHandle);
113 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
114 |
115 | // Get handle to the CUSPARSE context
116 | cusparseHandle_t cusparseHandle = 0;
117 | cusparseStatus_t cusparseStatus;
118 | cusparseStatus = cusparseCreate(&cusparseHandle);
119 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
120 | cusparseMatDescr_t descr = 0;
121 | cusparseStatus = cusparseCreateMatDescr(&descr);
122 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
123 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
124 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
125 |
126 | // Convert from matlab pointers to native pointers
127 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr);
128 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col);
129 |
130 | // Now that we can access the arrays, we can do some checks
131 | int base;
132 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
133 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
134 |
135 | int nnz_check;
136 | cudaMemcpy(&nnz_check, d_row_csr+m, sizeof(int), cudaMemcpyDeviceToHost);
137 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
138 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check);
139 |
140 | // Call cusparse multiply function in (S)ingle precision
141 | if (cca==mxREAL && ccb==mxREAL)
142 | {
143 | const float alpha = 1.0;
144 | const float beta = 0.0;
145 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val);
146 | const float * const d_b = (float*)mxGPUGetDataReadOnly(b);
147 |
148 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES);
149 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
150 | float *d_c = (float*)mxGPUGetData(c);
151 |
152 | #if CUDART_VERSION >= 11000
153 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc);
154 | #else
155 | cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc);
156 | #endif
157 | }
158 | else if (cca==mxREAL && ccb==mxCOMPLEX)
159 | {
160 | #if 0 // CUDART_VERSION >= 12040 // use 12.4 mixed real/complex operation
161 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0);
162 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0);
163 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES);
164 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
165 | cuFloatComplex* d_c = (cuFloatComplex*)mxGPUGetData(c);
166 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val);
167 | const cuFloatComplex* const d_b = (cuFloatComplex*)mxGPUGetDataReadOnly(b);
168 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc);
169 | #else
170 | const float alpha = 1.0;
171 | const float beta = 0.0;
172 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val);
173 |
174 | mxGPUArray* c_real = mxGPUCreateGPUArray(ndim, cdims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES);
175 | mxGPUArray* c_imag = mxGPUCreateGPUArray(ndim, cdims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES);
176 | if(!c_real || !c_imag) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
177 | float* d_c_real = (float*)mxGPUGetDataReadOnly(c_real);
178 | float* d_c_imag = (float*)mxGPUGetDataReadOnly(c_imag);
179 |
180 | for(int i = 0; i<2; i++)
181 | {
182 | mxGPUArray const *b_tmp;
183 | if(i==0) b_tmp = mxGPUCopyReal(b);
184 | if(i==1) b_tmp = mxGPUCopyImag(b);
185 | const float* const d_b = (float*)mxGPUGetDataReadOnly(b_tmp);
186 |
187 | #if CUDART_VERSION >= 11000
188 | if(i==0) cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_real, ldc);
189 | if(i==1) cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_imag, ldc);
190 | #else
191 | if(i==0) cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_real, ldc);
192 | if(i==1) cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_imag, ldc);
193 | #endif
194 | mxGPUDestroyGPUArray(b_tmp);
195 | if(cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("csrmm failed.");
196 | }
197 | c = mxGPUCreateComplexGPUArray(c_real,c_imag);
198 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateComplexGPUArray failed.");
199 | mxGPUDestroyGPUArray(c_real);
200 | mxGPUDestroyGPUArray(c_imag);
201 | #endif
202 | }
203 | else
204 | {
205 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0);
206 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0);
207 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
208 | const cuFloatComplex * const d_b = (cuFloatComplex*)mxGPUGetDataReadOnly(b);
209 |
210 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES);
211 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
212 | cuFloatComplex *d_c = (cuFloatComplex*)mxGPUGetData(c);
213 |
214 | #if CUDART_VERSION >= 11000
215 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc);
216 | #else
217 | cusparseStatus = cusparseCcsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc);
218 | #endif
219 | }
220 |
221 | // Return result
222 | if (cusparseStatus == CUSPARSE_STATUS_SUCCESS)
223 | {
224 | C = mxGPUCreateMxArrayOnGPU(c);
225 | }
226 |
227 | // Clean up
228 | cusparseDestroyMatDescr(descr);
229 | cusparseDestroy(cusparseHandle);
230 | cublasDestroy(cublasHandle);
231 | mxGPUDestroyGPUArray(row_csr);
232 | mxGPUDestroyGPUArray(col);
233 | mxGPUDestroyGPUArray(val);
234 | mxGPUDestroyGPUArray(b);
235 | mxGPUDestroyGPUArray(c);
236 | mxFree(bdims);
237 |
238 | return;
239 | }
240 |
241 |
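As above, the defines pin down the interface: C = csrmm(ROW_CSR, COL, VAL, NROWS, NCOLS, TRANS, B) with a dense single-precision B whose leading dimension must match op(A). A made-up sketch for the non-transpose case (TRANS = 0, i.e. CUSPARSE_OPERATION_NON_TRANSPOSE), for illustration only:

```matlab
% Hypothetical direct call to the csrmm mex (illustration only).
% A is 3x2 in 1-based CSR form: A = [1 0; 0 2; 3 0]
nrows = 3; ncols = 2;
row = gpuArray(int32([1 2 3 4]));
col = gpuArray(int32([1 2 1]));
val = gpuArray(single([1 2 3]));
B   = gpuArray(single([1 10; 1 10]));   % k-by-n = 2-by-2, as required when TRANS = 0

C = csrmm(row, col, val, nrows, ncols, 0, B);   % expected C = [1 10; 2 20; 3 30]
```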
--------------------------------------------------------------------------------
/private/csrmv.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE matrix-vector multiply (csrmv).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 | #include <math.h>
14 |
15 | /* Using updated (v2) interfaces to cublas */
16 | #include <cuda_runtime.h>
17 | #include <cublas_v2.h>
18 | #include <cusparse.h>
19 |
20 | #if CUDART_VERSION >= 11000
21 | #include "wrappers_to_cuda_11.h"
22 | #endif
23 |
24 | // MATLAB related
25 | #include "mex.h"
26 | #include "gpu/mxGPUArray.h"
27 | #include "mxShowCriticalErrorMessage.h"
28 |
29 | // Input Arguments
30 | #define ROW_CSR prhs[0] // this is in CSR format (returned from coo2csr.cu)
31 | #define COL prhs[1]
32 | #define VAL prhs[2]
33 | #define NROWS prhs[3]
34 | #define NCOLS prhs[4]
35 | #define TRANS prhs[5]
36 | #define X prhs[6]
37 |
38 | // Output Arguments
39 | #define Y plhs[0]
40 |
41 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
42 | {
43 | // Checks
44 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
45 | if (nrhs != 7) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
46 |
47 | if(!mxIsGPUArray(ROW_CSR)) mxShowCriticalErrorMessage("ROW_CSR argument is not on GPU");
48 | if(!mxIsGPUArray(COL)) mxShowCriticalErrorMessage("COL argument is not on GPU");
49 | if(!mxIsGPUArray(VAL)) mxShowCriticalErrorMessage("VAL argument is not on GPU");
50 | if(!mxIsGPUArray(X)) mxShowCriticalErrorMessage("X argument is not on GPU");
51 |
52 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
53 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
54 | if (!mxIsScalar(TRANS)) mxShowCriticalErrorMessage("TRANS argument must be a scalar");
55 |
56 | // Initialize the MathWorks GPU API
57 | mxInitGPU();
58 |
59 | // Create Matlab pointers on the GPU
60 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR);
61 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL);
62 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL);
63 | mxGPUArray const *x = mxGPUCreateFromMxArray(X);
64 |
65 | // Check sizes - note rows are in CSR (compressed row) format
66 | mwSize nnz = mxGPUGetNumberOfElements(val);
67 | mwSize nrows = mxGetScalar(NROWS);
68 | mwSize ncols = mxGetScalar(NCOLS);
69 |
70 | mwSize *xdims = (mwSize*)mxGPUGetDimensions(x); // xdims always has >= 2 elements
71 | if (mxGPUGetNumberOfDimensions(x) > 2) mxShowCriticalErrorMessage("X argument has too many dimensions",mxGPUGetNumberOfDimensions(x));
72 | if (xdims[1] != 1) mxShowCriticalErrorMessage("X argument is not a column vector");
73 |
74 | int nx = xdims[0];
75 |
76 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size",mxGPUGetNumberOfElements(row_csr));
77 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL argument wrong size",mxGPUGetNumberOfElements(col));
78 |
79 | cusparseOperation_t trans = (cusparseOperation_t)mxGetScalar(TRANS);
80 | if (trans == CUSPARSE_OPERATION_NON_TRANSPOSE)
81 | {
82 | if (nx != ncols) mxShowCriticalErrorMessage("X argument wrong size for multiply",nx);
83 | }
84 | else
85 | {
86 | if (nx != nrows) mxShowCriticalErrorMessage("X argument wrong size for transpose multiply",nx);
87 | }
88 |
89 | // Check types
90 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
91 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
92 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
93 | if (mxGPUGetClassID(x) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("X argument is not single");
94 |
95 | // Check real/complex - mixed is not supported except special case (real A / complex x)
96 | mxComplexity ccx = mxGPUGetComplexity(x);
97 | mxComplexity ccv = mxGPUGetComplexity(val);
98 | mxComplexity ccy = (ccx==mxCOMPLEX || ccv==mxCOMPLEX) ? mxCOMPLEX : mxREAL;
99 | if(ccx==mxREAL && ccv==mxCOMPLEX) mxShowCriticalErrorMessage("Complex matrix and real vector not supported");
100 |
101 | // Create space for output vector
102 | const mwSize ndim = 1;
103 | mwSize dims[ndim] = {trans == CUSPARSE_OPERATION_NON_TRANSPOSE ? nrows : ncols};
104 | mxClassID cid = mxGPUGetClassID(x);
105 | mxGPUArray *y;
106 |
107 | // Get handle to the CUBLAS context
108 | cublasHandle_t cublasHandle = 0;
109 | cublasStatus_t cublasStatus;
110 | cublasStatus = cublasCreate(&cublasHandle);
111 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
112 |
113 | // Get handle to the CUSPARSE context
114 | cusparseHandle_t cusparseHandle = 0;
115 | cusparseStatus_t cusparseStatus;
116 | cusparseStatus = cusparseCreate(&cusparseHandle);
117 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
118 | cusparseMatDescr_t descr = 0;
119 | cusparseStatus = cusparseCreateMatDescr(&descr);
120 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
121 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
122 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
123 |
124 | // Convert from matlab pointers to native pointers
125 | const int* const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr);
126 | const int* const d_col = (int*)mxGPUGetDataReadOnly(col);
127 |
128 | // Now that we can access the arrays, we can do some checks
129 | int base;
130 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
131 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
132 |
133 | int nnz_check;
134 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
135 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
136 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check);
137 |
138 | // Call cusparse multiply function in (S)ingle precision
139 | if (ccv==mxREAL && ccx==mxREAL)
140 | {
141 | const float alpha = 1.0;
142 | const float beta = 0.0;
143 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES);
144 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
145 | float* d_y = (float*)mxGPUGetData(y);
146 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val);
147 | const float* const d_x = (float*)mxGPUGetDataReadOnly(x);
148 | #if CUDART_VERSION >= 11000
149 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y);
150 | #else
151 | cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y);
152 | #endif
153 | }
154 | else if (ccv==mxREAL && ccx==mxCOMPLEX)
155 | {
156 | #if CUDART_VERSION >= 11020 // use 11.2 mixed real/complex operation
157 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0);
158 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0);
159 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES);
160 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
161 | cuFloatComplex* d_y = (cuFloatComplex*)mxGPUGetData(y);
162 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val);
163 | const cuFloatComplex* const d_x = (cuFloatComplex*)mxGPUGetDataReadOnly(x);
164 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y);
165 | #else
166 | const float alpha = 1.0;
167 | const float beta = 0.0;
168 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val);
169 |
170 | mxGPUArray* y_real = mxGPUCreateGPUArray(ndim, dims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES);
171 | mxGPUArray* y_imag = mxGPUCreateGPUArray(ndim, dims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES);
172 | if(!y_real || !y_imag) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
173 | float* d_y_real = (float*)mxGPUGetDataReadOnly(y_real);
174 | float* d_y_imag = (float*)mxGPUGetDataReadOnly(y_imag);
175 |
176 | for(int i = 0; i<2; i++)
177 | {
178 | mxGPUArray const *x_tmp;
179 | if(i==0) x_tmp = mxGPUCopyReal(x);
180 | if(i==1) x_tmp = mxGPUCopyImag(x);
181 | const float* const d_x = (float*)mxGPUGetDataReadOnly(x_tmp);
182 | #if CUDART_VERSION >= 11000
183 | if(i==0) cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_real);
184 | if(i==1) cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_imag);
185 | #else
186 | if(i==0) cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_real);
187 | if(i==1) cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_imag);
188 | #endif
189 | mxGPUDestroyGPUArray(x_tmp);
190 | if(cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("csrmv failed.");
191 | }
192 | y = mxGPUCreateComplexGPUArray(y_real,y_imag);
193 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateComplexGPUArray failed.");
194 | mxGPUDestroyGPUArray(y_real);
195 | mxGPUDestroyGPUArray(y_imag);
196 | #endif
197 | }
198 | else
199 | {
200 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0);
201 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0);
202 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES);
203 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed.");
204 | cuFloatComplex* d_y = (cuFloatComplex*)mxGPUGetData(y);
205 | const cuFloatComplex* const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
206 | const cuFloatComplex* const d_x = (cuFloatComplex*)mxGPUGetDataReadOnly(x);
207 | #if CUDART_VERSION >= 11000
208 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y);
209 | #else
210 | cusparseStatus = cusparseCcsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y);
211 | #endif
212 | }
213 |
214 | // Return result
215 | if (cusparseStatus == CUSPARSE_STATUS_SUCCESS)
216 | {
217 | Y = mxGPUCreateMxArrayOnGPU(y);
218 | }
219 | else
220 | {
221 | mxShowCriticalErrorMessage("unknown failure",cusparseStatus);
222 | }
223 |
224 | // Clean up
225 | cusparseDestroyMatDescr(descr);
226 | cusparseDestroy(cusparseHandle);
227 | cublasDestroy(cublasHandle);
228 | mxGPUDestroyGPUArray(row_csr);
229 | mxGPUDestroyGPUArray(col);
230 | mxGPUDestroyGPUArray(val);
231 | mxGPUDestroyGPUArray(x);
232 | mxGPUDestroyGPUArray(y);
233 | mxFree(xdims);
234 |
235 | return;
236 | }
237 |
238 |
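Same pattern as csrmm but with a dense vector: Y = csrmv(ROW_CSR, COL, VAL, NROWS, NCOLS, TRANS, X), where X must have length NCOLS for a plain multiply and length NROWS when TRANS selects the (conjugate) transpose. A made-up sketch of the transpose case (TRANS = 1, CUSPARSE_OPERATION_TRANSPOSE):

```matlab
% Hypothetical direct call to the csrmv mex (illustration only).
% Same 3x2 CSR matrix as the csrmm sketch: A = [1 0; 0 2; 3 0]
nrows = 3; ncols = 2;
row = gpuArray(int32([1 2 3 4]));
col = gpuArray(int32([1 2 1]));
val = gpuArray(single([1 2 3]));
y   = gpuArray(single([1; 1; 1]));      % length nrows, as required when TRANS = 1

z = csrmv(row, col, val, nrows, ncols, 1, y);   % z = A.'*y, expected values [4; 2]
```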
--------------------------------------------------------------------------------
/private/csrsort.cu:
--------------------------------------------------------------------------------
1 | //
2 | // Mex wrapper to CUSPARSE sort for CSR format (csrsort).
3 | //
4 | // Inspired by cusparse samples (conjugateGradient) and Matlab gcsparse.
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv
6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid
7 | //
8 |
9 | // includes, system
10 | #include <stdio.h>
11 | #include <stdlib.h>
12 | #include <string.h>
13 |
14 | /* Using updated (v2) interfaces to cublas */
15 | #include <cuda_runtime.h>
16 | #include <cublas_v2.h>
17 | #include <cusparse.h>
18 |
19 | // MATLAB related
20 | #include "mex.h"
21 | #include "gpu/mxGPUArray.h"
22 | #include "mxShowCriticalErrorMessage.h"
23 |
24 | // Input Arguments
25 | #define ROW_CSR prhs[0]
26 | #define COL prhs[1]
27 | #define VAL prhs[2]
28 | #define NROWS prhs[3]
29 | #define NCOLS prhs[4]
30 |
31 | // Output Arguments
32 | #define COL_SORT plhs[0]
33 | #define VAL_SORT plhs[1]
34 |
35 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[])
36 | {
37 | // Checks
38 | if (nlhs > 2) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs);
39 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs);
40 |
41 | // Initialize the MathWorks GPU API
42 | mxInitGPU();
43 |
44 | // Create Matlab pointers on the GPU
45 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR);
46 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL);
47 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL);
48 |
49 | // Checks - note vectors must be in CSR format
50 | int nnz = mxGPUGetNumberOfElements(val);
51 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL and VAL argument length mismatch");
52 |
53 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar");
54 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar");
55 |
56 | int ncols = (int)mxGetScalar(NCOLS);
57 | int nrows = (int)mxGetScalar(NROWS);
58 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size");
59 |
60 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32");
61 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32");
62 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single");
63 |
64 | // Create space for output vectors
65 | const mwSize ndim = 1;
66 | mwSize dims[ndim];
67 |
68 | dims[0] = nnz;
69 | mxGPUArray *col_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES);
70 | if (col_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
71 |
72 | mxComplexity ccx = mxGPUGetComplexity(val);
73 | mxGPUArray *val_sort = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES);
74 | if (val_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed");
75 |
76 | // Get handle to the CUBLAS context
77 | cublasHandle_t cublasHandle = 0;
78 | cublasStatus_t cublasStatus;
79 | cublasStatus = cublasCreate(&cublasHandle);
80 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus);
81 |
82 | // Get handle to the CUSPARSE context
83 | cudaError_t cudaStatus;
84 | cusparseStatus_t cusparseStatus;
85 | cusparseHandle_t cusparseHandle = 0;
86 | cusparseStatus = cusparseCreate(&cusparseHandle);
87 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
88 | cusparseMatDescr_t descr = 0;
89 | cusparseStatus = cusparseCreateMatDescr(&descr);
90 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus);
91 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
92 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE);
93 |
94 | // Convert from matlab pointers to native pointers
95 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr);
96 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col);
97 | int *d_col_sort = (int*)mxGPUGetData(col_sort);
98 |
99 | // Now that we can access the arrays, we can do some checks
100 | int base;
101 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost);
102 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing");
103 |
104 | int nnz_check;
105 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost);
106 | nnz_check -= CUSPARSE_INDEX_BASE_ONE;
107 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check);
108 |
109 | // Since sort is in-place, copy the read-only vectors to read-write ones
110 | cudaStatus = cudaMemcpy((void *)d_col_sort, d_col, nnz*sizeof(int), cudaMemcpyDeviceToDevice);
111 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed");
112 |
113 | if (ccx == mxREAL)
114 | {
115 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val);
116 | float *d_val_sort = (float*)mxGPUGetData(val_sort);
117 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice);
118 | }
119 | else
120 | {
121 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
122 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort);
123 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(cuFloatComplex), cudaMemcpyDeviceToDevice);
124 | }
125 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus);
126 |
127 | // Sort by rows
128 | int *P = NULL;
129 | void *pBuffer = NULL;
130 | size_t pBufferSizeInBytes = 0;
131 |
132 | if (nnz > 0)
133 | {
134 | // step 1: allocate buffer
135 | cusparseStatus = cusparseXcsrsort_bufferSizeExt(cusparseHandle, nrows, ncols, nnz, d_row_csr, d_col, &pBufferSizeInBytes);
136 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcsrsort_bufferSizeExt failed",cusparseStatus);
137 |
138 | cudaStatus = cudaMalloc( &pBuffer, sizeof(char)*pBufferSizeInBytes);
139 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus);
140 |
141 | // step 2: setup permutation vector P to identity
142 | cudaStatus = cudaMalloc( &P, sizeof(int)*nnz);
143 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus);
144 |
145 | cusparseStatus = cusparseCreateIdentityPermutation(cusparseHandle, nnz, P);
146 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseCreateIdentityPermutation failed",cusparseStatus);
147 |
148 | // step 3: sort the CSR column indices within each row
149 | cusparseStatus = cusparseXcsrsort(cusparseHandle, nrows, ncols, nnz, descr, d_row_csr, d_col_sort, P, pBuffer);
150 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcsrsort failed",cusparseStatus);
151 |
152 | // step 4: gather the values into the sorted order
153 | if (ccx == mxREAL)
154 | {
155 | float *d_val = (float*)mxGPUGetDataReadOnly(val);
156 | float *d_val_sort = (float*)mxGPUGetData(val_sort);
157 | #if CUDART_VERSION >= 11000
158 | cusparseHandle_t handle = NULL;
159 | cusparseDnVecDescr_t vec_values;
160 | cusparseSpVecDescr_t vec_permutation;
161 | cusparseCreate(&handle);
162 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_R_32F);
163 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); // MUST USE BASE_ZERO
164 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation);
165 | cusparseDestroyDnVec(vec_values);
166 | cusparseDestroySpVec(vec_permutation);
167 | cusparseDestroy(handle);
168 | #else
169 | cusparseStatus = cusparseSgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO
170 | #endif
171 | }
172 | else
173 | {
174 | cuFloatComplex *d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val);
175 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort);
176 | #if CUDART_VERSION >= 11000
177 | cusparseHandle_t handle = NULL;
178 | cusparseDnVecDescr_t vec_values;
179 | cusparseSpVecDescr_t vec_permutation;
180 | cusparseCreate(&handle);
181 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_C_32F);
182 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F); // MUST USE BASE_ZERO
183 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation);
184 | cusparseDestroyDnVec(vec_values);
185 | cusparseDestroySpVec(vec_permutation);
186 | cusparseDestroy(handle);
187 | #else
188 | cusparseStatus = cusparseCgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO
189 | #endif
190 | }
191 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseSgthr or cusparseCgthr failed",cusparseStatus);
192 |
193 | }
194 |
195 | // Return result
196 | COL_SORT = mxGPUCreateMxArrayOnGPU(col_sort);
197 | VAL_SORT = mxGPUCreateMxArrayOnGPU(val_sort);
198 |
199 | // Make sure operations are finished before deleting
200 | //cudaDeviceSynchronize();
201 |
202 | // Clean up
203 | cusparseDestroyMatDescr(descr);
204 | cusparseDestroy(cusparseHandle);
205 | cublasDestroy(cublasHandle);
206 | mxGPUDestroyGPUArray(row_csr);
207 | mxGPUDestroyGPUArray(col);
208 | mxGPUDestroyGPUArray(col_sort);
209 | mxGPUDestroyGPUArray(val);
210 | mxGPUDestroyGPUArray(val_sort);
211 | if (pBuffer) cudaFree(pBuffer);
212 | if (P) cudaFree(P);
213 |
214 | return;
215 | }
216 |
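The interface here is [COL_SORT, VAL_SORT] = csrsort(ROW_CSR, COL, VAL, NROWS, NCOLS): the row offsets are taken as already valid, the column indices within each row are sorted, and the values are gathered into the same order. A made-up sketch:

```matlab
% Hypothetical direct call to the csrsort mex (illustration only).
% One row of three entries whose column indices are out of order within the row.
nrows = 1; ncols = 3;
row = gpuArray(int32([1 4]));           % CSR offsets: a single row holding 3 entries
col = gpuArray(int32([3 1 2]));         % unsorted column indices
val = gpuArray(single([30 10 20]));

[col_sorted, val_sorted] = csrsort(row, col, val, nrows, ncols);
% expected order: columns 1,2,3 with values 10,20,30
```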
--------------------------------------------------------------------------------
/private/mex_all.m:
--------------------------------------------------------------------------------
1 | function mex_all()
2 |
3 | % checks
4 | if ~exist('/usr/local/cuda','dir')
5 | warning('/usr/local/cuda directory not found. Try:\n%s','"sudo ln -s /usr/local/cuda-11 /usr/local/cuda"')
6 | end
7 |
8 | % override MATLAB's supplied version of nvcc - not sure what difference this makes
9 | setenv('MW_ALLOW_ANY_CUDA','1')
10 | setenv('MW_NVCC_PATH', '/usr/local/cuda/bin')
11 |
12 | % need to be in the current directory for mexcuda
13 | oldpath = pwd;
14 | newpath = fileparts(mfilename('fullpath'));
15 | cd(newpath);
16 |
17 | % if the mexcuda fails, we are stuck - rethrow error
18 | try
19 | mex_all_compile();
20 | cd(oldpath)
21 | catch ME
22 | cd(oldpath)
23 | rethrow(ME)
24 | end
25 |
26 | %% call mexcuda
27 | function mex_all_compile()
28 |
29 | mexcuda csrgeam.cu -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic -v
30 | mexcuda csrmv.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
31 | mexcuda coo2csr.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
32 | mexcuda csr2csc.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
33 | mexcuda csr2csc_cpu.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
34 | mexcuda csrmm.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
35 | mexcuda csr2coo.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
36 | mexcuda csrsort.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
37 | mexcuda coosortByRow.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic
38 |
--------------------------------------------------------------------------------
/private/mxShowCriticalErrorMessage.h:
--------------------------------------------------------------------------------
1 | // Beautified mex error message macro.
2 | //
3 | // Inspired by
4 | // http://www.advanpix.com/2016/02/14/short-and-informative-error-messages-from-mex/
5 | //
6 | // Usage:
7 | // mxShowCriticalErrorMessage("my message");
8 | // mxShowCriticalErrorMessage("my message", 123);
9 | //
10 | #ifndef mxShowCriticalErrorMessage
11 |
12 | #include "mex.h"
13 | #include <string.h>
14 | #include <algorithm>
15 |
16 | // Macro to strip the path off __FILE__ (platform independent alternative to basename)
17 | #define __FUNC__ std::max(__FILE__,std::max(strrchr(__FILE__,'\\')+1,strrchr(__FILE__,'/')+1))
18 |
19 | // Use macro to expand __FUNC__ and __LINE__ correctly
20 | #define mxShowCriticalErrorMessage(...) err_fn(__FUNC__,__LINE__,##__VA_ARGS__)
21 |
22 | // Use overloads to handle __VA_ARGS__ correctly
23 | void err_fn(const char *fn_name, int line_no, const char *err_message, int err_code)
24 | {
25 | const int nargs = 5;
26 | mxArray *err_args[nargs];
27 | err_args[0] = mxCreateString("\n%s(%i): %s (%i).\n");
28 | err_args[1] = mxCreateString(fn_name);
29 | err_args[2] = mxCreateDoubleMatrix(1,1,mxREAL);
30 | err_args[3] = mxCreateString(err_message);
31 | err_args[4] = mxCreateDoubleMatrix(1,1,mxREAL);
32 | *mxGetPr(err_args[2]) = line_no;
33 | *mxGetPr(err_args[4]) = err_code;
34 | mexCallMATLAB(0,0,nargs,err_args,"error");
35 | }
36 |
37 | void err_fn(const char *fn_name, int line_no, const char *err_message)
38 | {
39 | const int nargs = 4;
40 | mxArray *err_args[nargs];
41 | err_args[0] = mxCreateString("\n%s(%i): %s.\n");
42 | err_args[1] = mxCreateString(fn_name);
43 | err_args[2] = mxCreateDoubleMatrix(1,1,mxREAL);
44 | err_args[3] = mxCreateString(err_message);
45 | *mxGetPr(err_args[2]) = line_no;
46 | mexCallMATLAB(0,0,nargs,err_args,"error");
47 | }
48 |
49 | void err_fn(const char *fn_name, int line_no, int err_code)
50 | {
51 | err_fn(fn_name, line_no, "Error occurred", err_code);
52 | }
53 |
54 | void err_fn(const char *fn_name, int line_no)
55 | {
56 | err_fn(fn_name, line_no, "Error occurred");
57 | }
58 |
59 | #endif
60 |
--------------------------------------------------------------------------------
/private/wrappers_to_cuda_11.h:
--------------------------------------------------------------------------------
1 | // wrappers to the new CUDA 11 interface for pre-11 code
2 |
3 | #include <cuda_runtime.h>
4 | #include <cusparse.h>
5 | #include "mxShowCriticalErrorMessage.h"
6 | #include <iostream>
7 |
8 | // for cuda 12
9 | #if CUDART_VERSION >= 12000
10 |
11 | #define CUSPARSE_MV_ALG_DEFAULT CUSPARSE_SPMV_ALG_DEFAULT
12 | #define CUSPARSE_CSR2CSC_ALG2 CUSPARSE_CSR2CSC_ALG1
13 |
14 | #endif
15 | // end of cuda 12
16 |
17 | #define CHECK_CUSPARSE(func) \
18 | { \
19 | cusparseStatus_t status = (func); \
20 | if (status != CUSPARSE_STATUS_SUCCESS) \
21 | mxShowCriticalErrorMessage(cusparseGetErrorString(status),status); \
22 | }
23 |
24 | template <class T> cudaDataType type_to_enum();
25 | template<> cudaDataType type_to_enum<float>() { return CUDA_R_32F; }
26 | template<> cudaDataType type_to_enum<cuFloatComplex>() { return CUDA_C_32F; }
27 |
28 | // -------------------------------------------------------------------------------//
29 | template <class T, class S>
30 | cusparseStatus_t
31 | cusparseXcsrmv_wrapper(cusparseHandle_t handle,
32 | cusparseOperation_t transA,
33 | int A_num_rows,
34 | int A_num_cols,
35 | int A_num_nnz,
36 | const T* alpha,
37 | const cusparseMatDescr_t descrA,
38 | const S* dA_values,
39 | const int* dA_csrOffsets,
40 | const int* dA_columns,
41 | const T* dX,
42 | const T* beta,
43 | void* dY)
44 | {
45 | cusparseSpMatDescr_t matA;
46 | cusparseDnVecDescr_t vecX, vecY;
47 | void* buffer = NULL;
48 | size_t bufferSize = 0;
49 | cudaDataType typeA = type_to_enum<S>();
50 | cudaDataType typeX = type_to_enum<T>();
51 | cudaDataType typeY = (typeA==CUDA_C_32F || typeX==CUDA_C_32F) ? CUDA_C_32F : CUDA_R_32F;
52 |
53 | //std::cout << "typeA " << typeA << " typeX " << typeX << " typeY " << typeY << std::endl;
54 |
55 | // Create sparse matrix A in CSR format
56 | CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
57 | (void*)dA_csrOffsets, (void*)dA_columns, (void*)dA_values,
58 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
59 | cusparseGetMatIndexBase(descrA), typeA) )
60 | // Create dense vector X
61 | int X_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_cols : A_num_rows;
62 | CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, X_rows, (void*)dX, typeX) )
63 | // Create dense vector y
64 | int Y_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_rows : A_num_cols;
65 | CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, Y_rows, (void*)dY, typeY) )
66 | // allocate an external buffer if needed
67 | CHECK_CUSPARSE( cusparseSpMV_bufferSize(
68 | handle, transA,
69 | alpha, matA, vecX, beta, vecY, typeY,
70 | CUSPARSE_MV_ALG_DEFAULT, &bufferSize) )
71 | if (bufferSize > 0)
72 | {
73 | cudaError_t status = cudaMalloc(&buffer, bufferSize);
74 | if (status != cudaSuccess)
75 | return CUSPARSE_STATUS_ALLOC_FAILED;
76 | }
77 |
78 | //std::cout << "bufferSize " << bufferSize << " CUSPARSE_STATUS_NOT_SUPPORTED " << CUSPARSE_STATUS_NOT_SUPPORTED << std::endl;
79 |
80 | // execute SpMV
81 | CHECK_CUSPARSE( cusparseSpMV(handle, transA,
82 | alpha, matA, vecX, beta, vecY, typeY,
83 | CUSPARSE_MV_ALG_DEFAULT, buffer) )
84 |
85 |
86 | // destroy matrix/vector descriptors
87 | CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
88 | CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) )
89 | CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) )
90 | if(buffer) cudaFree(buffer);
91 | return CUSPARSE_STATUS_SUCCESS;
92 | }
93 |
94 | // -------------------------------------------------------------------------------//
95 | template <class T, class S>
96 | cusparseStatus_t
97 | cusparseXcsrmm_wrapper(cusparseHandle_t handle,
98 | cusparseOperation_t transA,
99 | int A_num_rows,
100 | int A_num_cols,
101 | int B_num_cols,
102 | int A_num_nnz,
103 | const T* alpha,
104 | const cusparseMatDescr_t descrA,
105 | const S* dA_values,
106 | const int* dA_csrOffsets,
107 | const int* dA_columns,
108 | const T* dB,
109 | int ldb,
110 | const T* beta,
111 | void* dC,
112 | int ldc)
113 | {
114 | cusparseSpMatDescr_t matA;
115 | cusparseDnMatDescr_t matB, matC;
116 | void* buffer = NULL;
117 | size_t bufferSize = 0;
118 | cudaDataType typeA = type_to_enum<S>();
119 | cudaDataType typeB = type_to_enum<T>();
120 | cudaDataType typeC = (typeA==CUDA_C_32F || typeB==CUDA_C_32F) ? CUDA_C_32F : CUDA_R_32F;
121 |
122 | // handle some limited transpose functionality (A or A' only)
123 | cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE;
124 | int B_num_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_cols : A_num_rows;
125 | int C_num_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_rows : A_num_cols;
126 | int C_num_cols = (transB==CUSPARSE_OPERATION_NON_TRANSPOSE) ? B_num_cols : B_num_rows;
127 |
128 | // Create sparse matrix A in CSR format
129 | CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz,
130 | (void*)dA_csrOffsets, (void*)dA_columns, (void*)dA_values,
131 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
132 | cusparseGetMatIndexBase(descrA), typeA) )
133 | // Create dense matrix B
134 | CHECK_CUSPARSE( cusparseCreateDnMat(&matB, B_num_rows, B_num_cols, ldb, (void*)dB, typeB, CUSPARSE_ORDER_COL) )
135 | // Create dense matrix C
136 | CHECK_CUSPARSE( cusparseCreateDnMat(&matC, C_num_rows, C_num_cols, ldc, (void*)dC, typeC, CUSPARSE_ORDER_COL) )
137 | // allocate an external buffer if needed
138 | CHECK_CUSPARSE( cusparseSpMM_bufferSize(
139 | handle, transA, transB,
140 | (void*)alpha, matA, matB, (void*)beta, matC, typeC,
141 | CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) )
142 | if (bufferSize > 0) {
143 | cudaError_t status = cudaMalloc(&buffer, bufferSize);
144 | if (status != cudaSuccess)
145 | return CUSPARSE_STATUS_ALLOC_FAILED;
146 | }
147 | // execute SpMM
148 | CHECK_CUSPARSE( cusparseSpMM(handle, transA, transB,
149 | alpha, matA, matB, beta, matC, typeC,
150 | CUSPARSE_SPMM_ALG_DEFAULT, buffer) )
151 | // destroy matrix/vector descriptors
152 | CHECK_CUSPARSE( cusparseDestroySpMat(matA) )
153 | CHECK_CUSPARSE( cusparseDestroyDnMat(matB) )
154 | CHECK_CUSPARSE( cusparseDestroyDnMat(matC) )
155 | if(buffer) cudaFree(buffer);
156 | return CUSPARSE_STATUS_SUCCESS;
157 | }
158 |
159 | // -------------------------------------------------------------------------------//
160 | template <class T>
161 | cusparseStatus_t
162 | cusparseXcsr2csc_wrapper(cusparseHandle_t handle,
163 | int m,
164 | int n,
165 | int nnz,
166 | const T* csrVal,
167 | const int* csrRowPtr,
168 | const int* csrColInd,
169 | T* cscVal,
170 | int* cscRowInd,
171 | int* cscColPtr,
172 | cusparseAction_t copyValues,
173 | cusparseIndexBase_t idxBase)
174 | {
175 | void* buffer = NULL;
176 | size_t bufferSize = 0;
177 | cudaDataType valType = type_to_enum<T>();
178 | cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG2;
179 |
180 | // fails if nnz==0
181 | if(nnz==0)
182 | {
183 | mxShowCriticalErrorMessage("BUG: cusparseCsr2cscEx2 fails when nnz=0");
184 | return CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED;
185 | }
186 |
187 | // make buffer
188 | CHECK_CUSPARSE( cusparseCsr2cscEx2_bufferSize(
189 | handle,
190 | m,
191 | n,
192 | nnz,
193 | csrVal,
194 | csrRowPtr,
195 | csrColInd,
196 | cscVal,
197 | cscColPtr,
198 | cscRowInd,
199 | valType,
200 | copyValues,
201 | idxBase,
202 | alg,
203 | &bufferSize) )
204 |
205 | if (bufferSize > 0)
206 | {
207 | cudaError_t status = cudaMalloc(&buffer, bufferSize);
208 | if (status != cudaSuccess)
209 | return CUSPARSE_STATUS_ALLOC_FAILED;
210 | }
211 |
212 | CHECK_CUSPARSE( cusparseCsr2cscEx2(
213 | handle,
214 | m,
215 | n,
216 | nnz,
217 | csrVal,
218 | csrRowPtr,
219 | csrColInd,
220 | cscVal,
221 | cscColPtr,
222 | cscRowInd,
223 | valType,
224 | copyValues,
225 | idxBase,
226 | alg,
227 | buffer) )
228 |
229 | if(buffer) cudaFree(buffer);
230 | return CUSPARSE_STATUS_SUCCESS;
231 | }
232 |
233 |
234 |
235 |
--------------------------------------------------------------------------------
/test_gpuSparse.m:
--------------------------------------------------------------------------------
1 | % test gpuSparse class
2 | clear all
3 | reset(gpuDevice(1))
4 |
5 | M = 121401;
6 | N = 113331;
7 | P = 5e-4;
8 |
9 | %M = 4;
10 | %N = 4;
11 | %P = 1;
12 |
13 | disp('---SETUP---')
14 |
15 | rand('state',0);
16 | randn('state',0);
17 |
18 | tic; fprintf('Making sparse... ')
19 | A = sprandn(M,N,P);
20 | toc
21 |
22 | % remove unwanted precision
23 | [i j v] = find(A);
24 | v = double(single(v));
25 | A = sparse(i,j,v,M,N);
26 |
27 | tic; fprintf('Converting to gpuSparse... ')
28 | a = gpuSparse(A); validate(a)
29 | toc
30 |
31 | fprintf('Sorted index conversion to gpuSparse: ')
32 | [i j v] = find(a);
33 | tic;
34 | b = gpuSparse(i,j,v,M,N); validate(b)
35 | fprintf('errors = [%i %i %i]. ',any(a.row~=b.row),any(a.col~=b.col),any(a.val~=b.val))
36 | toc
37 |
38 | fprintf('Unsorted index conversion to gpuSparse: ')
39 | k = randperm(numel(v));
40 | i = i(k);
41 | j = j(k);
42 | v = v(k);
43 | tic;
44 | b = gpuSparse(i,j,v,M,N); validate(b)
45 | fprintf('errors = [%i %i %i]. ',any(a.row~=b.row),any(a.col~=b.col),any(a.val~=b.val))
46 | toc
47 |
48 | x = randn(N,1,'gpuArray');
49 | y = randn(M,1,'gpuArray');
50 |
51 | % remove unwanted precision
52 | x = double(single(x));
53 | y = double(single(y));
54 |
55 | %% Expected failures (bounds etc)
56 | disp('---CATCH ERRORS---')
57 |
58 | try; gpuSparse('test'); warning('failed'); end
59 | try; gpuSparse(rand(3,3,3)); warning('failed'); end
60 | try; gpuSparse(1,-1); warning('failed'); end
61 | try; gpuSparse(-1,1); warning('failed'); end
62 | try; gpuSparse(1,Inf); warning('failed'); end
63 | try; gpuSparse(1,NaN); warning('failed'); end
64 | try; gpuSparse(Inf,1); warning('failed'); end
65 | try; gpuSparse(NaN,1); warning('failed'); end
66 | try; gpuSparse(intmax('int32'),1); warning('failed'); end
67 | try; gpuSparse(1,intmax('int32')); warning('failed'); end
68 | try; gpuSparse(1,-1,0); warning('failed'); end
69 | try; gpuSparse(-1,1,0); warning('failed'); end
70 | try; gpuSparse(1,Inf,0); warning('failed'); end
71 | try; gpuSparse(1,NaN,0); warning('failed'); end
72 | try; gpuSparse(Inf,1,0); warning('failed'); end
73 | try; gpuSparse(NaN,1,0); warning('failed'); end
74 | try; gpuSparse(intmax('int32'),1,0); warning('failed'); end
75 | try; gpuSparse(1,intmax('int32'),0); warning('failed'); end
76 | try; gpuSparse(1,1,'test'); warning('failed'); end
77 | try; gpuSparse(1:2,1:1,1:2); warning('failed'); end
78 | try; gpuSparse(1:1,1:2,1:2); warning('failed'); end
79 | %try; gpuSparse(1:2,1:2,1:1); warning('failed'); end % this works... why is it here?!
80 | try; gpuSparse(1:1,1:1,1:2); warning('failed'); end
81 | try; gpuSparse(1,1,1,10,0); warning('failed'); end
82 | try; gpuSparse(1,1,1,0,10); warning('failed'); end
83 | try; gpuSparse(1,1,1,10,intmax('int32')); warning('failed'); end
84 | try; gpuSparse(1,1,1,intmax('int32'),10); warning('failed'); end
85 | try; gpuSparse(1,1,10,10,'test'); warning('failed'); end
86 | try; gpuSparse(1,1,10,'test',10); warning('failed'); end
87 | try; gpuSparse(10,10,1,10,9); warning('failed'); end
88 | try; gpuSparse(10,10,1,9,10); warning('failed'); end
89 | try; gpuSparse(10,10,1,10,10,-1); warning('failed'); end
90 | try; gpuSparse(10,10,1,10,10,Inf); warning('failed'); end
91 | try; gpuSparse(1.5,1,1,10,10,1); warning('failed'); end
92 | try; gpuSparse(1,1.5,1,10,10,1); warning('failed'); end
93 | try; gpuSparse(1,1,1,10.5,10,1); warning('failed'); end
94 | try; gpuSparse(1,1,1,10,10.5,1); warning('failed'); end
95 | try; gpuSparse(1,1,1,10,10,1.5); warning('failed'); end
96 | try; gpuSparse(1,1,1,10:11,10,1); warning('failed'); end
97 | try; gpuSparse(1,1,1,10,10:11,1); warning('failed'); end
98 | try; gpuSparse(1,1,1,10,10,1:2); warning('failed'); end
99 |
100 | %% accuracy
101 | disp('---ACCURACY---')
102 |
103 | disp([' Ax ' num2str(norm(A*x-a*single(x),Inf))])
104 | disp([' A''*y ' num2str(norm(A'*y-a'*single(y),Inf))])
105 |
106 | B = sprandn(M,N,P);
107 |
108 | % remove unwanted precision
109 | [i j v] = find(B);
110 | v = double(single(v));
111 | B = sparse(i,j,v,M,N);
112 |
113 | b = gpuSparse(B); validate(b)
114 |
115 | C=A+B;
116 | c=a+b; validate(c)
117 | disp(['(A+B)x ' num2str(norm(C*x-c*single(x),Inf))])
118 | disp(['(A+B)''*y ' num2str(norm(C'*y-c'*single(y),Inf))])
119 |
120 | C=A-B;
121 | c=a-b; validate(c)
122 | disp(['(A-B)x ' num2str(norm(C*x-c*single(x),Inf))])
123 | disp(['(A-B)''*y ' num2str(norm(C'*y-c'*single(y),Inf))])
124 |
125 | d = a - (a')'; validate(d)
126 | disp(['max(a-a'''') ' num2str(max(d.val))])
127 | disp(['min(a-a'''') ' num2str(min(d.val))])
128 |
129 | d = a - full_transpose(full_transpose(a)); validate(d)
130 | disp(['max(a-a'''') ' num2str(max(d.val)) ' (full_transpose)'])
131 | disp(['min(a-a'''') ' num2str(min(d.val)) ' (full_transpose)'])
132 |
133 | B = double(single(randn(N,3)));
134 | b = gpuArray(B);
135 |
136 | C = A*B;
137 | c = a*single(b);
138 | disp(['(A*B-a*b) ' num2str(norm(C-c,Inf))])
139 |
140 | B = double(single(randn(M,4)));
141 | b = gpuArray(B);
142 |
143 | C = A'*B;
144 | c = a'*single(b);
145 | disp(['(A''*B-a''*b) ' num2str(norm(C-c,Inf))])
146 |
147 | %% miscellaneous operations
148 |
149 | disp('---MISCELLANEOUS---')
150 |
151 | % mixed real/complex multiplies: give A a random imaginary part with the same sparsity pattern
152 |
153 | A = A + 1i*sprandn(A);
154 |
155 | % remove unwanted precision
156 | [i j v] = find(A);
157 | v = double(single(v));
158 | A = sparse(i,j,v,M,N);
159 |
160 | a = gpuSparse(A); validate(a)
161 |
162 | x = single(randn(N,1,'gpuArray') + 1i*randn(N,1,'gpuArray'));
163 | y = single(randn(M,1,'gpuArray') + 1i*randn(M,1,'gpuArray'));
164 |
165 | disp('real multiply')
166 | disp(norm(real(A)*double(real(x)) - real(a)*real(x),Inf))
167 | disp(norm(real(A')*double(real(y)) - real(a')*real(y),Inf))
168 | disp(norm(real(A.')*double(real(y)) - real(a.')*real(y),Inf))
169 |
170 | disp('complex multiply')
171 | disp(norm(A*double(x) - a*x,Inf))
172 | disp(norm(A'*double(y) - a'*y,Inf))
173 | disp(norm(A.'*double(y) - a.'*y,Inf))
174 |
175 | disp('mixed real/complex multiply')
176 | disp(norm(A*real(double(x)) - a*real(x),Inf))
177 | disp(norm(real(A)*double(x) - real(a)*x,Inf))
178 | disp(norm(A'*real(double(y)) - a'*real(y),Inf))
179 | disp(norm(A.'*real(double(y)) - a.'*real(y),Inf))
180 | disp(norm(real(A')*double(y) - real(a')*y,Inf))
181 | disp(norm(real(A.')*double(y) - real(a.')*y,Inf))
182 |
183 | disp('max')
184 | disp(norm(full(max(A,[],2)) - max(a,[],2)))
185 |
186 | disp('sum')
187 | disp(norm(sum(A,1) - sum(a,1),inf))
188 | disp(norm(sum(A,2) - sum(a,2),inf))
189 |
190 | disp('norm')
191 | disp(norm(A,1) - norm(a,1))
192 | disp(norm(A,inf) - norm(a,inf))
193 | disp(norm(A,'fro') - norm(a,'fro'))
194 |
195 | disp('sparse');
196 | disp(norm(sparse(a)-A,inf));
197 | disp(norm(sparse(a')-A',inf));
198 | disp(norm(sparse(a.')-A.',inf));
199 | disp('full_transpose(a)')
200 | at = full_transpose(a); validate(at);
201 | disp(norm(sparse(at)-A.',inf))
202 | disp('full_ctranspose(a)')
203 | at = full_ctranspose(a); validate(at);
204 | disp(norm(sparse(at)-A',inf))
205 | disp('full_transpose(a.'')')
206 | att = full_transpose(a.'); validate(att);
207 | disp(norm(sparse(att)-(A.').',inf))
208 | disp('full_transpose(a'')')
209 | att = full_transpose(a'); validate(att);
210 | disp(norm(sparse(att)-(A').',inf))
211 | disp('full_ctranspose(a.'')')
212 | att = full_ctranspose(a.'); validate(att);
213 | disp(norm(sparse(att)-(A.')',inf))
214 | disp('full_ctranspose(a'')')
215 | att = full_ctranspose(a'); validate(att);
216 | disp(norm(sparse(att)-(A')',inf))
217 |
218 | disp('find')
219 | [i j v] = find(A); [i2 j2 v2] = find(a);
220 | fprintf(' %g %g %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2))
221 | [i j v] = find(A'); [i2 j2 v2] = find(a');
222 | fprintf(' %g %g %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2))
223 | [i j v] = find(A.'); [i2 j2 v2] = find(a.');
224 | fprintf(' %g %g %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2))
225 |
226 | % disabled: gpuSparse stores values in CSR (row-major) order but Matlab sparse is column-major, so nonzeros returns elements in a different order
227 | %disp('nonzeros')
228 | %disp(norm(nonzeros(A)-nonzeros(a),inf))
229 | %disp(norm(nonzeros(A')-nonzeros(a'),inf))
230 | %disp(norm(nonzeros(A.')-nonzeros(a.'),inf))
231 |
232 | disp('addition')
233 | B = sprandn(M,N,P);
234 |
235 | [i j v] = find(B); % remove unwanted precision
236 | v = double(single(v));
237 | B = sparse(i,j,v,M,N);
238 |
239 | b = gpuSparse(B); validate(b)
240 |
241 | A = real(A); B = real(B);
242 | a = real(a); validate(a);
243 | b = real(b); validate(b);
244 | c = a+b; validate(c);
245 |
246 | disp(norm((A+B) - sparse(a+b),Inf))
247 | disp(norm((A'+B') - sparse(a'+b'),Inf))
248 | disp(norm((A.'+B.') - sparse(a.'+b.'),Inf))
249 | disp(norm((A+B)' - sparse((a+b)'),Inf))
250 |
251 | disp('cat')
252 | C = [A;B];
253 | c = [a;b];
254 | disp(norm(C-c,'fro'))
255 | C = [A B];
256 | c = [a b];
257 | disp(norm(C-c,'fro'))
258 |
259 | %% timings
260 | disp('---TIMINGS---')
261 |
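% each operation runs 20 times per tic/toc; wait(gpuDevice) forces queued GPU work to complete so elapsed times reflect execution rather than kernel launch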
262 | for j = 1:2
263 |
264 | A = gather(A);
265 | x = gather(x);
266 | y = gather(y);
267 |
268 | x = double(x);
269 | y = double(y);
270 |
271 | % pass 1 tests matrix-vector (mv) multiply; pass 2 tests matrix-matrix (mm) multiply
272 | if j==1
273 | fprintf('\n============= Matrix-vector multiply =============\n');
274 | else
275 | x = repmat(x,1,5);
276 | y = repmat(y,1,5);
277 | fprintf('\n========= Matrix-matrix multiply (cols %i) =========\n',size(x,2));
278 | end
279 |
280 | tic; fprintf('A*x (sparse) : ')
281 | for k = 1:20
282 | z = A*x; wait(gpuDevice);
283 | end
284 | toc;
285 |
286 | AT = A';
287 | tic; fprintf('AT*y (sparse) : ')
288 | for k = 1:20
289 | z = AT*y; wait(gpuDevice);
290 | end
291 | toc;
292 |
293 | tic; fprintf('A''*y (sparse) : ')
294 | for k = 1:20
295 | z = A'*y; wait(gpuDevice);
296 | end
297 | toc;
298 |
299 | A = gpuArray(A);
300 | x = gpuArray(x);
301 | y = gpuArray(y);
302 |
303 | tic; fprintf('\nA*x (gpuArray) : ')
304 | for k = 1:20
305 | z = A*x; wait(gpuDevice);
306 | end
307 | toc;
308 |
309 | AT = A';
310 | tic; fprintf('AT*y (gpuArray) : ')
311 | for k = 1:20
312 | z = AT*y; wait(gpuDevice);
313 | end
314 | toc;
315 |
316 | tic; fprintf('A''*y (gpuArray) : ')
317 | for k = 1:20
318 | z = A'*y; wait(gpuDevice);
319 | end
320 | toc;
321 |
322 | a = gpuSparse(A); validate(a)
323 | x = single(x);
324 | y = single(y);
325 |
326 | tic; fprintf('\nA*x (gpuSparse): ')
327 | for k = 1:20
328 | z = a*x; wait(gpuDevice);
329 | end
330 | toc;
331 |
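% full_transpose materializes an explicit transposed copy, whereas ' (timed below) only sets the lazy transpose flag passed to cuSPARSE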
332 | at = full_transpose(a); validate(at)
333 | tic; fprintf('AT*y (gpuSparse): ')
334 | for k = 1:20
335 | z = at*y; wait(gpuDevice);
336 | end
337 | toc;
338 |
339 | tic; fprintf('A''*y (gpuSparse): ')
340 | for k = 1:20
341 | z = a'*y; wait(gpuDevice);
342 | end
343 | toc;
344 |
345 | end
--------------------------------------------------------------------------------