├── README.md ├── gpuSparse.m ├── private ├── coo2csr.cu ├── coosortByRow.cu ├── csr2coo.cu ├── csr2csc.cu ├── csr2csc_cpu.cu ├── csrgeam.cu ├── csrmm.cu ├── csrmv.cu ├── csrsort.cu ├── mex_all.m ├── mxShowCriticalErrorMessage.h └── wrappers_to_cuda_11.h └── test_gpuSparse.m /README.md: -------------------------------------------------------------------------------- 1 | # gpuSparse 2 | 3 | Matlab mex wrappers to NVIDIA cuSPARSE (https://developer.nvidia.com/cusparse). 4 | 5 | 6 | Uses int32 and single precision to save memory (Matlab sparse uses int64 and double). 7 | 8 | 9 | ## Installation 10 | 11 | 12 | 1. Save in a folder called @gpuSparse on the Matlab path 13 | 14 | 2. ```A = gpuSparse('recompile')``` to trigger compilation of mex 15 | 16 | 3. Recommended: CUDA-11 for much faster transpose-multiply 17 | 18 | ## Timings 19 |
20 | Due to memory layout (row- versus column-major), multiply and transpose-multiply differ in performance.
21 | 
22 | size(A) = 221,401 x 213,331
23 | nnz(A)  = 23,609,791 (0.05%)
24 | AT      = precomputed transpose of A
25 | 
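For reference: int32 indices + single values cost roughly 8 bytes per nonzero
(4 for the value, 4 for the column index) plus a 4*(nrows+1)-byte row pointer,
about half of MATLAB's double/int64 sparse storage. A sketch of how such
timings can be reproduced (illustrative only; not the repository's benchmark
script):

  x = rand(size(A,2),1,'single','gpuArray');
  y = rand(size(A,1),1,'single','gpuArray');
  tic; A*x;  toc   % multiply
  tic; AT*y; toc   % precomputed transpose
  tic; A'*y; toc   % transpose-multiply
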
26 | CPU sparse
27 | A*x  (sparse)   : Elapsed time is 1.370207 seconds.
28 | AT*y (sparse)   : Elapsed time is 1.347447 seconds.
29 | A'*y (sparse)   : Elapsed time is 0.267259 seconds.
30 | 
31 | GPU sparse
32 | A*x  (gpuArray) : Elapsed time is 0.137195 seconds.
33 | AT*y (gpuArray) : Elapsed time is 0.106331 seconds.
34 | A'*y (gpuArray) : Elapsed time is 0.232057 seconds. (CUDA 11)
35 | A'*y (gpuArray) : Elapsed time is 16.733638 seconds.
36 | 
37 | GPU gpuSparse
38 | A*x  (gpuSparse): Elapsed time is 0.068451 seconds.
39 | AT*y (gpuSparse): Elapsed time is 0.063651 seconds.
40 | A'*y (gpuSparse): Elapsed time is 0.059236 seconds. (CUDA 11)
41 | A'*y (gpuSparse): Elapsed time is 3.094271 seconds.
42 | 
43 | -------------------------------------------------------------------------------- /gpuSparse.m: -------------------------------------------------------------------------------- 1 | classdef gpuSparse 2 | %% 3 | % Sparse GPU array class (mex wrappers to cuSPARSE) 4 | % using int32 indices and single precision values. 5 | % 6 | % Usage: A = gpuSparse(row,col,val,nrows,ncols,nzmax) 7 | % 8 | % To recompile mex call gpuSparse('recompile') 9 | % 10 | % The nzmax argument can be used to check sufficient 11 | % memory: gpuSparse([],[],[],nrows,ncols,nzmax) 12 | % 13 | %% 14 | properties (SetAccess = private) %immutable) 15 | 16 | nrows(1,1) int32 % number of rows 17 | ncols(1,1) int32 % number of columns 18 | 19 | end 20 | 21 | properties (SetAccess = private, Hidden = true) 22 | 23 | row(:,1) gpuArray % int32 row index (CSR format) 24 | col(:,1) gpuArray % int32 column index 25 | val(:,1) gpuArray % single precision values 26 | trans(1,1) int32 % lazy transpose flag (passed to cuSPARSE) 27 | % 0 = CUSPARSE_OPERATION_NON_TRANSPOSE 28 | % 1 = CUSPARSE_OPERATION_TRANSPOSE 29 | % 2 = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE 30 | 31 | end 32 | 33 | %% 34 | methods 35 | 36 | %% constructor: same syntax as matlab's sparse 37 | function A = gpuSparse(row,col,val,nrows,ncols,nzmax) 38 | 39 | % empty gpuSparse matrix 40 | if nargin==0 41 | row = []; col = []; val = []; 42 | end 43 | 44 | % expecting a matrix, return gpuSparse ("row" is the first argument) 45 | if nargin==1 46 | if isa(row,'gpuSparse'); A = row; return; end % return unchanged 47 | if isequal(row,'recompile'); mex_all; return; end % recompile mex 48 | if ~isnumeric(row) && ~islogical(row); error('Cannot convert ''%s'' to gpuSparse.',class(row)); end 49 | if ~ismatrix(row); error('Cannot convert ND array to gpuSparse.'); end 50 | [nrows ncols] = size(row); 51 | [row col val] = find(row); % if sparse, could grab the CSR vectors directly but needs mex = hassle 52 | end 53 | 54 | % empty m x n matrix 55 | if nargin==2 56 | nrows = row; ncols = col; 57 | row = []; col = []; val = []; 58 | end 59 | 60 | % catch illegal no. 
arguments 61 | if nargin==4 || nargin>6 62 | error('Wrong number of arguments.'); 63 | end 64 | 65 | % validate argument types 66 | validateattributes(row,{'numeric','gpuArray'},{'integer'},'','row'); 67 | validateattributes(col,{'numeric','gpuArray'},{'integer'},'','col'); 68 | validateattributes(val,{'numeric','gpuArray','logical'},{},'','val'); 69 | 70 | % check vector lengths 71 | row = reshape(row,[],1); 72 | col = reshape(col,[],1); 73 | val = reshape(val,[],1); 74 | if numel(row)~=numel(col) 75 | error('Vectors must be the same length (row=%i col=%i).',numel(row),numel(col)); 76 | end 77 | if numel(val)~=numel(row) 78 | if numel(val)==1 79 | val = repmat(val,numel(row),1); 80 | else 81 | error('Vectors must be the same length (row=%i val=%i).',numel(row),numel(val)); 82 | end 83 | end 84 | 85 | % check bounds of indices 86 | if numel(row) > 0 87 | A.nrows = gather(max(row)); 88 | if min(row)<1 || A.nrows==intmax('int32') 89 | error('row indices must be between 1 and %i.',intmax('int32')-1); 90 | end 91 | A.ncols = gather(max(col)); 92 | if min(col)<1 || A.ncols==intmax('int32') 93 | error('col indices must be between 1 and %i.',intmax('int32')-1); 94 | end 95 | end 96 | 97 | % check and apply user-supplied matrix dims 98 | if exist('nrows','var') 99 | nrows = gather(nrows); 100 | validateattributes(nrows,{'numeric'},{'scalar','integer','>=',A.nrows,'<',intmax('int32')},'','nrows'); 101 | A.nrows = nrows; 102 | end 103 | if exist('ncols','var') 104 | ncols = gather(ncols); 105 | validateattributes(ncols,{'numeric'},{'scalar','integer','>=',A.ncols,'<',intmax('int32')},'','ncols'); 106 | A.ncols = ncols; 107 | end 108 | 109 | % simple memory check - needs work 110 | if ~exist('nzmax','var') 111 | nzmax = numel(val); 112 | else 113 | nzmax = gather(nzmax); 114 | validateattributes(nzmax,{'numeric'},{'scalar','integer','>=',numel(val)},'','nzmax'); 115 | end 116 | RequiredMemory = 4*double(A.nrows+1)/1E9; 117 | RequiredMemory = RequiredMemory+4*double(nzmax)/1E9; 118 | RequiredMemory = RequiredMemory+4*double(nzmax)/1E9; 119 | AvailableMemory = getfield(gpuDevice(),'AvailableMemory') / 1E9; 120 | if RequiredMemory > AvailableMemory 121 | error('Not enough memory (%.1fGb required, %.1fGb available).',RequiredMemory,AvailableMemory); 122 | end 123 | 124 | % cast to required class 125 | row = int32(row); 126 | col = int32(col); 127 | val = single(val); 128 | 129 | % sort row and col for COO to CSR conversion (MATLAB version) 130 | %[B I] = sortrows([row col]); 131 | %A.row = B(:,1); 132 | %A.col = B(:,2); 133 | %A.val = val(I); 134 | %clear B I row col val 135 | 136 | % sort row and col for COO to CSR conversion (CUDA version) 137 | try 138 | [A.row A.col A.val] = coosortByRow(row,col,val,A.nrows,A.ncols); 139 | catch ME 140 | error('%s Try gpuSparse(''recompile'') to recompile mex.',ME.message); 141 | end 142 | 143 | % convert from COO to CSR 144 | A.row = coo2csr(A.row,A.nrows); 145 | 146 | end 147 | 148 | %% enforce some class properties - inexpensive checks only 149 | function A = set.row(A,row) 150 | if ~iscolumn(row) || ~isequal(classUnderlying(row),'int32') 151 | error('Property row must be a column vector of int32s.') 152 | end 153 | A.row = row; 154 | end 155 | function A = set.col(A,col) 156 | if ~iscolumn(col) || ~isequal(classUnderlying(col),'int32') 157 | error('Property col must be a column vector of int32s.') 158 | end 159 | A.col = col; 160 | end 161 | function A = set.val(A,val) 162 | if ~iscolumn(val) || ~isequal(classUnderlying(val),'single') 163 | error('Property val 
must be a column vector of singles.') 164 | end 165 | A.val = val; 166 | end 167 | function A = set.trans(A,trans) 168 | if trans~=0 && trans~=1 && trans~=2 169 | error('Property trans must be 0, 1 or 2.') 170 | end 171 | if isreal(A) && trans==2 172 | error('Real matrix trans flag must be 0 or 1'); 173 | end 174 | A.trans = trans; 175 | end 176 | 177 | %% validation - helpful for testing 178 | function validate(A) 179 | 180 | message = 'Validation failure.'; 181 | 182 | % fast checks 183 | if ~isa(A.nrows,'int32'); error(message); end 184 | if ~isa(A.ncols,'int32'); error(message); end 185 | if ~isa(A.trans,'int32'); error(message); end 186 | if ~isa(A.row,'gpuArray'); error(message); end 187 | if ~isa(A.col,'gpuArray'); error(message);end 188 | if ~isa(A.val,'gpuArray'); error(message); end 189 | if ~isequal(classUnderlying(A.row),'int32'); error(message); end 190 | if ~isequal(classUnderlying(A.col),'int32'); error(message); end 191 | if ~isequal(classUnderlying(A.val),'single'); error(message); end 192 | if A.nrows < 0; error(message); end 193 | if A.ncols < 0; error(message); end 194 | if A.nrows == intmax('int32'); error(message); end 195 | if A.ncols == intmax('int32'); error(message); end 196 | if ~iscolumn(A.row); error(message); end 197 | if ~iscolumn(A.col); error(message); end 198 | if ~iscolumn(A.val); error(message); end 199 | if numel(A.col) ~= numel(A.val); error(message); end 200 | if numel(A.row) ~= A.nrows+1; error(message); end 201 | if A.row(1) ~= 1; error(message); end 202 | if A.row(end) ~= numel(A.val)+1; error(message); end 203 | if A.trans~=0 && A.trans~=1 && A.trans~=2; error(message); end 204 | if isreal(A) && A.trans==2; error(message); end 205 | 206 | % slow checks 207 | if numel(A.val) > 0 208 | if min(A.col) < 1; error(message); end 209 | if max(A.col) > A.ncols; error(message); end 210 | rowcol = gather([csr2coo(A.row,A.nrows) A.col]); 211 | if ~issorted(rowcol,'rows'); error(message); end 212 | end 213 | 214 | end 215 | 216 | %% overloaded functions 217 | 218 | % isreal 219 | function retval = isreal(A) 220 | retval = isreal(A.val); 221 | end 222 | 223 | % real 224 | function A = real(A) 225 | A.val = real(A.val); 226 | if A.trans==2; A.trans = 1; end 227 | A = drop_zeros(A); 228 | end 229 | 230 | % imag 231 | function A = imag(A) 232 | A.val = imag(A.val); 233 | if A.trans==2; A.trans = 1; end 234 | A = drop_zeros(A); 235 | end 236 | 237 | % abs 238 | function A = abs(A) 239 | A.val = abs(A.val); 240 | if A.trans==2; A.trans = 1; end 241 | end 242 | 243 | % angle 244 | function A = angle(A) 245 | A.val = angle(A.val); 246 | if A.trans==2; A.trans = 1; end 247 | A = drop_zeros(A); 248 | end 249 | 250 | % conj 251 | function A = conj(A) 252 | A.val = conj(A.val); 253 | end 254 | 255 | % sign 256 | function A = sign(A) 257 | A.val = sign(A.val); 258 | if A.trans==2; A.trans = 1; end 259 | end 260 | 261 | % complex 262 | function A = complex(A) 263 | A.val = complex(A.val); 264 | end 265 | 266 | % classUnderlying 267 | function str = classUnderlying(A) 268 | str = classUnderlying(A.val); 269 | end 270 | 271 | % gt (only support scalar) 272 | function A = gt(A,tol); 273 | if ~isscalar(tol) 274 | error('Non-scalar argument not supported.'); 275 | end 276 | A.val = cast(A.val > tol,classUnderlying(A)); 277 | if A.trans==2; A.trans = 1; end 278 | A = drop_zeros(A); 279 | end 280 | 281 | % lt (only support scalar) 282 | function A = lt(A,tol); 283 | if ~isscalar(tol) 284 | error('Non-scalar argument not supported.'); 285 | end 286 | A.val = cast(A.val < 
tol,classUnderlying(A)); 287 | if A.trans==2; A.trans = 1; end 288 | A = drop_zeros(A); 289 | end 290 | 291 | % eq (only support scalar) 292 | function A = eq(A,tol); 293 | if ~isscalar(tol) 294 | error('Non-scalar argument not supported.'); 295 | end 296 | A.val = cast(A.val == tol,classUnderlying(A)); 297 | if A.trans==2; A.trans = 1; end 298 | A = drop_zeros(A); 299 | end 300 | 301 | % nnz 302 | function retval = nnz(A) 303 | retval = nnz(A.val); 304 | end 305 | 306 | % length 307 | function retval = length(A) 308 | retval = max(size(A)); 309 | end 310 | 311 | % nzmax 312 | function retval = nzmax(A) 313 | retval = numel(A.val); 314 | end 315 | 316 | % mean: only A and DIM args are supported 317 | function retval = mean(A,DIM) 318 | if nargin==1; DIM = 1; end 319 | retval = sum(A,DIM) / size(A,DIM); 320 | end 321 | 322 | % nonzeros 323 | function val = nonzeros(A) 324 | val = nonzeros(A.val); 325 | if A.trans==2 326 | val = conj(val); 327 | end 328 | end 329 | 330 | % sum: only A and DIM args are supported 331 | function retval = sum(A,DIM) 332 | if nargin==1 333 | DIM = 1; 334 | else 335 | validateattributes(DIM,{'numeric'},{'integer','positive'},'','DIM') 336 | end 337 | if numel(A)==0 338 | retval = sum(zeros(size(A)),DIM); 339 | retval = gpuSparse(retval); 340 | else 341 | switch DIM 342 | case 1; retval =(A'* ones(size(A,1),1,'like',A.val))'; 343 | case 2; retval = A * ones(size(A,2),1,'like',A.val); 344 | otherwise; retval = A; 345 | end 346 | end 347 | end 348 | 349 | % norm: support same types as sparse 350 | function retval = norm(A,p); 351 | if nargin<2; p = 2; end 352 | if isvector(A) 353 | retval = norm(A.val,p); 354 | else 355 | if isequal(p,2) 356 | error('gpuSparse norm(A,2) is not supported.'); 357 | elseif isequal(p,1) 358 | retval = max(sum(abs(A),1)); 359 | elseif isequal(p,Inf) 360 | retval = max(sum(abs(A),2)); 361 | elseif isequal(p,'fro'); 362 | retval = norm(A.val); 363 | else 364 | error('The only matrix norms supported are 1, 2, inf, and ''fro''.'); 365 | end 366 | end 367 | end 368 | 369 | % max: support for max(A,[],2) only 370 | function retval = max(A,Y,DIM); 371 | if nargin ~= 3 || ~isempty(Y) || ~isequal(DIM,2) 372 | error('Only 3 argument form supported: max(A,[],2).'); 373 | end 374 | if A.trans 375 | error('Transpose max not supported - try full_transpose(A).') 376 | end 377 | 378 | % do it on CPU to reduce transfer overhead 379 | row = gather(A.row); 380 | val = gather(A.val); 381 | retval = zeros(A.nrows,1,'like',val); 382 | 383 | for j = 1:A.nrows 384 | k = row(j):row(j+1)-1; 385 | if ~isempty(k) 386 | retval(j) = max(val(k)); 387 | end 388 | end 389 | end 390 | 391 | % size 392 | function varargout = size(A,DIM) 393 | if A.trans==0 394 | m = double(A.nrows); 395 | n = double(A.ncols); 396 | else 397 | n = double(A.nrows); 398 | m = double(A.ncols); 399 | end 400 | if nargin>1 401 | if nargout>1 402 | error('too many output arguments.'); 403 | end 404 | if ~isscalar(DIM) || DIM<=0 || mod(DIM,1) 405 | error('Dimension argument must be a positive integer scalar.') 406 | elseif DIM==1 407 | varargout{1} = m; 408 | elseif DIM==2 409 | varargout{1} = n; 410 | else 411 | varargout{1} = 1; 412 | end 413 | else 414 | if nargout==0 || nargout==1 415 | varargout{1} = [m n]; 416 | else 417 | varargout{1} = m; 418 | varargout{2} = n; 419 | for k = 3:nargout 420 | varargout{k} = 1; 421 | end 422 | end 423 | end 424 | end 425 | 426 | % find: returns indices on the GPU (not efficient, mainly for debugging) 427 | function varargout = find(A) 428 | if nargin>1; 
error('only 1 input argument supported'); end 429 | if nargout>3; error('too many ouput arguments'); end 430 | 431 | % COO format on GPU 432 | i = csr2coo(A.row,A.nrows); 433 | j = A.col; 434 | v = A.val; 435 | 436 | % remove explicit zeros 437 | nz = (v ~= 0); 438 | i = i(nz); 439 | j = j(nz); 440 | v = v(nz); 441 | 442 | % MATLAB style, double precision, sorted columns 443 | if A.trans 444 | [i j] = deal(j,i); 445 | else 446 | [~,k] = sortrows([j i]); 447 | i = i(k); 448 | j = j(k); 449 | end 450 | i = double(i); 451 | j = double(j); 452 | 453 | if nargout==0 || nargout==1 454 | varargout{1} = sub2ind(size(A),i,j); 455 | else 456 | varargout{1} = i; 457 | varargout{2} = j; 458 | end 459 | if nargout==3 460 | if A.trans==0; varargout{3} = v(k); end 461 | if A.trans==1; varargout{3} = v; end 462 | if A.trans==2; varargout{3} = conj(v); end 463 | end 464 | end 465 | 466 | % add: C = A+B 467 | function C = plus(A,B) 468 | C = geam(A,B,1,1); 469 | end 470 | 471 | % minus: C = A-B 472 | function C = minus(A,B) 473 | C = geam(A,B,1,-1); 474 | end 475 | 476 | % csrgeam: C = a*A + b*B 477 | function C = geam(A,B,a,b) 478 | A = gpuSparse(A); 479 | B = gpuSparse(B); 480 | if ~isequal(size(A),size(B)) 481 | error('Matrices must be the same size.') 482 | end 483 | if ~isreal(A) || ~isreal(B) 484 | error('Complex addition not supported at the moment.') 485 | end 486 | if A.trans ~= B.trans 487 | error('Matrix addition with lazy transpose not fully supported.') 488 | end 489 | validateattributes(a,{'numeric'},{'real','scalar','finite'},'','a'); 490 | validateattributes(b,{'numeric'},{'real','scalar','finite'},'','b'); 491 | if A.trans 492 | [n m] = size(A); 493 | else 494 | [m n] = size(A); 495 | end 496 | C = gpuSparse(m,n); 497 | C.trans = A.trans; 498 | [C.row C.col C.val] = csrgeam(A.row,A.col,A.val,m,n,B.row,B.col,B.val,a,b); 499 | end 500 | 501 | % mtimes: A*x (or x*A for scalar x) 502 | function y = mtimes(A,x) 503 | if isa(x,'gpuSparse') && ~isa(A,'gpuSparse') 504 | [A x] = deal(x,A); 505 | end 506 | if ~isnumeric(x) && islogical(x) 507 | error('Argument x must be numeric (%s not supported).',class(x)) 508 | elseif isscalar(x) && ~iscolumn(A) 509 | y = A; 510 | y.val = y.val * x; 511 | elseif isvector(x) 512 | if isreal(A) 513 | y = csrmv(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,x); 514 | else 515 | y = csrmv(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,complex(x)); 516 | end 517 | elseif ismatrix(x) 518 | if isreal(A) 519 | y = csrmm(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,x); 520 | else 521 | y = csrmm(A.row,A.col,A.val,A.nrows,A.ncols,A.trans,complex(x)); 522 | end 523 | end 524 | end 525 | 526 | % times: A.*x or x.*A (scalar x only) 527 | function A = times(A,x) 528 | if isa(x,'gpuSparse') && ~isa(A,'gpuSparse') 529 | [A x] = deal(x,A); 530 | end 531 | if ~isnumeric(x) && ~islogical(x) && ~isempty(x) 532 | error('Argument x must be numeric (%s not supported).',class(x)) 533 | elseif isscalar(x) && isfinite(x) 534 | A.val = A.val .* x; 535 | else 536 | error('Multiplication only supported for finite scalars.') 537 | end 538 | end 539 | 540 | % divide: A./x 541 | function A = rdivide(A,x) 542 | if isa(x,'gpuSparse') 543 | error('Division by gpuSparse array not supported.'); 544 | end 545 | A = times(A,1./x); 546 | end 547 | 548 | % divide: A/x (scalar x only) 549 | function A = mrdivide(A,x) 550 | A = A./x; 551 | end 552 | 553 | % power: A.^x 554 | function A = power(A,x) 555 | if isa(x,'gpuSparse') || ~isscalar(x) 556 | error('Power A.^x only supported for gpuSparse A and scalar x.'); 557 | 
end 558 | A.val = A.val.^x; 559 | end 560 | 561 | % full transpose: A.' 562 | function AT = full_transpose(A) 563 | if A.trans 564 | AT = A; 565 | AT.trans = 0; 566 | if ~isreal(A) && A.trans==2 567 | AT.val = conj(AT.val); 568 | end 569 | else 570 | [m n] = size(A); 571 | AT = gpuSparse([],[],[],n,m,nnz(A)); 572 | 573 | if nnz(A) % cuSPARSE breaks if nnz==0 so avoid call 574 | if 1 % older cuSPARSE used excessive memory - seems OK now 575 | [AT.col AT.row AT.val] = csr2csc(A.row,A.col,A.val,m,n); 576 | else % cpu version 577 | row = gather(A.row); 578 | col = gather(A.col); 579 | val = gather(A.val); 580 | [col row val] = csr2csc_cpu(row,col,val,m,n); 581 | AT.col = gpuArray(col); 582 | AT.row = gpuArray(row); 583 | AT.val = gpuArray(val); 584 | end 585 | end 586 | end 587 | end 588 | 589 | % full ctranspose: A' 590 | function AT = full_ctranspose(A) 591 | if A.trans 592 | AT = A; 593 | AT.trans = 0; 594 | else 595 | AT = full_transpose(A); 596 | end 597 | if ~isreal(A) && A.trans~=2 598 | AT.val = conj(AT.val); 599 | end 600 | end 601 | 602 | % lazy transpose (flag): A.' 603 | function AT = transpose(A) 604 | AT = A; % lazy copy 605 | switch A.trans 606 | case 0; AT.trans = 1; 607 | case 1; AT.trans = 0; 608 | case 2; AT.trans = 0; AT.val = conj(AT.val); 609 | end 610 | end 611 | 612 | % lazy transpose (flag): A' 613 | function AT = ctranspose(A) 614 | AT = A; % lazy copy 615 | switch A.trans 616 | case 0; if isreal(A); AT.trans = 1; else; AT.trans = 2; end 617 | case 1; AT.trans = 0; if ~isreal(A); AT.val = conj(AT.val); end 618 | case 2; AT.trans = 0; 619 | end 620 | end 621 | 622 | % remove zeros from sparse matrix 623 | function A = drop_zeros(A,tol) 624 | if nargin<2 625 | nz = (A.val ~= 0); 626 | else 627 | validateattributes(tol,{'numeric'},{'nonnegative','scalar'},'','tol'); 628 | nz = abs(A.val) < tol; 629 | end 630 | if any(nz) 631 | A.row = csr2coo(A.row,A.nrows); 632 | A.row = A.row(nz); 633 | A.row = coo2csr(A.row,A.nrows); 634 | A.col = A.col(nz); 635 | A.val = A.val(nz); 636 | end 637 | end 638 | 639 | % sparse: returns sparse matrix on GPU 640 | function A_sp = sparse(A) 641 | [m n] = size(A); 642 | i = csr2coo(A.row,A.nrows); 643 | j = A.col; 644 | v = double(A.val); 645 | switch A.trans 646 | % int32 indices ok (2020a) 647 | case 0; A_sp = sparse(i,j,v,m,n); 648 | case 1; A_sp = sparse(j,i,v,m,n); 649 | case 2; A_sp = sparse(j,i,conj(v),m,n); 650 | end 651 | end 652 | 653 | % gather: returns sparse matrix on CPU - gather(sparse(A)) is faster but memory intensive 654 | function A_sp = gather(A) 655 | [m n] = size(A); 656 | i = gather(csr2coo(A.row,A.nrows)); 657 | j = gather(A.col); 658 | v = gather(double(A.val)); % double for sparse 659 | switch A.trans 660 | % sparse int32 indices ok (2020a) 661 | case 0; A_sp = sparse(i,j,v,m,n); 662 | case 1; A_sp = sparse(j,i,v,m,n); 663 | case 2; A_sp = sparse(j,i,conj(v),m,n); 664 | end 665 | end 666 | 667 | % full: returns full matrix on CPU (not efficient, mainly for debugging) 668 | function A_f = full(A) 669 | i = gather(csr2coo(A.row,A.nrows)); 670 | j = gather(A.col); 671 | v = gather(A.val); 672 | switch A.trans 673 | % sparse int32 indices ok (2020a) 674 | case 0; k = sub2ind(size(A),i,j); 675 | case 1; k = sub2ind(size(A),j,i); 676 | case 2; k = sub2ind(size(A),j,i); v = conj(v); 677 | end 678 | A_f = zeros(size(A),'like',v); 679 | A_f(k) = v; 680 | end 681 | 682 | % numel - should it be 1 object or prod(size(A)) elements? 
683 | function retval = numel(A) 684 | retval = prod(size(A)); 685 | end 686 | 687 | % cat 688 | function C = cat(dim,A,B) 689 | switch dim 690 | case 1; C = vertcat(A,B); 691 | case 2; C = horzcat(A,B); 692 | otherwise; error('Concatenation only supported for dim=1 or 2.'); 693 | end 694 | end 695 | 696 | % vertcat 697 | function C = vertcat(A,B) 698 | if ~isa(B,'gpuSparse') 699 | error('Concatenation only supported for gpuSparse.'); 700 | end 701 | if A.trans || B.trans 702 | error('Concatenation not supported with transpose.'); 703 | end 704 | if size(A,2)~=size(B,2) 705 | error('Concatenation requires number of cols be equal.'); 706 | end 707 | C = gpuSparse(size(A,1)+size(B,1),size(A,2)); 708 | C.row = [A.row;B.row(2:end)+numel(A.val)]; 709 | C.col = [A.col;B.col]; 710 | C.val = [A.val;B.val]; 711 | end 712 | 713 | % horzcat - possible to avoid csr2coo calls? 714 | function C = horzcat(A,B) 715 | if ~isa(B,'gpuSparse') || A.trans || B.trans 716 | error('Concatenation only supported for non-tranposed gpuSparse.'); 717 | end 718 | if A.trans || B.trans 719 | error('Concatenation not supported with transpose.'); 720 | end 721 | if size(A,1)~=size(B,1) 722 | error('Concatenation requires number of rows be equal.'); 723 | end 724 | i = [csr2coo(A.row,A.nrows);csr2coo(B.row,B.nrows)]; 725 | j = [A.col;B.col+size(A,2)]; 726 | v = [A.val;B.val]; 727 | C = gpuSparse(i,j,v,size(A,1),size(A,2)+size(B,2)); 728 | end 729 | 730 | % Mathworks suggested this to help fix . indexing 731 | function retval = numArgumentsFromSubscript(A, s, ic) 732 | retval = builtin('numArgumentsFromSubscript', A, s, ic); 733 | end 734 | 735 | % the following are hard - don't implement 736 | function retval = subsref(A,s) 737 | if isequal(s.type,'.') 738 | retval = A.(s.subs); 739 | else 740 | error('subsref not implemented.'); 741 | end 742 | end 743 | function retval = subsasgn(A,s,b) 744 | error('subsasgn not implemented.'); 745 | end 746 | function A = reshape(A,m,n) 747 | error('reshape not implemented.'); 748 | end 749 | end 750 | end 751 | -------------------------------------------------------------------------------- /private/coo2csr.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE format converter (coo2csr). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 
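// Illustrative example (hypothetical values, not from this repository): with
// 1-based indexing and nrows = 4, COO row indices [1 1 2 4] compress to the
// CSR row pointer [1 3 4 4 5]. Row j's entries then occupy positions
// row_csr[j] through row_csr[j+1]-1 (1-based), and the last entry is always
// nnz+1, which is what gpuSparse.m's validate() checks.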
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | // MATLAB related 20 | #include "mex.h" 21 | #include "gpu/mxGPUArray.h" 22 | #include "mxShowCriticalErrorMessage.h" 23 | 24 | // Input Arguments 25 | #define ROW prhs[0] 26 | #define NROWS prhs[1] 27 | 28 | // Output Arguments 29 | #define ROW_CSR plhs[0] 30 | 31 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 32 | { 33 | // Checks 34 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 35 | if (nrhs != 2) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 36 | 37 | // Initialize the MathWorks GPU API 38 | mxInitGPU(); 39 | 40 | // Create Matlab pointers on the GPU 41 | mxGPUArray const *row = mxGPUCreateFromMxArray(ROW); 42 | 43 | // Checks - note rows must be in COO (uncompressed) format 44 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 45 | if (mxGPUGetClassID(row) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW argument is not int32"); 46 | 47 | mwSize nrows = mxGetScalar(NROWS); 48 | mwSize nnz = mxGPUGetNumberOfElements(row); 49 | 50 | // Create space for output vector 51 | const mwSize ndim = 1; 52 | mwSize dims[ndim] = {nrows+1}; 53 | mxGPUArray *row_csr = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 54 | if (row_csr==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 55 | 56 | // Get handle to the CUBLAS context 57 | cublasHandle_t cublasHandle = 0; 58 | cublasStatus_t cublasStatus; 59 | cublasStatus = cublasCreate(&cublasHandle); 60 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 61 | 62 | // Get handle to the CUSPARSE context 63 | cusparseHandle_t cusparseHandle = 0; 64 | cusparseStatus_t cusparseStatus; 65 | cusparseStatus = cusparseCreate(&cusparseHandle); 66 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 67 | cusparseMatDescr_t descr = 0; 68 | cusparseStatus = cusparseCreateMatDescr(&descr); 69 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 70 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 71 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 72 | 73 | // Convert from matlab pointers to native pointers 74 | const int * const d_row = (int*)mxGPUGetDataReadOnly(row); 75 | int *d_row_csr = (int*)mxGPUGetData(row_csr); 76 | char message[128] = {'\0'}; 77 | int *buffer = NULL; 78 | 79 | // Call coo2csr - returns uninitialized when nnz==0 so need to handle separately 80 | if (nnz == 0) 81 | { 82 | buffer = (int *)mxMalloc((nrows+1)*sizeof(int)); 83 | if (buffer == NULL) mxShowCriticalErrorMessage("mxMalloc failed"); 84 | for (int j=0; j 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | // MATLAB related 20 | #include "mex.h" 21 | #include "gpu/mxGPUArray.h" 22 | #include "mxShowCriticalErrorMessage.h" 23 | 24 | // Input Arguments 25 | #define ROW prhs[0] 26 | #define COL prhs[1] 27 | #define VAL prhs[2] 28 | #define NROWS prhs[3] 29 | #define NCOLS prhs[4] 30 | 31 | // Output Arguments 32 | #define ROW_SORT plhs[0] 
33 | #define COL_SORT plhs[1] 34 | #define VAL_SORT plhs[2] 35 | 36 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 37 | { 38 | // Checks 39 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 40 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 41 | 42 | // Initialize the MathWorks GPU API 43 | mxInitGPU(); 44 | 45 | // Create Matlab pointers on the GPU 46 | mxGPUArray const *row = mxGPUCreateFromMxArray(ROW); 47 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL); 48 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL); 49 | 50 | // Checks - note vectors must be in COO (uncompressed) format 51 | int nnz = mxGPUGetNumberOfElements(val); 52 | if (mxGPUGetNumberOfElements(row) != nnz) mxShowCriticalErrorMessage("ROW and VAL argument length mismatch"); 53 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL and VAL argument length mismatch"); 54 | 55 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 56 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 57 | 58 | if (mxGPUGetClassID(row) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW argument is not int32"); 59 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 60 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 61 | 62 | int nrows = (int)mxGetScalar(NROWS); 63 | int ncols = (int)mxGetScalar(NCOLS); 64 | 65 | // Create space for output vectors 66 | const mwSize ndim = 1; 67 | mwSize dims[ndim]; 68 | 69 | dims[0] = nnz; 70 | mxGPUArray *row_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 71 | if (row_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 72 | 73 | mxGPUArray *col_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 74 | if (col_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 75 | 76 | mxComplexity ccx = mxGPUGetComplexity(val); 77 | mxGPUArray *val_sort = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES); 78 | if (val_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 79 | 80 | // Get handle to the CUBLAS context 81 | cublasHandle_t cublasHandle = 0; 82 | cublasStatus_t cublasStatus; 83 | cublasStatus = cublasCreate(&cublasHandle); 84 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 85 | 86 | // Get handle to the CUSPARSE context 87 | cudaError_t cudaStatus; 88 | cusparseStatus_t cusparseStatus; 89 | cusparseHandle_t cusparseHandle = 0; 90 | cusparseStatus = cusparseCreate(&cusparseHandle); 91 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 92 | cusparseMatDescr_t descr = 0; 93 | cusparseStatus = cusparseCreateMatDescr(&descr); 94 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 95 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 96 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 97 | 98 | // Convert from matlab pointers to native pointers 99 | const int * const d_row = (int*)mxGPUGetDataReadOnly(row); 100 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col); 101 | int *d_col_sort = (int*)mxGPUGetData(col_sort); 102 | int *d_row_sort = (int*)mxGPUGetData(row_sort); 103 | 104 | // Since sort is in-place, copy 
the read-only vectors to the read-write ones 105 | cudaStatus = cudaMemcpy((void *)d_row_sort, d_row, nnz*sizeof(int), cudaMemcpyDeviceToDevice); 106 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus); 107 | 108 | cudaStatus = cudaMemcpy((void *)d_col_sort, d_col, nnz*sizeof(int), cudaMemcpyDeviceToDevice); 109 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus); 110 | 111 | if (ccx == mxREAL) 112 | { 113 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val); 114 | float *d_val_sort = (float*)mxGPUGetData(val_sort); 115 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice); 116 | } 117 | else 118 | { 119 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 120 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort); 121 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(cuFloatComplex), cudaMemcpyDeviceToDevice); 122 | } 123 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus); 124 | 125 | // Sort by rows 126 | int *P = NULL; 127 | void *pBuffer = NULL; 128 | size_t pBufferSizeInBytes = 0; 129 | 130 | if (nnz > 0) 131 | { 132 | // step 1: allocate buffer 133 | cusparseStatus = cusparseXcoosort_bufferSizeExt(cusparseHandle, nrows, ncols, nnz, d_row, d_col, &pBufferSizeInBytes); 134 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoosort_bufferSizeExt failed",cusparseStatus); 135 | 136 | cudaStatus = cudaMalloc( &pBuffer, sizeof(char)*pBufferSizeInBytes); 137 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus); 138 | 139 | // step 2: setup permutation vector P to identity 140 | cudaStatus = cudaMalloc( &P, sizeof(int)*nnz); 141 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus); 142 | 143 | cusparseStatus = cusparseCreateIdentityPermutation(cusparseHandle, nnz, P); 144 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseCreateIdentityPermutation failed",cusparseStatus); 145 | 146 | // step 3: sort COO format by Row 147 | cusparseStatus = cusparseXcoosortByRow(cusparseHandle, nrows, ncols, nnz, d_row_sort, d_col_sort, P, pBuffer); 148 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoosortByRow failed",cusparseStatus); 149 | 150 | // step 4: gather sorted cooVals 151 | if (ccx == mxREAL) 152 | { 153 | float *d_val = (float*)mxGPUGetDataReadOnly(val); 154 | float *d_val_sort = (float*)mxGPUGetData(val_sort); 155 | #if CUDART_VERSION >= 11000 156 | cusparseHandle_t handle = NULL; 157 | cusparseDnVecDescr_t vec_values; 158 | cusparseSpVecDescr_t vec_permutation; 159 | cusparseCreate(&handle); 160 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_R_32F); 161 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); // MUST USE BASE_ZERO 162 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation); 163 | cusparseDestroyDnVec(vec_values); 164 | cusparseDestroySpVec(vec_permutation); 165 | cusparseDestroy(handle); 166 | #else 167 | cusparseStatus = cusparseSgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO 168 | #endif 169 | } 170 | else 171 | { 172 | cuFloatComplex *d_val = 
(cuFloatComplex*)mxGPUGetDataReadOnly(val); 173 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort); 174 | #if CUDART_VERSION >= 11000 175 | cusparseHandle_t handle = NULL; 176 | cusparseDnVecDescr_t vec_values; 177 | cusparseSpVecDescr_t vec_permutation; 178 | cusparseCreate(&handle); 179 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_C_32F); 180 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F); // MUST USE BASE_ZERO 181 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation); 182 | cusparseDestroyDnVec(vec_values); 183 | cusparseDestroySpVec(vec_permutation); 184 | cusparseDestroy(handle); 185 | #else 186 | cusparseStatus = cusparseCgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO 187 | #endif 188 | } 189 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseSgthr or cusparseCgthr failed",cusparseStatus); 190 | 191 | } 192 | 193 | // Return result 194 | ROW_SORT = mxGPUCreateMxArrayOnGPU(row_sort); 195 | COL_SORT = mxGPUCreateMxArrayOnGPU(col_sort); 196 | VAL_SORT = mxGPUCreateMxArrayOnGPU(val_sort); 197 | 198 | // Make sure operations are finished before deleting 199 | //cudaDeviceSynchronize(); 200 | 201 | // Clean up 202 | cusparseDestroyMatDescr(descr); 203 | cusparseDestroy(cusparseHandle); 204 | cublasDestroy(cublasHandle); 205 | mxGPUDestroyGPUArray(row); 206 | mxGPUDestroyGPUArray(row_sort); 207 | mxGPUDestroyGPUArray(col); 208 | mxGPUDestroyGPUArray(col_sort); 209 | mxGPUDestroyGPUArray(val); 210 | mxGPUDestroyGPUArray(val_sort); 211 | if (pBuffer) cudaFree(pBuffer); 212 | if (P) cudaFree(P); 213 | 214 | return; 215 | } 216 | -------------------------------------------------------------------------------- /private/csr2coo.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE format converter (csr2coo). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 
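// Illustrative example (hypothetical values): the CSR row pointer [1 3 4 4 5]
// with nrows = 4 expands back to COO row indices [1 1 2 4]. nnz is recovered
// from the last entry as row_csr[nrows] - base = 5 - 1 = 4, which is how the
// output length is determined below.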
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | // MATLAB related 20 | #include "mex.h" 21 | #include "gpu/mxGPUArray.h" 22 | #include "mxShowCriticalErrorMessage.h" 23 | 24 | // Input Arguments 25 | #define ROW_CSR prhs[0] 26 | #define NROWS prhs[1] 27 | 28 | // Output Arguments 29 | #define ROW plhs[0] 30 | 31 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 32 | { 33 | // Checks 34 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 35 | if (nrhs != 2) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 36 | 37 | // Initialize the MathWorks GPU API 38 | mxInitGPU(); 39 | 40 | // Create Matlab pointers on the GPU 41 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR); 42 | 43 | // Checks - note rows must be in CSR format 44 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 45 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 46 | mwSize nrows = mxGetScalar(NROWS); 47 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument is wrong size",mxGPUGetNumberOfElements(row_csr)); 48 | 49 | // Get handle to the CUBLAS context 50 | cublasHandle_t cublasHandle = 0; 51 | cublasStatus_t cublasStatus; 52 | cublasStatus = cublasCreate(&cublasHandle); 53 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 54 | 55 | // Get handle to the CUSPARSE context 56 | cusparseStatus_t status; 57 | cusparseHandle_t cusparseHandle = 0; 58 | cusparseStatus_t cusparseStatus; 59 | cusparseStatus = cusparseCreate(&cusparseHandle); 60 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 61 | cusparseMatDescr_t descr = 0; 62 | cusparseStatus = cusparseCreateMatDescr(&descr); 63 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 64 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 65 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 66 | 67 | // Convert from matlab pointers to native pointers 68 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr); 69 | 70 | // Now we can access the arrays, we can do some checks 71 | int base; 72 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 73 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing"); 74 | 75 | int nnz; 76 | cudaMemcpy(&nnz, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 77 | nnz -= CUSPARSE_INDEX_BASE_ONE; 78 | if (nnz < 0) mxShowCriticalErrorMessage("ROW_CSR returned negative nnz"); 79 | 80 | // Create space for output vector 81 | const mwSize ndim = 1; 82 | mwSize dims[ndim] = {(mwSize)nnz}; // we checked that nnz is >=0 so cast is safe 83 | mxGPUArray *row = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 84 | if (row==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 85 | 86 | // Convert from matlab pointers to native pointers 87 | int *d_row = (int*)mxGPUGetData(row); 88 | 89 | // Call csr2coo 90 | status = cusparseXcsr2coo(cusparseHandle, d_row_csr, nnz, 
nrows, d_row, CUSPARSE_INDEX_BASE_ONE); 91 | 92 | if (status == CUSPARSE_STATUS_SUCCESS) 93 | { 94 | // Return result 95 | ROW = mxGPUCreateMxArrayOnGPU(row); 96 | 97 | // Make sure operations are finished before deleting 98 | //cudaDeviceSynchronize(); 99 | } 100 | 101 | // Clean up 102 | cusparseDestroyMatDescr(descr); 103 | cusparseDestroy(cusparseHandle); 104 | cublasDestroy(cublasHandle); 105 | mxGPUDestroyGPUArray(row); 106 | mxGPUDestroyGPUArray(row_csr); 107 | 108 | // Failure 109 | if (status != CUSPARSE_STATUS_SUCCESS) 110 | { 111 | mxShowCriticalErrorMessage("Operation cusparseXcsr2coo failed",status); 112 | } 113 | 114 | return; 115 | } 116 | -------------------------------------------------------------------------------- /private/csr2csc.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE format converter (csr2csc) to do transpose. 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | #if CUDART_VERSION >= 11000 20 | #include "wrappers_to_cuda_11.h" 21 | #endif 22 | 23 | // MATLAB related 24 | #include "mex.h" 25 | #include "gpu/mxGPUArray.h" 26 | #include "mxShowCriticalErrorMessage.h" 27 | 28 | // Input Arguments 29 | #define ROW_CSR prhs[0] // CSR format 30 | #define COL prhs[1] 31 | #define VAL prhs[2] 32 | #define NROWS prhs[3] 33 | #define NCOLS prhs[4] 34 | 35 | // Output Arguments 36 | #define ROW plhs[0] 37 | #define COL_CSC plhs[1] // CSC format 38 | #define VAL_CSC plhs[2] 39 | 40 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 41 | { 42 | // Checks 43 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 44 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 45 | 46 | // Initialize the MathWorks GPU API 47 | mxInitGPU(); 48 | 49 | // Create Matlab pointers on the GPU 50 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR); 51 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL); 52 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL); 53 | 54 | // Checks - note rows must be in CSR format 55 | int nnz = mxGPUGetNumberOfElements(val); 56 | 57 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 58 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 59 | 60 | int nrows = (int)mxGetScalar(NROWS); 61 | int ncols = (int)mxGetScalar(NCOLS); 62 | 63 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 64 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 65 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 66 | 67 | // Create space for output vectors 68 | const mwSize ndim = 1; 69 | mwSize dims[ndim]; 70 | 71 | dims[0] = ncols+1; 72 | mxGPUArray *col_csc = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 73 | if (col_csc==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 74 | 75 | dims[0] = nnz; 76 | mxGPUArray *row = 
mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 77 | if (row==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 78 | 79 | mxComplexity ccx = mxGPUGetComplexity(val); 80 | mxGPUArray *val_csc = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES); 81 | if (val_csc==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 82 | 83 | // Get handle to the CUBLAS context 84 | cublasHandle_t cublasHandle = 0; 85 | cublasStatus_t cublasStatus; 86 | cublasStatus = cublasCreate(&cublasHandle); 87 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 88 | 89 | // Get handle to the CUSPARSE context 90 | cusparseHandle_t cusparseHandle = 0; 91 | cusparseStatus_t cusparseStatus; 92 | cusparseStatus = cusparseCreate(&cusparseHandle); 93 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 94 | cusparseMatDescr_t descr = 0; 95 | cusparseStatus = cusparseCreateMatDescr(&descr); 96 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 97 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 98 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 99 | 100 | // Convert from matlab pointers to native pointers 101 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr); 102 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col); 103 | 104 | int *d_row = (int*)mxGPUGetData(row); 105 | int *d_col_csc = (int*)mxGPUGetData(col_csc); 106 | 107 | // Now we can access row_csr[] array 108 | int base; 109 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 110 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing"); 111 | 112 | int nnz_check; 113 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 114 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 115 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check); 116 | 117 | // Convert from CSR to CSC 118 | cusparseStatus_t status; 119 | 120 | if (ccx == mxREAL) 121 | { 122 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val); 123 | float *d_val_csc = (float*)mxGPUGetData(val_csc); 124 | #if CUDART_VERSION >= 11000 125 | status = cusparseXcsr2csc_wrapper(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE); 126 | #else 127 | status = cusparseScsr2csc(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE); 128 | #endif 129 | } 130 | else 131 | { 132 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 133 | cuFloatComplex *d_val_csc = (cuFloatComplex*)mxGPUGetData(val_csc); 134 | #if CUDART_VERSION >= 11000 135 | status = cusparseXcsr2csc_wrapper(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE); 136 | #else 137 | status = cusparseCcsr2csc(cusparseHandle, nrows, ncols, nnz, d_val, d_row_csr, d_col, d_val_csc, d_row, d_col_csc, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ONE); 138 | #endif 139 | } 140 | 141 | if (status == CUSPARSE_STATUS_SUCCESS) 142 | { 143 | // Return result 144 | ROW = mxGPUCreateMxArrayOnGPU(row); 145 | COL_CSC = mxGPUCreateMxArrayOnGPU(col_csc); 146 | VAL_CSC = mxGPUCreateMxArrayOnGPU(val_csc); 147 | 148 | // Make 
sure operations are finished before deleting 149 | //cudaDeviceSynchronize(); 150 | } 151 | 152 | // Clean up 153 | cusparseDestroyMatDescr(descr); 154 | cusparseDestroy(cusparseHandle); 155 | cublasDestroy(cublasHandle); 156 | mxGPUDestroyGPUArray(val); 157 | mxGPUDestroyGPUArray(col); 158 | mxGPUDestroyGPUArray(row_csr); 159 | mxGPUDestroyGPUArray(val_csc); 160 | mxGPUDestroyGPUArray(col_csc); 161 | mxGPUDestroyGPUArray(row); 162 | 163 | // Failure 164 | if (status != CUSPARSE_STATUS_SUCCESS) 165 | { 166 | mxShowCriticalErrorMessage("Operation cusparseScsr2csc or cusparseCcsr2csc failed",status); 167 | } 168 | 169 | return; 170 | } 171 | 172 | 173 | -------------------------------------------------------------------------------- /private/csr2csc_cpu.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to C code format converter (csr2csc) to do transpose. 3 | // 4 | // Inspired by: 5 | // http://www.dgate.org/~brg/files/dis/smvm/frontend/matrix_io.c 6 | // 7 | template // template val_real to accept float or mxComplex for MX_HAS_INTERLEAVED_COMPLEX 8 | void csr2csc(const int nrows, const int ncols, const int *row_csr, const int *col, const T *val_real, const float *val_imag, 9 | int *row, int *col_csc, T *val_csc_real, float *val_csc_imag) 10 | { 11 | int i, j, k, l; 12 | 13 | // Base index (0 or 1) and number of nonzeros 14 | const int base = row_csr[0]; 15 | const int nnz = row_csr[nrows]-base; 16 | 17 | // Determine column lengths 18 | for (i=0; i<=ncols; i++) col_csc[i] = 0; 19 | for (i=0; i0; i--) col_csc[i] = col_csc[i-1]+base; 37 | 38 | col_csc[0] = base; 39 | } 40 | 41 | // includes, system 42 | #include 43 | #include 44 | #include 45 | 46 | // MATLAB related 47 | #include "mex.h" 48 | #include "mxShowCriticalErrorMessage.h" 49 | 50 | // Input Arguments 51 | #define ROW_CSR prhs[0] // CSR format 52 | #define COL prhs[1] 53 | #define VAL prhs[2] 54 | #define NROWS prhs[3] 55 | #define NCOLS prhs[4] 56 | 57 | // Output Arguments 58 | #define ROW plhs[0] 59 | #define COL_CSC plhs[1] // CSC format 60 | #define VAL_CSC plhs[2] 61 | 62 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 63 | { 64 | // Checks 65 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 66 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 67 | 68 | // Checks - note rows must be in CSR format 69 | int nnz = mxGetNumberOfElements(VAL); 70 | 71 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 72 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 73 | 74 | int nrows = (int)mxGetScalar(NROWS); 75 | int ncols = (int)mxGetScalar(NCOLS); 76 | 77 | if (mxGetClassID(ROW_CSR) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 78 | if (mxGetClassID(COL) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 79 | if (mxGetClassID(VAL) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 80 | 81 | // Create space for output vectors 82 | const mwSize ndim = 1; 83 | mwSize dims[ndim]; 84 | 85 | dims[0] = ncols+1; 86 | COL_CSC = mxCreateUninitNumericArray(ndim, dims, mxINT32_CLASS, mxREAL); 87 | if (COL_CSC==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 88 | 89 | dims[0] = nnz; 90 | ROW = mxCreateUninitNumericArray(ndim, dims, mxINT32_CLASS, mxREAL); 91 | if (ROW==NULL) 
mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 92 | 93 | mxComplexity ccx = mxIsComplex(VAL) ? mxCOMPLEX : mxREAL; 94 | VAL_CSC = mxCreateUninitNumericArray(ndim, dims, mxSINGLE_CLASS, ccx); 95 | if (VAL_CSC==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 96 | 97 | // Pointers to the raw data 98 | const int * const row_csr = (int *)mxGetData(ROW_CSR); 99 | const int * const col = (int *)mxGetData(COL); 100 | void *val_real = mxGetData(VAL); 101 | #if MX_HAS_INTERLEAVED_COMPLEX 102 | void *val_imag = NULL; 103 | #else 104 | void *val_imag = mxGetImagData(VAL); 105 | #endif 106 | 107 | int *row = (int *)mxGetData(ROW); 108 | int *col_csc = (int *)mxGetData(COL_CSC); 109 | void *val_csc_real = mxGetData(VAL_CSC); 110 | #if MX_HAS_INTERLEAVED_COMPLEX 111 | void *val_csc_imag = NULL; 112 | #else 113 | void *val_csc_imag = mxGetImagData(VAL_CSC); 114 | #endif 115 | 116 | // Now we can access the arrays, we can do some checks 117 | const int base = row_csr[0]; 118 | if (base != 1) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing"); 119 | 120 | int nnz_check = row_csr[nrows]; 121 | nnz_check -= 1; 122 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check); 123 | 124 | // Convert from CSR to CSC 125 | #if MX_HAS_INTERLEAVED_COMPLEX 126 | if(ccx == mxCOMPLEX) 127 | csr2csc(nrows, ncols, row_csr, col, (mxComplexSingle*)val_real, (float*)val_imag, row, col_csc, (mxComplexSingle*)val_csc_real, (float*)val_csc_imag); 128 | else 129 | csr2csc(nrows, ncols, row_csr, col, (float*)val_real, (float*)val_imag, row, col_csc, (float*)val_csc_real, (float*)val_csc_imag); 130 | #else 131 | csr2csc(nrows, ncols, row_csr, col, (float*)val_real, (float*)val_imag, row, col_csc, (float*)val_csc_real, (float*)val_csc_imag); 132 | #endif 133 | 134 | 135 | return; 136 | } 137 | 138 | -------------------------------------------------------------------------------- /private/csrgeam.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE matrix-matrix addition (csrgeam). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | #if CUDART_VERSION >= 11000 20 | #include "wrappers_to_cuda_11.h" 21 | #endif 22 | 23 | // MATLAB related 24 | #include "mex.h" 25 | #include "gpu/mxGPUArray.h" 26 | #include "mxShowCriticalErrorMessage.h" 27 | 28 | // Input Arguments 29 | #define A_ROW_CSR prhs[0] // this in CSR format (returned from coo2csr.cu) 30 | #define A_COL prhs[1] 31 | #define A_VAL prhs[2] 32 | #define NROWS prhs[3] 33 | #define NCOLS prhs[4] 34 | #define B_ROW_CSR prhs[5] // this in CSR format (returned from coo2csr.cu) 35 | #define B_COL prhs[6] 36 | #define B_VAL prhs[7] 37 | #define ALPHA prhs[8] // scalar: C = ALPHA*A + BETA*B 38 | #define BETA prhs[9] // scalar: C = ALPHA*A + BETA*B 39 | 40 | // Output Arguments 41 | #define C_ROW_CSR plhs[0] // this in CSR format (returned from coo2csr.cu) 42 | #define C_COL plhs[1] 43 | #define C_VAL plhs[2] 44 | 45 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 46 | { 47 | // Checks 48 | if (nlhs > 3) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 49 | if (nrhs != 10) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 50 | 51 | if(!mxIsGPUArray(A_ROW_CSR)) mxShowCriticalErrorMessage("A_ROW_CSR argument is not on GPU"); 52 | if(!mxIsGPUArray(A_COL)) mxShowCriticalErrorMessage("A_COL argument is not on GPU"); 53 | if(!mxIsGPUArray(A_VAL)) mxShowCriticalErrorMessage("A_VAL argument is not on GPU"); 54 | 55 | if (!mxIsScalar(ALPHA)) mxShowCriticalErrorMessage("ALPHA argument must be a scalar"); 56 | if (!mxIsScalar(BETA)) mxShowCriticalErrorMessage("BETA argument must be a scalar"); 57 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 58 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 59 | 60 | // Initialize the MathWorks GPU API 61 | mxInitGPU(); 62 | 63 | // Create Matlab pointers on the GPU 64 | mxGPUArray const *a_row_csr = mxGPUCreateFromMxArray(A_ROW_CSR); 65 | mxGPUArray const *a_col = mxGPUCreateFromMxArray(A_COL); 66 | mxGPUArray const *a_val = mxGPUCreateFromMxArray(A_VAL); 67 | mxGPUArray const *b_row_csr = mxGPUCreateFromMxArray(B_ROW_CSR); 68 | mxGPUArray const *b_col = mxGPUCreateFromMxArray(B_COL); 69 | mxGPUArray const *b_val = mxGPUCreateFromMxArray(B_VAL); 70 | 71 | // Check sizes - note rows are in CSR (compressed row) format 72 | int a_nnz = mxGPUGetNumberOfElements(a_val); 73 | int b_nnz = mxGPUGetNumberOfElements(b_val); 74 | 75 | mwSize nrows = mxGetScalar(NROWS); 76 | mwSize ncols = mxGetScalar(NCOLS); 77 | 78 | if (mxGPUGetNumberOfElements(a_row_csr) != nrows+1) mxShowCriticalErrorMessage("A_ROW_CSR argument wrong size",mxGPUGetNumberOfElements(a_row_csr)); 79 | if (mxGPUGetNumberOfElements(a_col) != a_nnz) mxShowCriticalErrorMessage("A_COL argument wrong size",mxGPUGetNumberOfElements(a_col)); 80 | 81 | if (mxGPUGetNumberOfElements(b_row_csr) != nrows+1) mxShowCriticalErrorMessage("B_ROW_CSR argument wrong size",mxGPUGetNumberOfElements(b_row_csr)); 82 | if (mxGPUGetNumberOfElements(b_col) != b_nnz) mxShowCriticalErrorMessage("B_COL argument wrong size",mxGPUGetNumberOfElements(b_col)); 83 | 84 | if (mxGPUGetClassID(a_row_csr) != 
mxINT32_CLASS) mxShowCriticalErrorMessage("A_ROW_CSR argument is not int32"); 85 | if (mxGPUGetClassID(a_col) != mxINT32_CLASS) mxShowCriticalErrorMessage("A_COL argument is not int32"); 86 | if (mxGPUGetClassID(a_val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("A_VAL argument is not single"); 87 | 88 | if (mxGPUGetClassID(b_row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("B_ROW argument is not int32"); 89 | if (mxGPUGetClassID(b_col) != mxINT32_CLASS) mxShowCriticalErrorMessage("B_COL argument is not int32"); 90 | if (mxGPUGetClassID(b_val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("B_VAL argument is not single"); 91 | 92 | // Allocate space for output row vector 93 | const mwSize ndim = 1; 94 | mwSize dims[ndim] = {nrows+1}; 95 | mxGPUArray *c_row_csr = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE); 96 | if (c_row_csr==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 97 | 98 | // Get handle to the CUBLAS context 99 | cublasHandle_t cublasHandle = 0; 100 | cublasStatus_t cublasStatus; 101 | cublasStatus = cublasCreate(&cublasHandle); 102 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 103 | 104 | // Get handle to the CUSPARSE context 105 | cusparseHandle_t cusparseHandle = 0; 106 | cusparseStatus_t cusparseStatus; 107 | cusparseStatus = cusparseCreate(&cusparseHandle); 108 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 109 | cusparseMatDescr_t descr = 0; 110 | cusparseStatus = cusparseCreateMatDescr(&descr); 111 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 112 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 113 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 114 | 115 | // Convert from matlab pointers to native pointers 116 | const int* const d_a_col = (int*)mxGPUGetDataReadOnly(a_col); 117 | const int* const d_b_col = (int*)mxGPUGetDataReadOnly(b_col); 118 | 119 | const float* const d_a_val = (float*)mxGPUGetDataReadOnly(a_val); 120 | const float* const d_b_val = (float*)mxGPUGetDataReadOnly(b_val); 121 | 122 | const int* const d_a_row_csr = (int*)mxGPUGetDataReadOnly(a_row_csr); 123 | const int* const d_b_row_csr = (int*)mxGPUGetDataReadOnly(b_row_csr); 124 | 125 | int *d_c_col = NULL; 126 | float *d_c_val = NULL; 127 | int *d_c_row_csr = (int*)mxGPUGetData(c_row_csr); 128 | 129 | const float alpha = (float)mxGetScalar(ALPHA); 130 | const float beta = (float)mxGetScalar(BETA); 131 | 132 | // Now we can access the arrays, we can do some checks 133 | int base; 134 | cudaMemcpy(&base, d_a_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 135 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("A_ROW_CSR not using 1-based indexing"); 136 | 137 | int nnz_check; 138 | cudaMemcpy(&nnz_check, d_a_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 139 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 140 | if (nnz_check != a_nnz) mxShowCriticalErrorMessage("A_ROW_CSR argument last element != nnz",nnz_check); 141 | 142 | cudaMemcpy(&base, d_b_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 143 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("B_ROW_CSR not using 1-based indexing"); 144 | 145 | cudaMemcpy(&nnz_check, d_b_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 146 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 147 | if (nnz_check != b_nnz) mxShowCriticalErrorMessage("B_ROW_CSR argument last element != nnz",nnz_check); 148 | 149 | // Get sparsity pattern and 
nnz of output matrix 150 | int c_nnz; 151 | int *nnzTotalDevHostPtr = &c_nnz; 152 | cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST); 153 | 154 | char *buffer = NULL; 155 | size_t bufferSizeInBytes; 156 | 157 | #if CUDART_VERSION >= 11000 158 | cusparseScsrgeam2_bufferSizeExt(cusparseHandle, nrows, ncols, 159 | &alpha, 160 | descr, a_nnz, d_a_val, d_a_row_csr, d_a_col, 161 | &beta, 162 | descr, b_nnz, d_b_val, d_b_row_csr, d_b_col, 163 | descr, d_c_val, d_c_row_csr, d_c_col, 164 | &bufferSizeInBytes); 165 | 166 | cudaError_t status0 = cudaMalloc((void**)&buffer, sizeof(char)*bufferSizeInBytes); 167 | if (status0 != cudaSuccess) 168 | { 169 | mxShowCriticalErrorMessage("Operation cudaMalloc failed",status0); 170 | } 171 | 172 | cusparseStatus_t status1 = 173 | cusparseXcsrgeam2Nnz(cusparseHandle, nrows, ncols, 174 | descr, a_nnz, d_a_row_csr, d_a_col, 175 | descr, b_nnz, d_b_row_csr, d_b_col, 176 | descr, d_c_row_csr, nnzTotalDevHostPtr, buffer); 177 | #else 178 | cusparseStatus_t status1 = 179 | cusparseXcsrgeamNnz(cusparseHandle, nrows, ncols, 180 | descr, a_nnz, d_a_row_csr, d_a_col, 181 | descr, b_nnz, d_b_row_csr, d_b_col, 182 | descr, d_c_row_csr, nnzTotalDevHostPtr); 183 | #endif 184 | 185 | // Failure 186 | if (status1 != CUSPARSE_STATUS_SUCCESS) 187 | { 188 | mxShowCriticalErrorMessage("Operation cusparseXcsrgeamNnz failed",status1); 189 | } 190 | 191 | if (NULL != nnzTotalDevHostPtr) 192 | { 193 | c_nnz = *nnzTotalDevHostPtr; 194 | } 195 | else 196 | { 197 | int baseC = CUSPARSE_INDEX_BASE_ONE; 198 | cudaMemcpy(&c_nnz, d_c_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 199 | cudaMemcpy(&baseC, c_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 200 | c_nnz -= baseC; 201 | } 202 | 203 | // Allocate space for output vectors 204 | dims[0] = {(mwSize)c_nnz}; 205 | mxGPUArray *c_col = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE); 206 | if (c_col==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 207 | 208 | mxGPUArray *c_val = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, mxREAL, MX_GPU_DO_NOT_INITIALIZE); 209 | if (c_val==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 210 | 211 | // Convert from matlab pointers to native pointers 212 | d_c_col = (int*)mxGPUGetData(c_col); 213 | d_c_val = (float*)mxGPUGetData(c_val); 214 | 215 | // Addition here 216 | #if CUDART_VERSION >= 11000 217 | cusparseStatus_t status2 = 218 | cusparseScsrgeam2(cusparseHandle, nrows, ncols, 219 | &alpha, 220 | descr, a_nnz, 221 | d_a_val, d_a_row_csr, d_a_col, 222 | &beta, 223 | descr, b_nnz, 224 | d_b_val, d_b_row_csr, d_b_col, 225 | descr, 226 | d_c_val, d_c_row_csr, d_c_col, buffer); 227 | #else 228 | cusparseStatus_t status2 = 229 | cusparseScsrgeam(cusparseHandle, nrows, ncols, 230 | &alpha, 231 | descr, a_nnz, 232 | d_a_val, d_a_row_csr, d_a_col, 233 | &beta, 234 | descr, b_nnz, 235 | d_b_val, d_b_row_csr, d_b_col, 236 | descr, 237 | d_c_val, d_c_row_csr, d_c_col); 238 | #endif 239 | 240 | if (status2 == CUSPARSE_STATUS_SUCCESS) 241 | { 242 | // Return results 243 | C_ROW_CSR = mxGPUCreateMxArrayOnGPU(c_row_csr); 244 | C_COL = mxGPUCreateMxArrayOnGPU(c_col); 245 | C_VAL = mxGPUCreateMxArrayOnGPU(c_val); 246 | 247 | // Make sure operations are finished before deleting 248 | //cudaDeviceSynchronize(); 249 | } 250 | 251 | // Clean up 252 | cusparseDestroyMatDescr(descr); 253 | cusparseDestroy(cusparseHandle); 254 | cublasDestroy(cublasHandle); 255 | if(buffer) cudaFree(buffer); 256 | mxGPUDestroyGPUArray(a_row_csr); 257 
| mxGPUDestroyGPUArray(a_col); 258 | mxGPUDestroyGPUArray(a_val); 259 | mxGPUDestroyGPUArray(b_row_csr); 260 | mxGPUDestroyGPUArray(b_col); 261 | mxGPUDestroyGPUArray(b_val); 262 | mxGPUDestroyGPUArray(c_row_csr); 263 | mxGPUDestroyGPUArray(c_col); 264 | mxGPUDestroyGPUArray(c_val); 265 | 266 | // Failure 267 | if (status2 != CUSPARSE_STATUS_SUCCESS) 268 | { 269 | mxShowCriticalErrorMessage("Operation cusparseScsrgeam failed",status2); 270 | } 271 | 272 | return; 273 | } 274 | -------------------------------------------------------------------------------- /private/csrmm.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE matrix-matrix multiply (csrmm). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | #if CUDART_VERSION >= 11000 20 | #include "wrappers_to_cuda_11.h" 21 | #endif 22 | 23 | // MATLAB related 24 | #include "mex.h" 25 | #include "gpu/mxGPUArray.h" 26 | #include "mxShowCriticalErrorMessage.h" 27 | 28 | // Input Arguments 29 | #define ROW_CSR prhs[0] // this in CSR format (returned from coo2csr.cu) 30 | #define COL prhs[1] 31 | #define VAL prhs[2] 32 | #define NROWS prhs[3] 33 | #define NCOLS prhs[4] 34 | #define TRANS prhs[5] 35 | #define B prhs[6] // dense matrix 36 | 37 | // Output Arguments 38 | #define C plhs[0] // C = alpha * op(A) * B + beta * C (sparse A, dense B) 39 | 40 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 41 | { 42 | // Checks 43 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 44 | if (nrhs != 7) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 45 | 46 | if(!mxIsGPUArray(ROW_CSR)) mxShowCriticalErrorMessage("ROW_CSR argument is not on GPU"); 47 | if(!mxIsGPUArray(COL)) mxShowCriticalErrorMessage("COL argument is not on GPU"); 48 | if(!mxIsGPUArray(VAL)) mxShowCriticalErrorMessage("VAL argument is not on GPU"); 49 | if(!mxIsGPUArray(B)) mxShowCriticalErrorMessage("B argument is not on GPU"); 50 | 51 | // Initialize the MathWorks GPU API 52 | mxInitGPU(); 53 | 54 | // Create Matlab pointers on the GPU 55 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR); 56 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL); 57 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL); 58 | mxGPUArray const *b = mxGPUCreateFromMxArray(B); 59 | 60 | // Check sizes of A - note rows are in CSR (compressed row) format 61 | mwSize nnz = mxGPUGetNumberOfElements(val); 62 | 63 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 64 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 65 | if (!mxIsScalar(TRANS)) mxShowCriticalErrorMessage("TRANS argument must be a scalar"); 66 | 67 | mwSize m = mxGetScalar(NROWS); 68 | mwSize k = mxGetScalar(NCOLS); 69 | 70 | if (mxGPUGetNumberOfElements(row_csr) != m+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size",mxGPUGetNumberOfElements(row_csr)); 71 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL argument wrong size",mxGPUGetNumberOfElements(col)); 72 | 73 | // Check sizes 
of B 74 | if (mxGPUGetNumberOfDimensions(b) > 2) mxShowCriticalErrorMessage("B has too many dimensions",mxGPUGetNumberOfDimensions(b)); 75 | 76 | mwSize *bdims = (mwSize*)mxGPUGetDimensions(b); // dims always has >= 2 elements 77 | mwSize ldb = bdims[0]; // leading dimension of B 78 | mwSize n = bdims[1]; 79 | 80 | cusparseOperation_t trans = (cusparseOperation_t)mxGetScalar(TRANS); 81 | if (trans == CUSPARSE_OPERATION_NON_TRANSPOSE) 82 | { 83 | if (ldb != k) mxShowCriticalErrorMessage("B argument wrong size for multiply",ldb); 84 | } 85 | else 86 | { 87 | if (ldb != m) mxShowCriticalErrorMessage("B argument wrong size for transpose multiply",ldb); 88 | } 89 | 90 | // Check types 91 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 92 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 93 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 94 | if (mxGPUGetClassID(b) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("B argument is not single"); 95 | 96 | // Check real/complex - mixed is not supported except special case (real A / complex B) 97 | mxComplexity cca = mxGPUGetComplexity(val); 98 | mxComplexity ccb = mxGPUGetComplexity(b); 99 | mxComplexity ccc = (ccb==mxCOMPLEX || cca==mxCOMPLEX) ? mxCOMPLEX : mxREAL; 100 | if(ccb==mxREAL && cca==mxCOMPLEX) mxShowCriticalErrorMessage("Complex matrix and real vector not supported"); 101 | 102 | // Create space for output vectors 103 | const mwSize ndim = 2; 104 | mwSize cdims[ndim] = {trans == CUSPARSE_OPERATION_NON_TRANSPOSE ? m : k, n}; 105 | mxClassID cid = mxGPUGetClassID(b); // same class as B matrix 106 | int ldc = cdims[0]; // leading dimension of C 107 | mxGPUArray *c; 108 | 109 | // Get handle to the CUBLAS context 110 | cublasHandle_t cublasHandle = 0; 111 | cublasStatus_t cublasStatus; 112 | cublasStatus = cublasCreate(&cublasHandle); 113 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 114 | 115 | // Get handle to the CUSPARSE context 116 | cusparseHandle_t cusparseHandle = 0; 117 | cusparseStatus_t cusparseStatus; 118 | cusparseStatus = cusparseCreate(&cusparseHandle); 119 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 120 | cusparseMatDescr_t descr = 0; 121 | cusparseStatus = cusparseCreateMatDescr(&descr); 122 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 123 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 124 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 125 | 126 | // Convert from matlab pointers to native pointers 127 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr); 128 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col); 129 | 130 | // Now we can access the arrays, we can do some checks 131 | int base; 132 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 133 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing"); 134 | 135 | int nnz_check; 136 | cudaMemcpy(&nnz_check, d_row_csr+m, sizeof(int), cudaMemcpyDeviceToHost); 137 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 138 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check); 139 | 140 | // Call cusparse multiply function in (S)ingle precision 141 | if (cca==mxREAL && ccb==mxREAL) 142 | { 143 | const float alpha = 1.0; 144 | const float 
beta = 0.0; 145 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val); 146 | const float * const d_b = (float*)mxGPUGetDataReadOnly(b); 147 | 148 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES); 149 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 150 | float *d_c = (float*)mxGPUGetData(c); 151 | 152 | #if CUDART_VERSION >= 11000 153 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc); 154 | #else 155 | cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc); 156 | #endif 157 | } 158 | else if (cca==mxREAL && ccb==mxCOMPLEX) 159 | { 160 | #if 0 // CUDART_VERSION >= 12040 // use 12.4 mixed real/complex operation 161 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); 162 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); 163 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES); 164 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 165 | cuFloatComplex* d_c = (cuFloatComplex*)mxGPUGetData(c); 166 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val); 167 | const cuFloatComplex* const d_b = (cuFloatComplex*)mxGPUGetDataReadOnly(b); 168 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc); 169 | #else 170 | const float alpha = 1.0; 171 | const float beta = 0.0; 172 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val); 173 | 174 | mxGPUArray* c_real = mxGPUCreateGPUArray(ndim, cdims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES); 175 | mxGPUArray* c_imag = mxGPUCreateGPUArray(ndim, cdims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES); 176 | if(!c_real || !c_imag) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 177 | float* d_c_real = (float*)mxGPUGetDataReadOnly(c_real); 178 | float* d_c_imag = (float*)mxGPUGetDataReadOnly(c_imag); 179 | 180 | for(int i = 0; i<2; i++) 181 | { 182 | mxGPUArray const *b_tmp; 183 | if(i==0) b_tmp = mxGPUCopyReal(b); 184 | if(i==1) b_tmp = mxGPUCopyImag(b); 185 | const float* const d_b = (float*)mxGPUGetDataReadOnly(b_tmp); 186 | 187 | #if CUDART_VERSION >= 11000 188 | if(i==0) cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_real, ldc); 189 | if(i==1) cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_imag, ldc); 190 | #else 191 | if(i==0) cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_real, ldc); 192 | if(i==1) cusparseStatus = cusparseScsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c_imag, ldc); 193 | #endif 194 | mxGPUDestroyGPUArray(b_tmp); 195 | if(cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("csrmm failed."); 196 | } 197 | c = mxGPUCreateComplexGPUArray(c_real,c_imag); 198 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateComplexGPUArray failed."); 199 | mxGPUDestroyGPUArray(c_real); 200 | mxGPUDestroyGPUArray(c_imag); 201 | #endif 202 | } 203 | else 204 | { 205 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); 206 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); 207 | const cuFloatComplex * 
const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 208 | const cuFloatComplex * const d_b = (cuFloatComplex*)mxGPUGetDataReadOnly(b); 209 | 210 | c = mxGPUCreateGPUArray(ndim, cdims, cid, ccc, MX_GPU_INITIALIZE_VALUES); 211 | if (c==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 212 | cuFloatComplex *d_c = (cuFloatComplex*)mxGPUGetData(c); 213 | 214 | #if CUDART_VERSION >= 11000 215 | cusparseStatus = cusparseXcsrmm_wrapper(cusparseHandle, trans, m, k, n, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc); 216 | #else 217 | cusparseStatus = cusparseCcsrmm(cusparseHandle, trans, m, n, k, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_b, ldb, &beta, d_c, ldc); 218 | #endif 219 | } 220 | 221 | // Return result 222 | if (cusparseStatus == CUSPARSE_STATUS_SUCCESS) 223 | { 224 | C = mxGPUCreateMxArrayOnGPU(c); 225 | } 226 | 227 | // Clean up 228 | cusparseDestroyMatDescr(descr); 229 | cusparseDestroy(cusparseHandle); 230 | cublasDestroy(cublasHandle); 231 | mxGPUDestroyGPUArray(row_csr); 232 | mxGPUDestroyGPUArray(col); 233 | mxGPUDestroyGPUArray(val); 234 | mxGPUDestroyGPUArray(b); 235 | mxGPUDestroyGPUArray(c); 236 | mxFree(bdims); 237 | 238 | return; 239 | } 240 | 241 | -------------------------------------------------------------------------------- /private/csrmv.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE matrix-vector multiply (csrmv). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | /* Using updated (v2) interfaces to cublas */ 16 | #include 17 | #include 18 | #include 19 | 20 | #if CUDART_VERSION >= 11000 21 | #include "wrappers_to_cuda_11.h" 22 | #endif 23 | 24 | // MATLAB related 25 | #include "mex.h" 26 | #include "gpu/mxGPUArray.h" 27 | #include "mxShowCriticalErrorMessage.h" 28 | 29 | // Input Arguments 30 | #define ROW_CSR prhs[0] // this in CSR format (returned from coo2csr.cu) 31 | #define COL prhs[1] 32 | #define VAL prhs[2] 33 | #define NROWS prhs[3] 34 | #define NCOLS prhs[4] 35 | #define TRANS prhs[5] 36 | #define X prhs[6] 37 | 38 | // Output Arguments 39 | #define Y plhs[0] 40 | 41 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 42 | { 43 | // Checks 44 | if (nlhs > 1) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 45 | if (nrhs != 7) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 46 | 47 | if(!mxIsGPUArray(ROW_CSR)) mxShowCriticalErrorMessage("ROW_CSR argument is not on GPU"); 48 | if(!mxIsGPUArray(COL)) mxShowCriticalErrorMessage("COL argument is not on GPU"); 49 | if(!mxIsGPUArray(VAL)) mxShowCriticalErrorMessage("VAL argument is not on GPU"); 50 | if(!mxIsGPUArray(X)) mxShowCriticalErrorMessage("X argument is not on GPU"); 51 | 52 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 53 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 54 | if (!mxIsScalar(TRANS)) mxShowCriticalErrorMessage("TRANS argument must be a scalar"); 55 | 56 | // Initialize the MathWorks GPU API 57 | mxInitGPU(); 58 | 59 | // Create Matlab pointers on the GPU 60 | mxGPUArray const *row_csr = 
mxGPUCreateFromMxArray(ROW_CSR); 61 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL); 62 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL); 63 | mxGPUArray const *x = mxGPUCreateFromMxArray(X); 64 | 65 | // Check sizes - note rows are in CSR (compressed row) format 66 | mwSize nnz = mxGPUGetNumberOfElements(val); 67 | mwSize nrows = mxGetScalar(NROWS); 68 | mwSize ncols = mxGetScalar(NCOLS); 69 | 70 | mwSize *xdims = (mwSize*)mxGPUGetDimensions(x); // xdims always has >= 2 elements 71 | if (mxGPUGetNumberOfDimensions(x) > 2) mxShowCriticalErrorMessage("X argument has too many dimensions",mxGPUGetNumberOfDimensions(x)); 72 | if (xdims[1] != 1) mxShowCriticalErrorMessage("X argument is not a column vector"); 73 | 74 | int nx = xdims[0]; 75 | 76 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size",mxGPUGetNumberOfElements(row_csr)); 77 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL argument wrong size",mxGPUGetNumberOfElements(col)); 78 | 79 | cusparseOperation_t trans = (cusparseOperation_t)mxGetScalar(TRANS); 80 | if (trans == CUSPARSE_OPERATION_NON_TRANSPOSE) 81 | { 82 | if (nx != ncols) mxShowCriticalErrorMessage("X argument wrong size for multiply",nx); 83 | } 84 | else 85 | { 86 | if (nx != nrows) mxShowCriticalErrorMessage("X argument wrong size for transpose multiply",nx); 87 | } 88 | 89 | // Check types 90 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 91 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 92 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 93 | if (mxGPUGetClassID(x) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("X argument is not single"); 94 | 95 | // Check real/complex - mixed is not supported except special case (real A / complex x) 96 | mxComplexity ccx = mxGPUGetComplexity(x); 97 | mxComplexity ccv = mxGPUGetComplexity(val); 98 | mxComplexity ccy = (ccx==mxCOMPLEX || ccv==mxCOMPLEX) ? mxCOMPLEX : mxREAL; 99 | if(ccx==mxREAL && ccv==mxCOMPLEX) mxShowCriticalErrorMessage("Complex matrix and real vector not supported"); 100 | 101 | // Create space for output vector 102 | const mwSize ndim = 1; 103 | mwSize dims[ndim] = {trans == CUSPARSE_OPERATION_NON_TRANSPOSE ? 
nrows : ncols}; 104 | mxClassID cid = mxGPUGetClassID(x); 105 | mxGPUArray *y; 106 | 107 | // Get handle to the CUBLAS context 108 | cublasHandle_t cublasHandle = 0; 109 | cublasStatus_t cublasStatus; 110 | cublasStatus = cublasCreate(&cublasHandle); 111 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 112 | 113 | // Get handle to the CUSPARSE context 114 | cusparseHandle_t cusparseHandle = 0; 115 | cusparseStatus_t cusparseStatus; 116 | cusparseStatus = cusparseCreate(&cusparseHandle); 117 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 118 | cusparseMatDescr_t descr = 0; 119 | cusparseStatus = cusparseCreateMatDescr(&descr); 120 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 121 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 122 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 123 | 124 | // Convert from matlab pointers to native pointers 125 | const int* const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr); 126 | const int* const d_col = (int*)mxGPUGetDataReadOnly(col); 127 | 128 | // Now we can access the arrays, we can do some checks 129 | int base; 130 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 131 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("ROW_CSR not using 1-based indexing"); 132 | 133 | int nnz_check; 134 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 135 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 136 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check); 137 | 138 | // Call cusparse multiply function in (S)ingle precision 139 | if (ccv==mxREAL && ccx==mxREAL) 140 | { 141 | const float alpha = 1.0; 142 | const float beta = 0.0; 143 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES); 144 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 145 | float* d_y = (float*)mxGPUGetData(y); 146 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val); 147 | const float* const d_x = (float*)mxGPUGetDataReadOnly(x); 148 | #if CUDART_VERSION >= 11000 149 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y); 150 | #else 151 | cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y); 152 | #endif 153 | } 154 | else if (ccv==mxREAL && ccx==mxCOMPLEX) 155 | { 156 | #if CUDART_VERSION >= 11020 // use 11.2 mixed real/complex operation 157 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); 158 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); 159 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES); 160 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 161 | cuFloatComplex* d_y = (cuFloatComplex*)mxGPUGetData(y); 162 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val); 163 | const cuFloatComplex* const d_x = (cuFloatComplex*)mxGPUGetDataReadOnly(x); 164 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y); 165 | #else 166 | const float alpha = 1.0; 167 | const float beta = 0.0; 168 | const float* const d_val = (float*)mxGPUGetDataReadOnly(val); 169 | 170 | mxGPUArray* y_real = mxGPUCreateGPUArray(ndim, dims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES); 171 | 
mxGPUArray* y_imag = mxGPUCreateGPUArray(ndim, dims, cid, mxREAL, MX_GPU_INITIALIZE_VALUES); 172 | if(!y_real || !y_imag) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 173 | float* d_y_real = (float*)mxGPUGetDataReadOnly(y_real); 174 | float* d_y_imag = (float*)mxGPUGetDataReadOnly(y_imag); 175 | 176 | for(int i = 0; i<2; i++) 177 | { 178 | mxGPUArray const *x_tmp; 179 | if(i==0) x_tmp = mxGPUCopyReal(x); 180 | if(i==1) x_tmp = mxGPUCopyImag(x); 181 | const float* const d_x = (float*)mxGPUGetDataReadOnly(x_tmp); 182 | #if CUDART_VERSION >= 11000 183 | if(i==0) cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_real); 184 | if(i==1) cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_imag); 185 | #else 186 | if(i==0) cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_real); 187 | if(i==1) cusparseStatus = cusparseScsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y_imag); 188 | #endif 189 | mxGPUDestroyGPUArray(x_tmp); 190 | if(cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("csrmv failed."); 191 | } 192 | y = mxGPUCreateComplexGPUArray(y_real,y_imag); 193 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateComplexGPUArray failed."); 194 | mxGPUDestroyGPUArray(y_real); 195 | mxGPUDestroyGPUArray(y_imag); 196 | #endif 197 | } 198 | else 199 | { 200 | const cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); 201 | const cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); 202 | y = mxGPUCreateGPUArray(ndim, dims, cid, ccy, MX_GPU_INITIALIZE_VALUES); 203 | if (y==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed."); 204 | cuFloatComplex* d_y = (cuFloatComplex*)mxGPUGetData(y); 205 | const cuFloatComplex* const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 206 | const cuFloatComplex* const d_x = (cuFloatComplex*)mxGPUGetDataReadOnly(x); 207 | #if CUDART_VERSION >= 11000 208 | cusparseStatus = cusparseXcsrmv_wrapper(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y); 209 | #else 210 | cusparseStatus = cusparseCcsrmv(cusparseHandle, trans, nrows, ncols, nnz, &alpha, descr, d_val, d_row_csr, d_col, d_x, &beta, d_y); 211 | #endif 212 | } 213 | 214 | // Return result 215 | if (cusparseStatus == CUSPARSE_STATUS_SUCCESS) 216 | { 217 | Y = mxGPUCreateMxArrayOnGPU(y); 218 | } 219 | else 220 | { 221 | mxShowCriticalErrorMessage("unknown failure",cusparseStatus); 222 | } 223 | 224 | // Clean up 225 | cusparseDestroyMatDescr(descr); 226 | cusparseDestroy(cusparseHandle); 227 | cublasDestroy(cublasHandle); 228 | mxGPUDestroyGPUArray(row_csr); 229 | mxGPUDestroyGPUArray(col); 230 | mxGPUDestroyGPUArray(val); 231 | mxGPUDestroyGPUArray(x); 232 | mxGPUDestroyGPUArray(y); 233 | mxFree(xdims); 234 | 235 | return; 236 | } 237 | 238 | -------------------------------------------------------------------------------- /private/csrsort.cu: -------------------------------------------------------------------------------- 1 | // 2 | // Mex wrapper to CUSPARSE sort for CSR format (csrsort). 3 | // 4 | // Inspired by cusparse samples (conugateGradient) and Matlab gcsparse. 
5 | // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv 6 | // http://www.mathworks.com/matlabcentral/fileexchange/44423-gpu-sparse--accumarray--non-uniform-grid 7 | // 8 | 9 | // includes, system 10 | #include 11 | #include 12 | #include 13 | 14 | /* Using updated (v2) interfaces to cublas */ 15 | #include 16 | #include 17 | #include 18 | 19 | // MATLAB related 20 | #include "mex.h" 21 | #include "gpu/mxGPUArray.h" 22 | #include "mxShowCriticalErrorMessage.h" 23 | 24 | // Input Arguments 25 | #define ROW_CSR prhs[0] 26 | #define COL prhs[1] 27 | #define VAL prhs[2] 28 | #define NROWS prhs[3] 29 | #define NCOLS prhs[4] 30 | 31 | // Output Arguments 32 | #define COL_SORT plhs[0] 33 | #define VAL_SORT plhs[1] 34 | 35 | void mexFunction(int nlhs, mxArray * plhs[], int nrhs, const mxArray * prhs[]) 36 | { 37 | // Checks 38 | if (nlhs > 2) mxShowCriticalErrorMessage("wrong number of output arguments",nlhs); 39 | if (nrhs != 5) mxShowCriticalErrorMessage("wrong number of input arguments",nrhs); 40 | 41 | // Initialize the MathWorks GPU API 42 | mxInitGPU(); 43 | 44 | // Create Matlab pointers on the GPU 45 | mxGPUArray const *row_csr = mxGPUCreateFromMxArray(ROW_CSR); 46 | mxGPUArray const *col = mxGPUCreateFromMxArray(COL); 47 | mxGPUArray const *val = mxGPUCreateFromMxArray(VAL); 48 | 49 | // Checks - note vectors must be in CSR format 50 | int nnz = mxGPUGetNumberOfElements(val); 51 | if (mxGPUGetNumberOfElements(col) != nnz) mxShowCriticalErrorMessage("COL and VAL argument length mismatch"); 52 | 53 | if (!mxIsScalar(NROWS)) mxShowCriticalErrorMessage("NROWS argument must be a scalar"); 54 | if (!mxIsScalar(NCOLS)) mxShowCriticalErrorMessage("NCOLS argument must be a scalar"); 55 | 56 | int ncols = (int)mxGetScalar(NCOLS); 57 | int nrows = (int)mxGetScalar(NROWS); 58 | if (mxGPUGetNumberOfElements(row_csr) != nrows+1) mxShowCriticalErrorMessage("ROW_CSR argument wrong size"); 59 | 60 | if (mxGPUGetClassID(row_csr) != mxINT32_CLASS) mxShowCriticalErrorMessage("ROW_CSR argument is not int32"); 61 | if (mxGPUGetClassID(col) != mxINT32_CLASS) mxShowCriticalErrorMessage("COL argument is not int32"); 62 | if (mxGPUGetClassID(val) != mxSINGLE_CLASS) mxShowCriticalErrorMessage("VAL argument is not single"); 63 | 64 | // Create space for output vectors 65 | const mwSize ndim = 1; 66 | mwSize dims[ndim]; 67 | 68 | dims[0] = nnz; 69 | mxGPUArray *col_sort = mxGPUCreateGPUArray(ndim, dims, mxINT32_CLASS, mxREAL, MX_GPU_INITIALIZE_VALUES); 70 | if (col_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 71 | 72 | mxComplexity ccx = mxGPUGetComplexity(val); 73 | mxGPUArray *val_sort = mxGPUCreateGPUArray(ndim, dims, mxSINGLE_CLASS, ccx, MX_GPU_INITIALIZE_VALUES); 74 | if (val_sort==NULL) mxShowCriticalErrorMessage("mxGPUCreateGPUArray failed"); 75 | 76 | // Get handle to the CUBLAS context 77 | cublasHandle_t cublasHandle = 0; 78 | cublasStatus_t cublasStatus; 79 | cublasStatus = cublasCreate(&cublasHandle); 80 | if (cublasStatus != CUBLAS_STATUS_SUCCESS) mxShowCriticalErrorMessage(cublasStatus); 81 | 82 | // Get handle to the CUSPARSE context 83 | cudaError_t cudaStatus; 84 | cusparseStatus_t cusparseStatus; 85 | cusparseHandle_t cusparseHandle = 0; 86 | cusparseStatus = cusparseCreate(&cusparseHandle); 87 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage(cusparseStatus); 88 | cusparseMatDescr_t descr = 0; 89 | cusparseStatus = cusparseCreateMatDescr(&descr); 90 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) 
mxShowCriticalErrorMessage(cusparseStatus); 91 | cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); 92 | cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ONE); 93 | 94 | // Convert from matlab pointers to native pointers 95 | const int * const d_row_csr = (int*)mxGPUGetDataReadOnly(row_csr); 96 | const int * const d_col = (int*)mxGPUGetDataReadOnly(col); 97 | int *d_col_sort = (int*)mxGPUGetData(col_sort); 98 | 99 | // Now we can access the arrays, we can do some checks 100 | int base; 101 | cudaMemcpy(&base, d_row_csr, sizeof(int), cudaMemcpyDeviceToHost); 102 | if (base != CUSPARSE_INDEX_BASE_ONE) mxShowCriticalErrorMessage("A_ROW_CSR not using 1-based indexing"); 103 | 104 | int nnz_check; 105 | cudaMemcpy(&nnz_check, d_row_csr+nrows, sizeof(int), cudaMemcpyDeviceToHost); 106 | nnz_check -= CUSPARSE_INDEX_BASE_ONE; 107 | if (nnz_check != nnz) mxShowCriticalErrorMessage("ROW_CSR argument last element != nnz",nnz_check); 108 | 109 | // Since sort is in-place, copy the read-only vectors to read-write ones 110 | cudaStatus = cudaMemcpy((void *)d_col_sort, d_col, nnz*sizeof(int), cudaMemcpyDeviceToDevice); 111 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed"); 112 | 113 | if (ccx == mxREAL) 114 | { 115 | const float * const d_val = (float*)mxGPUGetDataReadOnly(val); 116 | float *d_val_sort = (float*)mxGPUGetData(val_sort); 117 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice); 118 | } 119 | else 120 | { 121 | const cuFloatComplex * const d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 122 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort); 123 | cudaStatus = cudaMemcpy((void *)d_val_sort, d_val, nnz*sizeof(cuFloatComplex), cudaMemcpyDeviceToDevice); 124 | } 125 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMemcpy failed",cudaStatus); 126 | 127 | // Sort by rows 128 | int *P = NULL; 129 | void *pBuffer = NULL; 130 | size_t pBufferSizeInBytes = 0; 131 | 132 | if (nnz > 0) 133 | { 134 | // step 1: allocate buffer 135 | cusparseStatus = cusparseXcsrsort_bufferSizeExt(cusparseHandle, nrows, ncols, nnz, d_row_csr, d_col, &pBufferSizeInBytes); 136 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcoosort_bufferSizeExt failed",cusparseStatus); 137 | 138 | cudaStatus = cudaMalloc( &pBuffer, sizeof(char)*pBufferSizeInBytes); 139 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus); 140 | 141 | // step 2: setup permutation vector P to identity 142 | cudaStatus = cudaMalloc( &P, sizeof(int)*nnz); 143 | if (cudaStatus != cudaSuccess) mxShowCriticalErrorMessage("Operation cudaMalloc failed",cudaStatus); 144 | 145 | cusparseStatus = cusparseCreateIdentityPermutation(cusparseHandle, nnz, P); 146 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseCreateIdentityPermutation failed",cusparseStatus); 147 | 148 | // step 3: sort COO format by Row 149 | cusparseStatus = cusparseXcsrsort(cusparseHandle, nrows, ncols, nnz, descr, d_row_csr, d_col_sort, P, pBuffer); 150 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseXcsrsort failed",cusparseStatus); 151 | 152 | // step 4: gather sorted cooVals 153 | if (ccx == mxREAL) 154 | { 155 | float *d_val = (float*)mxGPUGetDataReadOnly(val); 156 | float *d_val_sort = (float*)mxGPUGetData(val_sort); 157 | #if CUDART_VERSION >= 11000 158 | 
cusparseHandle_t handle = NULL; 159 | cusparseDnVecDescr_t vec_values; 160 | cusparseSpVecDescr_t vec_permutation; 161 | cusparseCreate(&handle); 162 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_R_32F); 163 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); // MUST USE BASE_ZERO 164 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation); 165 | cusparseDestroyDnVec(vec_values); 166 | cusparseDestroySpVec(vec_permutation); 167 | cusparseDestroy(handle); 168 | #else 169 | cusparseStatus = cusparseSgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO 170 | #endif 171 | } 172 | else 173 | { 174 | cuFloatComplex *d_val = (cuFloatComplex*)mxGPUGetDataReadOnly(val); 175 | cuFloatComplex *d_val_sort = (cuFloatComplex*)mxGPUGetData(val_sort); 176 | #if CUDART_VERSION >= 11000 177 | cusparseHandle_t handle = NULL; 178 | cusparseDnVecDescr_t vec_values; 179 | cusparseSpVecDescr_t vec_permutation; 180 | cusparseCreate(&handle); 181 | cusparseCreateDnVec(&vec_values, nnz, d_val, CUDA_C_32F); 182 | cusparseCreateSpVec(&vec_permutation, nnz, nnz, P, d_val_sort, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_C_32F); // MUST USE BASE_ZERO 183 | cusparseStatus = cusparseGather(handle, vec_values, vec_permutation); 184 | cusparseDestroyDnVec(vec_values); 185 | cusparseDestroySpVec(vec_permutation); 186 | cusparseDestroy(handle); 187 | #else 188 | cusparseStatus = cusparseCgthr(cusparseHandle, nnz, d_val, d_val_sort, P, CUSPARSE_INDEX_BASE_ZERO); // MUST USE BASE_ZERO 189 | #endif 190 | } 191 | if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) mxShowCriticalErrorMessage("Operation cusparseSgthr or cusparseCgthr failed",cusparseStatus); 192 | 193 | } 194 | 195 | // Return result 196 | COL_SORT = mxGPUCreateMxArrayOnGPU(col_sort); 197 | VAL_SORT = mxGPUCreateMxArrayOnGPU(val_sort); 198 | 199 | // Make sure operations are finished before deleting 200 | //cudaDeviceSynchronize(); 201 | 202 | // Clean up 203 | cusparseDestroyMatDescr(descr); 204 | cusparseDestroy(cusparseHandle); 205 | cublasDestroy(cublasHandle); 206 | mxGPUDestroyGPUArray(row_csr); 207 | mxGPUDestroyGPUArray(col); 208 | mxGPUDestroyGPUArray(col_sort); 209 | mxGPUDestroyGPUArray(val); 210 | mxGPUDestroyGPUArray(val_sort); 211 | if (pBuffer) cudaFree(pBuffer); 212 | if (P) cudaFree(P); 213 | 214 | return; 215 | } 216 | -------------------------------------------------------------------------------- /private/mex_all.m: -------------------------------------------------------------------------------- 1 | function mex_all() 2 | 3 | % checks 4 | if ~exist('/usr/local/cuda','dir') 5 | warning('/usr/local/cuda directory not found. 
Try:\n%s','"sudo ln -s /usr/local/cuda-11 /usr/local/cuda"') 6 | end 7 | 8 | % override MATLAB's supplied version of nvcc - not sure what difference this makes 9 | setenv('MW_ALLOW_ANY_CUDA','1') 10 | setenv('MW_NVCC_PATH', '/usr/local/cuda/bin') 11 | 12 | % need to be in the current directory for mexcuda 13 | oldpath = pwd; 14 | newpath = fileparts(mfilename('fullpath')); 15 | cd(newpath); 16 | 17 | % if the mexcuda fails, we are stuck - rethrow error 18 | try 19 | mex_all_compile(); 20 | cd(oldpath) 21 | catch ME 22 | cd(oldpath) 23 | rethrow(ME) 24 | end 25 | 26 | %% call mexcuda 27 | function mex_all_compile() 28 | 29 | mexcuda csrgeam.cu -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic -v 30 | mexcuda csrmv.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 31 | mexcuda coo2csr.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 32 | mexcuda csr2csc.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 33 | mexcuda csr2csc_cpu.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 34 | mexcuda csrmm.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 35 | mexcuda csr2coo.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 36 | mexcuda csrsort.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 37 | mexcuda coosortByRow.cu -R2018a -I/usr/local/cuda/include -L/usr/local/cuda/lib64 NVCCFLAGS='"$NVCCFLAGS -w -Wno-deprecated-gpu-targets"' LDFLAGS='"$LDFLAGS -Wl,--no-as-needed"' -ldl -lcusparse -lcublas -lculibos -dynamic 38 | -------------------------------------------------------------------------------- /private/mxShowCriticalErrorMessage.h: -------------------------------------------------------------------------------- 1 | // Beautified mex error message macro. 2 | // 3 | // Inspired by 4 | // http://www.advanpix.com/2016/02/14/short-and-informative-error-messages-from-mex/ 5 | // 6 | // Usage: 7 | // mxShowCriticalErrorMessage("my message"); 8 | // mxShowCriticalErrorMessage("my message", 123); 9 | // 10 | #ifndef mxShowCriticalErrorMessage 11 | 12 | #include "mex.h" 13 | #include 14 | #include 15 | 16 | // Macro to strip the path off __FILE__ (platform independent alternative to basename) 17 | #define __FUNC__ std::max(__FILE__,std::max(strrchr(__FILE__,'\\')+1,strrchr(__FILE__,'/')+1)) 18 | 19 | // Use macro to expand __FUNC__ and __LINE__ correctly 20 | #define mxShowCriticalErrorMessage(...) 
err_fn(__FUNC__,__LINE__,##__VA_ARGS__) 21 | 22 | // Use overloads to handle __VA_ARGS__ correctly 23 | void err_fn(const char *fn_name, int line_no, const char *err_message, int err_code) 24 | { 25 | const int nargs = 5; 26 | mxArray *err_args[nargs]; 27 | err_args[0] = mxCreateString("\n%s(%i): %s (%i).\n"); 28 | err_args[1] = mxCreateString(fn_name); 29 | err_args[2] = mxCreateDoubleMatrix(1,1,mxREAL); 30 | err_args[3] = mxCreateString(err_message); 31 | err_args[4] = mxCreateDoubleMatrix(1,1,mxREAL); 32 | *mxGetPr(err_args[2]) = line_no; 33 | *mxGetPr(err_args[4]) = err_code; 34 | mexCallMATLAB(0,0,nargs,err_args,"error"); 35 | } 36 | 37 | void err_fn(const char *fn_name, int line_no, const char *err_message) 38 | { 39 | const int nargs = 4; 40 | mxArray *err_args[nargs]; 41 | err_args[0] = mxCreateString("\n%s(%i): %s.\n"); 42 | err_args[1] = mxCreateString(fn_name); 43 | err_args[2] = mxCreateDoubleMatrix(1,1,mxREAL); 44 | err_args[3] = mxCreateString(err_message); 45 | *mxGetPr(err_args[2]) = line_no; 46 | mexCallMATLAB(0,0,nargs,err_args,"error"); 47 | } 48 | 49 | void err_fn(const char *fn_name, int line_no, int err_code) 50 | { 51 | err_fn(fn_name, line_no, "Error occurred", err_code); 52 | } 53 | 54 | void err_fn(const char *fn_name, int line_no) 55 | { 56 | err_fn(fn_name, line_no, "Error occurred"); 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /private/wrappers_to_cuda_11.h: -------------------------------------------------------------------------------- 1 | // wrappers to the new CUDA 11 interface for pre-11 code 2 | 3 | #include 4 | #include 5 | #include "mxShowCriticalErrorMessage.h" 6 | #include 7 | 8 | // for cuda 12 9 | #if CUDART_VERSION >= 12000 10 | 11 | #define CUSPARSE_MV_ALG_DEFAULT CUSPARSE_SPMV_ALG_DEFAULT 12 | #define CUSPARSE_CSR2CSC_ALG2 CUSPARSE_CSR2CSC_ALG1 13 | 14 | #endif 15 | // end of cuda 12 16 | 17 | #define CHECK_CUSPARSE(func) \ 18 | { \ 19 | cusparseStatus_t status = (func); \ 20 | if (status != CUSPARSE_STATUS_SUCCESS) \ 21 | mxShowCriticalErrorMessage(cusparseGetErrorString(status),status); \ 22 | } 23 | 24 | template cudaDataType type_to_enum(); 25 | template<> cudaDataType type_to_enum() { return CUDA_R_32F; } 26 | template<> cudaDataType type_to_enum() { return CUDA_C_32F; } 27 | 28 | // -------------------------------------------------------------------------------// 29 | template 30 | cusparseStatus_t 31 | cusparseXcsrmv_wrapper(cusparseHandle_t handle, 32 | cusparseOperation_t transA, 33 | int A_num_rows, 34 | int A_num_cols, 35 | int A_num_nnz, 36 | const T* alpha, 37 | const cusparseMatDescr_t descrA, 38 | const S* dA_values, 39 | const int* dA_csrOffsets, 40 | const int* dA_columns, 41 | const T* dX, 42 | const T* beta, 43 | void* dY) 44 | { 45 | cusparseSpMatDescr_t matA; 46 | cusparseDnVecDescr_t vecX, vecY; 47 | void* buffer = NULL; 48 | size_t bufferSize = 0; 49 | cudaDataType typeA = type_to_enum(); 50 | cudaDataType typeX = type_to_enum(); 51 | cudaDataType typeY = (typeA==CUDA_C_32F || typeX==CUDA_C_32F) ? 
CUDA_C_32F : CUDA_R_32F; 52 | 53 | //std::cout << "typeA " << typeA << " typeX " << typeX << " typeY " << typeY << std::endl; 54 | 55 | // Create sparse matrix A in CSR format 56 | CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz, 57 | (void*)dA_csrOffsets, (void*)dA_columns, (void*)dA_values, 58 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 59 | cusparseGetMatIndexBase(descrA), typeA) ) 60 | // Create dense vector X 61 | int X_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_cols : A_num_rows; 62 | CHECK_CUSPARSE( cusparseCreateDnVec(&vecX, X_rows, (void*)dX, typeX) ) 63 | // Create dense vector y 64 | int Y_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_rows : A_num_cols; 65 | CHECK_CUSPARSE( cusparseCreateDnVec(&vecY, Y_rows, (void*)dY, typeY) ) 66 | // allocate an external buffer if needed 67 | CHECK_CUSPARSE( cusparseSpMV_bufferSize( 68 | handle, transA, 69 | alpha, matA, vecX, beta, vecY, typeY, 70 | CUSPARSE_MV_ALG_DEFAULT, &bufferSize) ) 71 | if (bufferSize > 0) 72 | { 73 | cudaError_t status = cudaMalloc(&buffer, bufferSize); 74 | if (status != cudaSuccess) 75 | return CUSPARSE_STATUS_ALLOC_FAILED; 76 | } 77 | 78 | //std::cout << "bufferSize " << bufferSize << " CUSPARSE_STATUS_NOT_SUPPORTED " << CUSPARSE_STATUS_NOT_SUPPORTED << std::endl; 79 | 80 | // execute SpMV 81 | CHECK_CUSPARSE( cusparseSpMV(handle, transA, 82 | alpha, matA, vecX, beta, vecY, typeY, 83 | CUSPARSE_MV_ALG_DEFAULT, buffer) ) 84 | 85 | 86 | // destroy matrix/vector descriptors 87 | CHECK_CUSPARSE( cusparseDestroySpMat(matA) ) 88 | CHECK_CUSPARSE( cusparseDestroyDnVec(vecX) ) 89 | CHECK_CUSPARSE( cusparseDestroyDnVec(vecY) ) 90 | if(buffer) cudaFree(buffer); 91 | return CUSPARSE_STATUS_SUCCESS; 92 | } 93 | 94 | // -------------------------------------------------------------------------------// 95 | template 96 | cusparseStatus_t 97 | cusparseXcsrmm_wrapper(cusparseHandle_t handle, 98 | cusparseOperation_t transA, 99 | int A_num_rows, 100 | int A_num_cols, 101 | int B_num_cols, 102 | int A_num_nnz, 103 | const T* alpha, 104 | const cusparseMatDescr_t descrA, 105 | const S* dA_values, 106 | const int* dA_csrOffsets, 107 | const int* dA_columns, 108 | const T* dB, 109 | int ldb, 110 | const T* beta, 111 | void* dC, 112 | int ldc) 113 | { 114 | cusparseSpMatDescr_t matA; 115 | cusparseDnMatDescr_t matB, matC; 116 | void* buffer = NULL; 117 | size_t bufferSize = 0; 118 | cudaDataType typeA = type_to_enum(); 119 | cudaDataType typeB = type_to_enum(); 120 | cudaDataType typeC = (typeA==CUDA_C_32F || typeB==CUDA_C_32F) ? CUDA_C_32F : CUDA_R_32F; 121 | 122 | // handle some limited transpose functionality (A or A' only) 123 | cusparseOperation_t transB = CUSPARSE_OPERATION_NON_TRANSPOSE; 124 | int B_num_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_cols : A_num_rows; 125 | int C_num_rows = (transA==CUSPARSE_OPERATION_NON_TRANSPOSE) ? A_num_rows : A_num_cols; 126 | int C_num_cols = (transB==CUSPARSE_OPERATION_NON_TRANSPOSE) ? 
B_num_cols : B_num_rows; 127 | 128 | // Create sparse matrix A in CSR format 129 | CHECK_CUSPARSE( cusparseCreateCsr(&matA, A_num_rows, A_num_cols, A_num_nnz, 130 | (void*)dA_csrOffsets, (void*)dA_columns, (void*)dA_values, 131 | CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, 132 | cusparseGetMatIndexBase(descrA), typeA) ) 133 | // Create dense vector B 134 | CHECK_CUSPARSE( cusparseCreateDnMat(&matB, B_num_rows, B_num_cols, ldb, (void*)dB, typeB, CUSPARSE_ORDER_COL) ) 135 | // Create dense vector C 136 | CHECK_CUSPARSE( cusparseCreateDnMat(&matC, C_num_rows, C_num_cols, ldc, (void*)dC, typeC, CUSPARSE_ORDER_COL) ) 137 | // allocate an external buffer if needed 138 | CHECK_CUSPARSE( cusparseSpMM_bufferSize( 139 | handle, transA, transB, 140 | (void*)alpha, matA, matB, (void*)beta, matC, typeC, 141 | CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize) ) 142 | if (bufferSize > 0) { 143 | cudaError_t status = cudaMalloc(&buffer, bufferSize); 144 | if (status != cudaSuccess) 145 | return CUSPARSE_STATUS_ALLOC_FAILED; 146 | } 147 | // execute SpMM 148 | CHECK_CUSPARSE( cusparseSpMM(handle, transA, transB, 149 | alpha, matA, matB, beta, matC, typeC, 150 | CUSPARSE_SPMM_ALG_DEFAULT, buffer) ) 151 | // destroy matrix/vector descriptors 152 | CHECK_CUSPARSE( cusparseDestroySpMat(matA) ) 153 | CHECK_CUSPARSE( cusparseDestroyDnMat(matB) ) 154 | CHECK_CUSPARSE( cusparseDestroyDnMat(matC) ) 155 | if(buffer) cudaFree(buffer); 156 | return CUSPARSE_STATUS_SUCCESS; 157 | } 158 | 159 | // -------------------------------------------------------------------------------// 160 | template 161 | cusparseStatus_t 162 | cusparseXcsr2csc_wrapper(cusparseHandle_t handle, 163 | int m, 164 | int n, 165 | int nnz, 166 | const T* csrVal, 167 | const int* csrRowPtr, 168 | const int* csrColInd, 169 | T* cscVal, 170 | int* cscRowInd, 171 | int* cscColPtr, 172 | cusparseAction_t copyValues, 173 | cusparseIndexBase_t idxBase) 174 | { 175 | void* buffer = NULL; 176 | size_t bufferSize = 0; 177 | cudaDataType valType = type_to_enum(); 178 | cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG2; 179 | 180 | // fails if nnz==0 181 | if(nnz==0) 182 | { 183 | mxShowCriticalErrorMessage("BUG: cusparseCsr2cscEx2 fails when nnz=0"); 184 | return CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED; 185 | } 186 | 187 | // make buffer 188 | CHECK_CUSPARSE( cusparseCsr2cscEx2_bufferSize( 189 | handle, 190 | m, 191 | n, 192 | nnz, 193 | csrVal, 194 | csrRowPtr, 195 | csrColInd, 196 | cscVal, 197 | cscColPtr, 198 | cscRowInd, 199 | valType, 200 | copyValues, 201 | idxBase, 202 | alg, 203 | &bufferSize) ) 204 | 205 | if (bufferSize > 0) 206 | { 207 | cudaError_t status = cudaMalloc(&buffer, bufferSize); 208 | if (status != cudaSuccess) 209 | return CUSPARSE_STATUS_ALLOC_FAILED; 210 | } 211 | 212 | CHECK_CUSPARSE( cusparseCsr2cscEx2( 213 | handle, 214 | m, 215 | n, 216 | nnz, 217 | csrVal, 218 | csrRowPtr, 219 | csrColInd, 220 | cscVal, 221 | cscColPtr, 222 | cscRowInd, 223 | valType, 224 | copyValues, 225 | idxBase, 226 | alg, 227 | buffer) ) 228 | 229 | if(buffer) cudaFree(buffer); 230 | return CUSPARSE_STATUS_SUCCESS; 231 | } 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /test_gpuSparse.m: -------------------------------------------------------------------------------- 1 | % test gpuSparse class 2 | clear all 3 | reset(gpuDevice(1)) 4 | 5 | M = 121401; 6 | N = 113331; 7 | P = 5e-4; 8 | 9 | %M = 4; 10 | %N = 4; 11 | %P = 1; 12 | 13 | disp('---SETUP---') 14 | 15 | rand('state',0); 16 | randn('state',0); 17 | 18 | 
tic; fprintf('Making sparse... ') 19 | A = sprandn(M,N,P); 20 | toc 21 | 22 | % remove unwanted precision 23 | [i j v] = find(A); 24 | v = double(single(v)); 25 | A = sparse(i,j,v,M,N); 26 | 27 | tic; fprintf('Converting to gpuSparse... ') 28 | a = gpuSparse(A); validate(a) 29 | toc 30 | 31 | fprintf('Sorted index conversion to gpuSparse: ') 32 | [i j v] = find(a); 33 | tic; 34 | b = gpuSparse(i,j,v,M,N); validate(b) 35 | fprintf('errors = [%i %i %i]. ',any(a.row~=b.row),any(a.col~=b.col),any(a.val~=b.val)) 36 | toc 37 | 38 | fprintf('Unsorted index conversion to gpuSparse: ') 39 | k = randperm(numel(v)); 40 | i = i(k); 41 | j = j(k); 42 | v = v(k); 43 | tic; 44 | b = gpuSparse(i,j,v,M,N); validate(b) 45 | fprintf('errors = [%i %i %i]. ',any(a.row~=b.row),any(a.col~=b.col),any(a.val~=b.val)) 46 | toc 47 | 48 | x = randn(N,1,'gpuArray'); 49 | y = randn(M,1,'gpuArray'); 50 | 51 | % remove unwanted precision 52 | x = double(single(x)); 53 | y = double(single(y)); 54 | 55 | %% Expected failues (bounds etc) 56 | disp('---CATCH ERRORS---') 57 | 58 | try; gpuSparse('test'); warning('failed'); end 59 | try; gpuSparse(rand(3,3,3)); warning('failed'); end 60 | try; gpuSparse(1,-1); warning('failed'); end 61 | try; gpuSparse(-1,1); warning('failed'); end 62 | try; gpuSparse(1,Inf); warning('failed'); end 63 | try; gpuSparse(1,NaN); warning('failed'); end 64 | try; gpuSparse(Inf,1); warning('failed'); end 65 | try; gpuSparse(NaN,1); warning('failed'); end 66 | try; gpuSparse(intmax('int32'),1); warning('failed'); end 67 | try; gpuSparse(1,intmax('int32')); warning('failed'); end 68 | try; gpuSparse(1,-1,0); warning('failed'); end 69 | try; gpuSparse(-1,1,0); warning('failed'); end 70 | try; gpuSparse(1,Inf,0); warning('failed'); end 71 | try; gpuSparse(1,NaN,0); warning('failed'); end 72 | try; gpuSparse(Inf,1,0); warning('failed'); end 73 | try; gpuSparse(NaN,1,0); warning('failed'); end 74 | try; gpuSparse(intmax('int32'),1,0); warning('failed'); end 75 | try; gpuSparse(1,intmax('int32'),0); warning('failed'); end 76 | try; gpuSparse(1,1,'test'); warning('failed'); end 77 | try; gpuSparse(1:2,1:1,1:2); warning('failed'); end 78 | try; gpuSparse(1:1,1:2,1:2); warning('failed'); end 79 | %try; gpuSparse(1:2,1:2,1:1); warning('failed'); end % this works... why is it here?! 
80 | try; gpuSparse(1:1,1:1,1:2); warning('failed'); end 81 | try; gpuSparse(1,1,1,10,0); warning('failed'); end 82 | try; gpuSparse(1,1,1,0,10); warning('failed'); end 83 | try; gpuSparse(1,1,1,10,intmax('int32')); warning('failed'); end 84 | try; gpuSparse(1,1,1,intmax('int32'),10); warning('failed'); end 85 | try; gpuSparse(1,1,10,10,'test'); warning('failed'); end 86 | try; gpuSparse(1,1,10,'test',10); warning('failed'); end 87 | try; gpuSparse(10,10,1,10,9); warning('failed'); end 88 | try; gpuSparse(10,10,1,9,10); warning('failed'); end 89 | try; gpuSparse(10,10,1,10,10,-1); warning('failed'); end 90 | try; gpuSparse(10,10,1,10,10,Inf); warning('failed'); end 91 | try; gpuSparse(1.5,1,1,10,10,1); warning('failed'); end 92 | try; gpuSparse(1,1.5,1,10,10,1); warning('failed'); end 93 | try; gpuSparse(1,1,1,10.5,10,1); warning('failed'); end 94 | try; gpuSparse(1,1,1,10,10.5,1); warning('failed'); end 95 | try; gpuSparse(1,1,1,10,10,1.5); warning('failed'); end 96 | try; gpuSparse(1,1,1,10:11,10,1); warning('failed'); end 97 | try; gpuSparse(1,1,1,10,10:11,1); warning('failed'); end 98 | try; gpuSparse(1,1,1,10,10,1:2); warning('failed'); end 99 | 100 | %% accuracy 101 | disp('---ACCURACY---') 102 | 103 | disp([' Ax ' num2str(norm(A*x-a*single(x),Inf))]) 104 | disp([' A''*y ' num2str(norm(A'*y-a'*single(y),Inf))]) 105 | 106 | B = sprandn(M,N,P); 107 | 108 | % remove unwanted precision 109 | [i j v] = find(B); 110 | v = double(single(v)); 111 | B = sparse(i,j,v,M,N); 112 | 113 | b = gpuSparse(B); validate(b) 114 | 115 | C=A+B; 116 | c=a+b; validate(c) 117 | disp(['(A+B)x ' num2str(norm(C*x-c*single(x),Inf))]) 118 | disp(['(A+B)''*y ' num2str(norm(C'*y-c'*single(y),Inf))]) 119 | 120 | C=A-B; 121 | c=a-b; validate(c) 122 | disp(['(A-B)x ' num2str(norm(C*x-c*single(x),Inf))]) 123 | disp(['(A-B)''*y ' num2str(norm(C'*y-c'*single(y),Inf))]) 124 | 125 | d = a - (a')'; validate(d) 126 | disp(['max(a-a'''') ' num2str(max(d.val))]) 127 | disp(['min(a-a'''') ' num2str(min(d.val))]) 128 | 129 | d = a - full_transpose(full_transpose(a)); validate(d) 130 | disp(['max(a-a'''') ' num2str(max(d.val)) ' (full_transpose)']) 131 | disp(['min(a-a'''') ' num2str(min(d.val)) ' (full_transpose)']) 132 | 133 | B = double(single(randn(N,3))); 134 | b = gpuArray(B); 135 | 136 | C = A*B; 137 | c = a*single(b); 138 | disp(['(A*B-a*b) ' num2str([norm([C-c],Inf)])]) 139 | 140 | B = double(single(randn(M,4))); 141 | b = gpuArray(B); 142 | 143 | C = A'*B; 144 | c = a'*single(b); 145 | disp(['(A''*B-a''*b) ' num2str([norm([C-c],Inf)])]) 146 | 147 | %% miscellaneous operations 148 | 149 | disp('---MISCELLANEOUS---') 150 | 151 | % mixed real/complex multiplies 152 | 153 | A = A + 1i*sprandn(A); 154 | 155 | % remove unwanted precision 156 | [i j v] = find(A); 157 | v = double(single(v)); 158 | A = sparse(i,j,v,M,N); 159 | 160 | a = gpuSparse(A); validate(a) 161 | 162 | x = single(randn(N,1) + 1i*randn(N,1,'gpuArray')); 163 | y = single(randn(M,1) + 1i*randn(M,1,'gpuArray')); 164 | 165 | disp('real multiply') 166 | disp(norm(real(A)*double(real(x)) - real(a)*real(x),Inf)) 167 | disp(norm(real(A')*double(real(y)) - real(a')*real(y),Inf)) 168 | disp(norm(real(A.')*double(real(y)) - real(a.')*real(y),Inf)) 169 | 170 | disp('complex multiply') 171 | disp(norm(A*double(x) - a*x,Inf)) 172 | disp(norm(A'*double(y) - a'*y,Inf)) 173 | disp(norm(A.'*double(y) - a.'*y,Inf)) 174 | 175 | disp('mixed real/complex multiply') 176 | disp(norm(A*real(double(x)) - a*real(x),Inf)) 177 | disp(norm(real(A)*double(x) - real(a)*x,Inf)) 178 | 
disp(norm(A'*real(double(y)) - a'*real(y),Inf)) 179 | disp(norm(A.'*real(double(y)) - a.'*real(y),Inf)) 180 | disp(norm(real(A')*double(y) - real(a')*y,Inf)) 181 | disp(norm(real(A.')*double(y) - real(a.')*y,Inf)) 182 | 183 | disp('max') 184 | disp(norm(full(max(A,[],2)) - max(a,[],2))) 185 | 186 | disp('sum') 187 | disp(norm(sum(A,1) - sum(a,1),inf)) 188 | disp(norm(sum(A,2) - sum(a,2),inf)) 189 | 190 | disp('norm') 191 | disp(norm(A,1) - norm(a,1)) 192 | disp(norm(A,inf) - norm(a,inf)) 193 | disp(norm(A,'fro') - norm(a,'fro')) 194 | 195 | disp('sparse'); 196 | disp(norm(sparse(a)-A,inf)); 197 | disp(norm(sparse(a')-A',inf)); 198 | disp(norm(sparse(a.')-A.',inf)); 199 | disp('full_transpose(a)') 200 | at = full_transpose(a); validate(at); 201 | disp(norm(sparse(at)-A.',inf)) 202 | disp('full_ctranspose(a)') 203 | at = full_ctranspose(a); validate(at); 204 | disp(norm(sparse(at)-A',inf)) 205 | disp('full_transpose(a.'')') 206 | att = full_transpose(a.'); validate(att); 207 | disp(norm(sparse(att)-(A.').',inf)) 208 | disp('full_transpose(a'')') 209 | att = full_transpose(a'); validate(att); 210 | disp(norm(sparse(att)-(A').',inf)) 211 | disp('full_ctranspose(a.'')') 212 | att = full_ctranspose(a.'); validate(att); 213 | disp(norm(sparse(att)-(A.')',inf)) 214 | disp('full_ctranspose(a'')') 215 | att = full_ctranspose(a'); validate(att); 216 | disp(norm(sparse(att)-(A')',inf)) 217 | 218 | disp('find') 219 | [i j v] = find(A); [i2 j2 v2] = find(a); 220 | fprintf(' %i %i %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2)) 221 | [i j v] = find(A'); [i2 j2 v2] = find(a'); 222 | fprintf(' %i %i %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2)) 223 | [i j v] = find(A.'); [i2 j2 v2] = find(a.'); 224 | fprintf(' %i %i %g\n',norm(i-i2),norm(j-j2),norm(single(v)-v2)) 225 | 226 | % these fail - values are in different order 227 | %disp('nonzeros') 228 | %disp(norm(nonzeros(A)-nonzeros(a),inf)) 229 | %disp(norm(nonzeros(A')-nonzeros(a'),inf)) 230 | %disp(norm(nonzeros(A.')-nonzeros(a.'),inf)) 231 | 232 | disp('addition') 233 | B = sprandn(M,N,P); 234 | 235 | [i j v] = find(B); % remove unwanted precision 236 | v = double(single(v)); 237 | B = sparse(i,j,v,M,N); 238 | 239 | b = gpuSparse(B); validate(b) 240 | 241 | A = real(A); B = real(B); 242 | a = real(a); validate(a); 243 | b = real(b); validate(b); 244 | c = a+b; validate(c); 245 | 246 | disp(norm((A+B) - sparse(a+b),Inf)) 247 | disp(norm((A'+B') - sparse(a'+b'),Inf)) 248 | disp(norm((A.'+B.') - sparse(a.'+b.'),Inf)) 249 | disp(norm((A+B)' - sparse((a+b)'),Inf)) 250 | 251 | disp('cat') 252 | C = [A;B]; 253 | c = [a;b]; 254 | disp(norm(C-c,'fro')) 255 | C = [A B]; 256 | c = [a b]; 257 | disp(norm(C-c,'fro')) 258 | 259 | %% timings 260 | disp('---TIMINGS---') 261 | 262 | for j = 1:2 263 | 264 | A = gather(A); 265 | x = gather(x); 266 | y = gather(y); 267 | 268 | x = double(x); 269 | y = double(y); 270 | 271 | % to test mm as well as mv multiply 272 | if j==1 273 | fprintf('\n============= Matrix-vector multiply =============\n'); 274 | else 275 | x = repmat(x,1,5); 276 | y = repmat(y,1,5); 277 | fprintf('\n========= Matrix-matrix multiply (cols %i) =========\n',size(x,2)); 278 | end 279 | 280 | tic; fprintf('A*x (sparse) : ') 281 | for k = 1:20 282 | z = A*x; wait(gpuDevice); 283 | end 284 | toc; 285 | 286 | AT = A'; 287 | tic; fprintf('AT*y (sparse) : ') 288 | for k = 1:20 289 | z = AT*y; wait(gpuDevice); 290 | end 291 | toc; 292 | 293 | tic; fprintf('A''*y (sparse) : ') 294 | for k = 1:20 295 | z = A'*y; wait(gpuDevice); 296 | end 297 | toc; 298 | 299 | A 
= gpuArray(A); 300 | x = gpuArray(x); 301 | y = gpuArray(y); 302 | 303 | tic; fprintf('\nA*x (gpuArray) : ') 304 | for k = 1:20 305 | z = A*x; wait(gpuDevice); 306 | end 307 | toc; 308 | 309 | AT = A'; 310 | tic; fprintf('AT*y (gpuArray) : ') 311 | for k = 1:20 312 | z = AT*y; wait(gpuDevice); 313 | end 314 | toc; 315 | 316 | tic; fprintf('A''*y (gpuArray) : ') 317 | for k = 1:20 318 | z = A'*y; wait(gpuDevice); 319 | end 320 | toc; 321 | 322 | a = gpuSparse(A); validate(a) 323 | x = single(x); 324 | y = single(y); 325 | 326 | tic; fprintf('\nA*x (gpuSparse): ') 327 | for k = 1:20 328 | z = a*x; wait(gpuDevice); 329 | end 330 | toc; 331 | 332 | at = full_transpose(a); validate(at) 333 | tic; fprintf('At*y (gpuSparse): ') 334 | for k = 1:20 335 | z = at*y; wait(gpuDevice); 336 | end 337 | toc; 338 | 339 | tic; fprintf('A''*y (gpuSparse): ') 340 | for k = 1:20 341 | z = a'*y; wait(gpuDevice); 342 | end 343 | toc; 344 | 345 | end --------------------------------------------------------------------------------