├── test
│   ├── REQUIRE
│   ├── util.jl
│   ├── runtests.jl
│   ├── rand.jl
│   ├── dnn.jl
│   ├── sparse_solver.jl
│   ├── base.jl
│   ├── fft.jl
│   └── solver.jl
├── docs
│   ├── src
│   │   ├── index.md
│   │   └── tutorials
│   │       ├── intro1.png
│   │       └── common.jl
│   ├── .gitignore
│   ├── Project.toml
│   └── make.jl
├── deps
│   ├── .gitignore
│   └── build.jl
├── bors.toml
├── .gitignore
├── src
│   ├── deprecated.jl
│   ├── indexing.jl
│   ├── dnn
│   │   ├── error.jl
│   │   ├── CUDNN.jl
│   │   ├── nnlib.jl
│   │   ├── helpers.jl
│   │   └── libcudnn_types.jl
│   ├── blas
│   │   ├── util.jl
│   │   ├── CUBLAS.jl
│   │   ├── error.jl
│   │   ├── README.md
│   │   ├── libcublas_types.jl
│   │   └── highlevel.jl
│   ├── fft
│   │   ├── CUFFT.jl
│   │   ├── genericfft.jl
│   │   ├── error.jl
│   │   ├── fft.jl
│   │   ├── libcufft_types.jl
│   │   ├── libcufft.jl
│   │   ├── highlevel.jl
│   │   └── wrappers.jl
│   ├── nnlib.jl
│   ├── rand
│   │   ├── CURAND.jl
│   │   ├── error.jl
│   │   ├── libcurand_types.jl
│   │   ├── highlevel.jl
│   │   └── libcurand.jl
│   ├── sparse
│   │   ├── CUSPARSE.jl
│   │   ├── error.jl
│   │   ├── highlevel.jl
│   │   ├── libcusparse.jl
│   │   ├── libcusparse_types.jl
│   │   └── array.jl
│   ├── subarray.jl
│   ├── solver
│   │   ├── error.jl
│   │   ├── CUSOLVER.jl
│   │   ├── libcusolver_types.jl
│   │   ├── highlevel.jl
│   │   └── libcusolver.jl
│   ├── utils.jl
│   ├── broadcast.jl
│   ├── gpuarray_interface.jl
│   ├── CuArrays.jl
│   ├── matmul.jl
│   ├── mapreduce.jl
│   └── array.jl
├── REQUIRE
├── .github
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── Project.toml
├── LICENSE.md
├── .gitlab-ci.yml
└── README.md

/test/REQUIRE:
--------------------------------------------------------------------------------
1 | FFTW
2 | 
--------------------------------------------------------------------------------
/docs/src/index.md:
--------------------------------------------------------------------------------
1 | # CuArrays.jl
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | src/**/generated/
2 | 
--------------------------------------------------------------------------------
/deps/.gitignore:
--------------------------------------------------------------------------------
1 | ext.jl.bak
2 | build.log
3 | 
4 | 
--------------------------------------------------------------------------------
/bors.toml:
--------------------------------------------------------------------------------
1 | status = [
2 |   "ci/gitlab/%"
3 | ]
4 | delete_merged_branches = true
5 | 
--------------------------------------------------------------------------------
/docs/src/tutorials/intro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dpsanders/CuArrays.jl/master/docs/src/tutorials/intro1.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.jl.cov
2 | *.jl.*.cov
3 | *.jl.mem
4 | deps/ext.jl
5 | Manifest.toml
6 | tutorials/build/
7 | docs/build/
8 | 
--------------------------------------------------------------------------------
/src/deprecated.jl:
--------------------------------------------------------------------------------
1 | # Deprecated functionality
2 | 
3 | import Base: @deprecate_binding
4 | 
5 | @deprecate_binding BLAS CUBLAS
6 | @deprecate_binding FFT CUFFT
7 | 
--------------------------------------------------------------------------------
/REQUIRE:
--------------------------------------------------------------------------------
1 | julia 1.0
2 | CUDAnative 1.1
3 | CUDAdrv 1.1
4 | CUDAapi 0.5.3
5 | NNlib 0.5.0
6 | GPUArrays 0.5
7 | Adapt 0.4
8 | 
AbstractFFTs 9 | MacroTools 10 | ForwardDiff 11 | DiffRules 12 | -------------------------------------------------------------------------------- /docs/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 3 | Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" 4 | Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" 5 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 6 | -------------------------------------------------------------------------------- /test/util.jl: -------------------------------------------------------------------------------- 1 | macro grab_output(ex) 2 | quote 3 | mktemp() do fname, fout 4 | ret = nothing 5 | open(fname, "w") do fout 6 | redirect_stdout(fout) do 7 | ret = $(esc(ex)) 8 | end 9 | end 10 | ret, read(fname, String) 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /src/indexing.jl: -------------------------------------------------------------------------------- 1 | import GPUArrays: allowscalar, @allowscalar 2 | 3 | function _getindex(xs::CuArray{T}, i::Integer) where T 4 | buf = Mem.view(buffer(xs), (i-1)*sizeof(T)) 5 | return Mem.download(T, buf)[1] 6 | end 7 | 8 | function _setindex!(xs::CuArray{T}, v::T, i::Integer) where T 9 | buf = Mem.view(buffer(xs), (i-1)*sizeof(T)) 10 | Mem.upload!(buf, T[v]) 11 | end 12 | -------------------------------------------------------------------------------- /docs/src/tutorials/common.jl: -------------------------------------------------------------------------------- 1 | # function to run a Julia script outside of the current environment 2 | function script(code; wrapper=``, args=``) 3 | if Base.JLOptions().project != C_NULL 4 | args = `$args --project=$(unsafe_string(Base.JLOptions().project))` 5 | end 6 | mktemp() do path, io 7 | write(io, code) 8 | flush(io) 9 | cmd = `$wrapper $(Base.julia_cmd()) $args $path` 10 | # redirect stderr to stdout to have it picked up by Weave.jl 11 | run(pipeline(ignorestatus(cmd), stderr=stdout)) 12 | end 13 | nothing 14 | end 15 | -------------------------------------------------------------------------------- /src/dnn/error.jl: -------------------------------------------------------------------------------- 1 | export CUDNNError 2 | 3 | struct CUDNNError <: Exception 4 | code::cudnnStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUDNNError) = print(io, "CUDNNError(code $(err.code), $(err.msg))") 8 | 9 | function CUDNNError(status::cudnnStatus_t) 10 | msg = unsafe_string(cudnnGetErrorString(status)) 11 | return CUDNNError(status, msg) 12 | end 13 | 14 | macro check(dnn_func) 15 | quote 16 | local err::cudnnStatus_t 17 | err = $(esc(dnn_func)) 18 | if err != CUDNN_STATUS_SUCCESS 19 | throw(CUDNNError(err)) 20 | end 21 | err 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 
14 | 
15 | **Describe alternatives you've considered**
16 | A clear and concise description of any alternative solutions or features you've considered.
17 | 
18 | **Additional context**
19 | Add any other context or screenshots about the feature request here.
20 | 
--------------------------------------------------------------------------------
/docs/make.jl:
--------------------------------------------------------------------------------
1 | using Documenter
2 | using Literate
3 | 
4 | using Pkg
5 | if haskey(ENV, "GITLAB_CI")
6 |     Pkg.add([PackageSpec(name = x; rev = "master")
7 |              for x in ["CUDAapi", "GPUArrays", "CUDAnative", "NNlib", "CUDAdrv"]])
8 | end
9 | 
10 | using CuArrays
11 | 
12 | # generate tutorials
13 | OUTPUT = joinpath(@__DIR__, "src/tutorials/generated")
14 | Literate.markdown(joinpath(@__DIR__, "src/tutorials/intro.jl"), OUTPUT)
15 | 
16 | makedocs(
17 |     modules = [CuArrays],
18 |     format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"),
19 |     sitename = "CuArrays.jl",
20 |     pages = [
21 |         "Home" => "index.md",
22 |         "Tutorials" => [
23 |             "tutorials/generated/intro.md"
24 |         ],
25 |     ],
26 |     doctest = true
27 | )
28 | 
--------------------------------------------------------------------------------
/src/blas/util.jl:
--------------------------------------------------------------------------------
1 | # convert matrix to band storage
2 | function band(A::AbstractMatrix,kl,ku)
3 |     m, n = size(A)
4 |     AB = zeros(eltype(A),kl+ku+1,n)
5 |     for j = 1:n
6 |         for i = max(1,j-ku):min(m,j+kl)
7 |             AB[ku+1-j+i,j] = A[i,j]
8 |         end
9 |     end
10 |     return AB
11 | end
12 | 
13 | # convert band storage to general matrix
14 | function unband(AB::AbstractMatrix,m,kl,ku)
15 |     bm, n = size(AB)
16 |     A = zeros(eltype(AB),m,n)
17 |     for j = 1:n
18 |         for i = max(1,j-ku):min(m,j+kl)
19 |             A[i,j] = AB[ku+1-j+i,j]
20 |         end
21 |     end
22 |     return A
23 | end
24 | 
25 | # zero out elements not on matrix bands
26 | function bandex(A::AbstractMatrix,kl,ku)
27 |     m, n = size(A)
28 |     AB = band(A,kl,ku)
29 |     B = unband(AB,m,kl,ku)
30 |     return B
31 | end
32 | 
--------------------------------------------------------------------------------
/src/fft/CUFFT.jl:
--------------------------------------------------------------------------------
1 | module CUFFT
2 | 
3 | import CUDAapi
4 | 
5 | import CUDAdrv: CuPtr, PtrOrCuPtr
6 | 
7 | using ..CuArrays
8 | using ..CuArrays: libcufft, configured
9 | 
10 | import AbstractFFTs: plan_fft, plan_fft!, plan_bfft, plan_bfft!,
11 |     plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft,
12 |     Plan, ScaledPlan
13 | import Base: show, *, convert, unsafe_convert, size, strides, ndims
14 | import Base.Sys: WORD_SIZE
15 | 
16 | using LinearAlgebra
17 | import LinearAlgebra: mul!
18 | 
19 | include("libcufft_types.jl")
20 | include("error.jl")
21 | 
22 | include("libcufft.jl")
23 | include("genericfft.jl")
24 | include("fft.jl")
25 | include("wrappers.jl")
26 | include("highlevel.jl")
27 | 
28 | version() = VersionNumber(cufftGetProperty(CUDAapi.MAJOR_VERSION),
29 |                           cufftGetProperty(CUDAapi.MINOR_VERSION),
30 |                           cufftGetProperty(CUDAapi.PATCH_LEVEL))
31 | 
32 | end
33 | 
--------------------------------------------------------------------------------
/test/runtests.jl:
--------------------------------------------------------------------------------
1 | using Test
2 | 
3 | # development often happens in lockstep with other packages,
4 | # so check out the master branch of those packages.
5 | using Pkg 6 | if haskey(ENV, "GITLAB_CI") 7 | Pkg.add([PackageSpec(name = x; rev = "master") 8 | for x in ["CUDAapi", "GPUArrays", "CUDAnative", "NNlib", "CUDAdrv"]]) 9 | end 10 | 11 | include("util.jl") 12 | 13 | using Random 14 | Random.seed!(1) 15 | 16 | using CuArrays 17 | 18 | using GPUArrays 19 | import GPUArrays: allowscalar, @allowscalar 20 | 21 | testf(f, xs...; kwargs...) = GPUArrays.TestSuite.compare(f, CuArray, xs...; kwargs...) 22 | 23 | allowscalar(false) 24 | 25 | @testset "CuArrays" begin 26 | 27 | include("base.jl") 28 | include("dnn.jl") 29 | include("blas.jl") 30 | include("sparse.jl") 31 | include("solver.jl") 32 | include("fft.jl") 33 | include("rand.jl") 34 | include("sparse_solver.jl") 35 | 36 | CuArrays.pool_status() 37 | CuArrays.pool_timings() 38 | 39 | end 40 | -------------------------------------------------------------------------------- /src/dnn/CUDNN.jl: -------------------------------------------------------------------------------- 1 | module CUDNN 2 | 3 | import CUDAapi 4 | 5 | import CUDAdrv: CUDAdrv, CuContext, CuPtr, CU_NULL 6 | 7 | using ..CuArrays 8 | using ..CuArrays: libcudnn, active_context, configured, unsafe_free! 9 | 10 | include("libcudnn_types.jl") 11 | include("error.jl") 12 | 13 | const _handles = Dict{CuContext,cudnnHandle_t}() 14 | const _handle = Ref{cudnnHandle_t}(C_NULL) 15 | 16 | function handle() 17 | if _handle[] == C_NULL 18 | @assert isassigned(active_context) # some other call should have initialized CUDA 19 | _handle[] = get!(_handles, active_context[]) do 20 | context = active_context[] 21 | handle = cudnnCreate() 22 | atexit(()->CUDAdrv.isvalid(context) && cudnnDestroy(handle)) 23 | handle 24 | end 25 | end 26 | 27 | return _handle[] 28 | end 29 | 30 | include("libcudnn.jl") 31 | include("helpers.jl") 32 | include("nnlib.jl") 33 | 34 | version() = VersionNumber(cudnnGetProperty(CUDAapi.MAJOR_VERSION), 35 | cudnnGetProperty(CUDAapi.MINOR_VERSION), 36 | cudnnGetProperty(CUDAapi.PATCH_LEVEL)) 37 | 38 | end 39 | -------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | name = "CuArrays" 2 | uuid = "3a865a2d-5b23-5a0f-bc46-62713ec82fae" 3 | version = "0.10.0" 4 | 5 | [deps] 6 | AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" 7 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 8 | CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" 9 | CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" 10 | CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" 11 | DiffRules = "b552c78f-8df3-52c6-915a-8e097449b14b" 12 | ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" 13 | GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" 14 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 15 | MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" 16 | NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" 17 | Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 18 | Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 19 | SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 20 | TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" 21 | 22 | [extras] 23 | FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" 24 | Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 25 | Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 26 | 27 | [targets] 28 | test = ["Test", "FFTW", "Pkg"] 29 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The 
CuArrays.jl package is licensed under the MIT "Expat" License: 2 | 3 | > Copyright (c) 2017: Mike J Innes. 4 | > 5 | > Permission is hereby granted, free of charge, to any person obtaining a copy 6 | > of this software and associated documentation files (the "Software"), to deal 7 | > in the Software without restriction, including without limitation the rights 8 | > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | > copies of the Software, and to permit persons to whom the Software is 10 | > furnished to do so, subject to the following conditions: 11 | > 12 | > The above copyright notice and this permission notice shall be included in all 13 | > copies or substantial portions of the Software. 14 | > 15 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | > SOFTWARE. 22 | > 23 | -------------------------------------------------------------------------------- /src/nnlib.jl: -------------------------------------------------------------------------------- 1 | using NNlib 2 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 3 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 4 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 5 | using CUDAnative 6 | 7 | # Activation functions 8 | @cufunc σ(x) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x))) 9 | 10 | @cufunc function logσ(x) 11 | max_v = max(zero(x), -x) 12 | z = exp(-max_v) + exp(-x-max_v) 13 | -(max_v + log(z)) 14 | end 15 | 16 | @cufunc elu(x, α = one(x)) = 17 | ifelse(x ≥ 0, x/1, α * (exp(x) - one(x))) 18 | 19 | # TODO: make @cufunc recognise its own definitions 20 | cufunc(::typeof(swish)) = x -> x * cufunc(σ)(x) 21 | 22 | @cufunc function selu(x) 23 | λ = oftype(x/1, 1.0507009873554804934193349852946) 24 | α = oftype(x/1, 1.6732632423543772848170429916717) 25 | λ * ifelse(x > 0, x/1, α * (exp(x) - 1)) 26 | end 27 | 28 | @cufunc softplus(x) = log1p(exp(x)) 29 | 30 | if !@isdefined CUDNN 31 | function conv!(y::CuArray, x::CuArray, w::CuArray; kw...) 32 | error("CUDNN is not installed.") 33 | end 34 | function softmax!(out::CuVecOrMat, xs::CuVecOrMat) 35 | error("CUDNN is not installed.") 36 | end 37 | function logsoftmax!(out::CuVecOrMat, xs::CuVecOrMat) 38 | error("CUDNN is not installed.") 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /src/rand/CURAND.jl: -------------------------------------------------------------------------------- 1 | module CURAND 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuPtr 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcurand, active_context 8 | 9 | using GPUArrays 10 | 11 | using Random 12 | 13 | export curand, 14 | curandn, 15 | curand_logn, rand_logn!, 16 | curand_poisson, rand_poisson! 
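# Illustrative usage of the exports above -- a sketch drawn from this package's
# own test suite (test/rand.jl), assuming a functional CUDA setup; it is not
# executed as part of this module:
#
#   CURAND.seed!()                # reseed the default generator
#   A = curand(Float32, 2, 2)     # out-of-place uniform samples on the GPU
#   B = CuArray{Float32}(undef, 4)
#   rand_logn!(B)                 # in-place log-normal samples
#   rand_poisson!(CuArray{Cuint}(undef, 4))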
17 | 18 | include("libcurand_types.jl") 19 | include("error.jl") 20 | 21 | const _generators = Dict{CuContext,RNG}() 22 | const _generator = Ref{Union{Nothing,RNG}}(nothing) 23 | 24 | function generator() 25 | if _generator[] == nothing 26 | @assert isassigned(active_context) # some other call should have initialized CUDA 27 | _generator[] = get!(_generators, active_context[]) do 28 | context = active_context[] 29 | generator = create_generator() 30 | # FIXME: crashes 31 | #atexit(()->CUDAdrv.isvalid(context) && destroy_generator(generator)) 32 | generator 33 | end 34 | end 35 | 36 | return _generator[]::RNG 37 | end 38 | 39 | include("libcurand.jl") 40 | include("highlevel.jl") 41 | 42 | version() = VersionNumber(curandGetProperty(CUDAapi.MAJOR_VERSION), 43 | curandGetProperty(CUDAapi.MINOR_VERSION), 44 | curandGetProperty(CUDAapi.PATCH_LEVEL)) 45 | 46 | end 47 | -------------------------------------------------------------------------------- /test/rand.jl: -------------------------------------------------------------------------------- 1 | @testset "CURAND" begin 2 | 3 | if !isdefined(CuArrays, :CURAND) 4 | @warn "Not testing CURAND" 5 | else 6 | using CuArrays.CURAND 7 | @info "Testing CURAND $(CURAND.version())" 8 | 9 | CURAND.seed!() 10 | 11 | # in-place 12 | for (f,T) in ((rand!,Float32), 13 | (randn!,Float32), 14 | (rand_logn!,Float32), 15 | (rand_poisson!,Cuint)), 16 | d in (2, (2,2), (2,2,2)) 17 | A = CuArray{T}(undef, d) 18 | f(A) 19 | end 20 | 21 | # out-of-place, with implicit type 22 | for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), 23 | (curand_poisson,Cuint)), 24 | args in ((2,), (2, 2)) 25 | A = f(args...) 26 | @test eltype(A) == T 27 | end 28 | 29 | # out-of-place, with type specified 30 | for (f,T) in ((curand,Float32), (curandn,Float32), (curand_logn,Float32), 31 | (curand,Float64), (curandn,Float64), (curand_logn,Float64), 32 | (curand_poisson,Cuint)), 33 | args in ((T, 2), (T, 2, 2), (T, (2, 2))) 34 | A = f(args...) 35 | @test eltype(A) == T 36 | end 37 | 38 | # unsupported types that fall back to GPUArrays 39 | for (f,T) in ((curand,Int64),), 40 | args in ((T, 2), (T, 2, 2), (T, (2, 2))) 41 | A = f(args...) 42 | @test eltype(A) == T 43 | end 44 | for (f,T) in ((rand!,Int64),), 45 | d in (2, (2,2), (2,2,2)) 46 | A = CuArray{T}(undef, d) 47 | f(A) 48 | end 49 | 50 | end 51 | 52 | end 53 | -------------------------------------------------------------------------------- /src/blas/CUBLAS.jl: -------------------------------------------------------------------------------- 1 | module CUBLAS 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcublas, active_context, unsafe_free! 
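# NOTE: as with the other wrapper modules, a single CUBLAS handle is created
# lazily per CUDA context (see `handle()` below), cached in `_handles`, and
# destroyed at exit while its owning context is still valid.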
8 | 
9 | using LinearAlgebra
10 | 
11 | include("libcublas_types.jl")
12 | include("error.jl")
13 | 
14 | const _handles = Dict{CuContext,cublasHandle_t}()
15 | const _handle = Ref{cublasHandle_t}(C_NULL)
16 | 
17 | function handle()
18 |     if _handle[] == C_NULL
19 |         @assert isassigned(active_context) # some other call should have initialized CUDA
20 |         _handle[] = get!(_handles, active_context[]) do
21 |             context = active_context[]
22 |             handle = cublasCreate_v2()
23 | 
24 |             # enable tensor math mode if our device supports it, and fast math is enabled
25 |             dev = CUDAdrv.device(context)
26 |             if Base.JLOptions().fast_math == 1 && CUDAdrv.capability(dev) >= v"7.0"
27 |                 cublasSetMathMode(CUBLAS_TENSOR_OP_MATH, handle)
28 |             end
29 | 
30 |             atexit(()->CUDAdrv.isvalid(context) && cublasDestroy_v2(handle))
31 |             handle
32 |         end
33 |     end
34 | 
35 |     return _handle[]
36 | end
37 | 
38 | include("libcublas.jl")
39 | include("util.jl")
40 | include("wrappers.jl")
41 | include("highlevel.jl")
42 | 
43 | version() = VersionNumber(cublasGetProperty(CUDAapi.MAJOR_VERSION),
44 |                           cublasGetProperty(CUDAapi.MINOR_VERSION),
45 |                           cublasGetProperty(CUDAapi.PATCH_LEVEL))
46 | 
47 | end
48 | 
--------------------------------------------------------------------------------
/src/sparse/CUSPARSE.jl:
--------------------------------------------------------------------------------
1 | module CUSPARSE
2 | 
3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL
4 | import CUDAapi
5 | 
6 | using ..CuArrays
7 | using ..CuArrays: libcusparse, active_context, unsafe_free!
8 | 
9 | using SparseArrays
10 | using LinearAlgebra
11 | 
12 | import Base.one
13 | import Base.zero
14 | 
15 | const SparseChar = Char
16 | 
17 | export CuSparseMatrixCSC, CuSparseMatrixCSR,
18 |        CuSparseMatrixHYB, CuSparseMatrixBSR,
19 |        CuSparseMatrix, AbstractCuSparseMatrix,
20 |        CuSparseVector
21 | 
22 | include("libcusparse_types.jl")
23 | include("error.jl")
24 | 
25 | const _handles = Dict{CuContext,cusparseHandle_t}()
26 | const _handle = Ref{cusparseHandle_t}(C_NULL)
27 | 
28 | function handle()
29 |     if _handle[] == C_NULL
30 |         @assert isassigned(active_context) # some other call should have initialized CUDA
31 |         _handle[] = get!(_handles, active_context[]) do
32 |             context = active_context[]
33 |             handle = cusparseCreate()
34 |             atexit(()->CUDAdrv.isvalid(context) && cusparseDestroy(handle))
35 |             handle
36 |         end
37 |     end
38 | 
39 |     return _handle[]
40 | end
41 | 
42 | include("libcusparse.jl")
43 | include("array.jl")
44 | include("util.jl")
45 | include("wrappers.jl")
46 | include("highlevel.jl")
47 | 
48 | version() = VersionNumber(cusparseGetProperty(CUDAapi.MAJOR_VERSION),
49 |                           cusparseGetProperty(CUDAapi.MINOR_VERSION),
50 |                           cusparseGetProperty(CUDAapi.PATCH_LEVEL))
51 | 
52 | end
53 | 
--------------------------------------------------------------------------------
/src/subarray.jl:
--------------------------------------------------------------------------------
1 | import Base: view
2 | 
3 | using Base: ScalarIndex, ViewIndex, Slice, @_inline_meta, @boundscheck,
4 |             to_indices, compute_offset1, unsafe_length, _maybe_reshape_parent, index_ndims
5 | 
6 | struct Contiguous end
7 | struct NonContiguous end
8 | 
9 | # Detect whether the view is contiguous or not
10 | CuIndexStyle() = Contiguous()
11 | CuIndexStyle(I...) = NonContiguous()
12 | CuIndexStyle(i1::Colon, ::ScalarIndex...) = Contiguous()
13 | CuIndexStyle(i1::AbstractUnitRange, ::ScalarIndex...) = Contiguous()
14 | CuIndexStyle(i1::Colon, I...) = CuIndexStyle(I...)
15 | 
16 | cuviewlength() = ()
17 | cuviewlength(::Real, I...) = (@_inline_meta; cuviewlength(I...)) # skip scalars
18 | cuviewlength(i1::AbstractUnitRange, I...) = (@_inline_meta; (unsafe_length(i1), cuviewlength(I...)...))
19 | cuviewlength(i1::AbstractUnitRange, ::ScalarIndex...) = (@_inline_meta; (unsafe_length(i1),))
20 | 
21 | view(A::CuArray, I::Vararg{Any,N}) where {N} = (@_inline_meta; _cuview(A, I, CuIndexStyle(I...)))
22 | 
23 | function _cuview(A, I, ::Contiguous)
24 |     @_inline_meta
25 |     J = to_indices(A, I)
26 |     @boundscheck checkbounds(A, J...)
27 |     _cuview(_maybe_reshape_parent(A, index_ndims(J...)), J, cuviewlength(J...))
28 | end
29 | 
30 | # for contiguous views just return a new CuArray
31 | _cuview(A::CuArray{T}, I::NTuple{N,ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M} =
32 |     CuArray{T,M}(A.buf, dims; offset=A.offset + compute_offset1(A, 1, I) * sizeof(T), own=A.own)
33 | 
34 | # fallback to SubArray when the view is not contiguous
35 | _cuview(A, I, ::NonContiguous) = invoke(view, Tuple{AbstractArray, typeof(I).parameters...}, A, I...)
36 | 
--------------------------------------------------------------------------------
/src/solver/error.jl:
--------------------------------------------------------------------------------
1 | export CUSOLVERError
2 | 
3 | struct CUSOLVERError <: Exception
4 |     code::cusolverStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CUSOLVERError) = print(io, "CUSOLVERError(code $(err.code), $(err.msg))")
8 | 
9 | function CUSOLVERError(code::cusolverStatus_t)
10 |     msg = status_message(code)
11 |     return CUSOLVERError(code, msg)
12 | end
13 | 
14 | function status_message(status)
15 |     if status == CUSOLVER_STATUS_SUCCESS
16 |         return "the operation completed successfully"
17 |     elseif status == CUSOLVER_STATUS_NOT_INITIALIZED
18 |         return "the library was not initialized"
19 |     elseif status == CUSOLVER_STATUS_ALLOC_FAILED
20 |         return "the resource allocation failed"
21 |     elseif status == CUSOLVER_STATUS_INVALID_VALUE
22 |         return "an invalid value was used as an argument"
23 |     elseif status == CUSOLVER_STATUS_ARCH_MISMATCH
24 |         return "an absent device architectural feature is required"
25 |     elseif status == CUSOLVER_STATUS_EXECUTION_FAILED
26 |         return "the GPU program failed to execute"
27 |     elseif status == CUSOLVER_STATUS_INTERNAL_ERROR
28 |         return "an internal operation failed"
29 |     elseif status == CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED
30 |         return "the matrix type is not supported"
31 |     else
32 |         return "unknown status"
33 |     end
34 | end
35 | 
36 | macro check(solver_func)
37 |     quote
38 |         local err::cusolverStatus_t
39 |         err = $(esc(solver_func::Expr))
40 |         if err != CUSOLVER_STATUS_SUCCESS
41 |             throw(CUSOLVERError(err))
42 |         end
43 |         err
44 |     end
45 | end
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Sanity checks (read this first, then remove this section)**
11 | Make sure you're reporting *a bug*; for general questions, please use Discourse.
12 | 
13 | If you're dealing with a performance issue, make sure you **disable scalar iteration** (`CuArrays.allowscalar(false)`). Only file an issue if that shows scalar iteration happening within Base or CuArrays, as opposed to your own code.
14 | 
15 | If you're seeing an error message, **follow the error message instructions**, if any (e.g. `inspect code with @device_code_warntype`). If you can't solve the problem using that information, make sure to post it as part of the issue.
16 | 
17 | If your bug is still valid, please go ahead and fill out the template below.
18 | 
19 | **Describe the bug**
20 | A clear and concise description of what the bug is.
21 | 
22 | **To Reproduce**
23 | The Minimal Working Example (MWE) for this bug:
24 | ```julia
25 | # some code here
26 | ```
27 | 
28 | **Expected behavior**
29 | A clear and concise description of what you expected to happen.
30 | 
31 | **Build log**
32 | ```
33 | # post the output of Pkg.build()
34 | # make sure the error still reproduces after that.
35 | ```
36 | 
37 | **Environment details (please complete this section)**
38 | Details on Julia:
39 | ```
40 | # please post the output of:
41 | versioninfo()
42 | ```
43 | 
44 | Julia packages:
45 | - CuArrays.jl:
46 | - CUDAnative.jl:
47 | - ...
48 | 
49 | CUDA: toolkit and driver version
50 | 
51 | 
52 | **Additional context**
53 | Add any other context about the problem here.
54 | 
--------------------------------------------------------------------------------
/src/sparse/error.jl:
--------------------------------------------------------------------------------
1 | export CUSPARSEError
2 | 
3 | struct CUSPARSEError <: Exception
4 |     code::cusparseStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CUSPARSEError) = print(io, "CUSPARSEError(code $(err.code), $(err.msg))")
8 | 
9 | function CUSPARSEError(code::cusparseStatus_t)
10 |     msg = status_message(code)
11 |     return CUSPARSEError(code, msg)
12 | end
13 | 
14 | 
15 | function status_message(status)
16 |     if status == CUSPARSE_STATUS_SUCCESS
17 |         return "cusparse success"
18 |     end
19 |     if status == CUSPARSE_STATUS_NOT_INITIALIZED
20 |         return "cusparse not initialized"
21 |     end
22 |     if status == CUSPARSE_STATUS_ALLOC_FAILED
23 |         return "cusparse allocation failed"
24 |     end
25 |     if status == CUSPARSE_STATUS_INVALID_VALUE
26 |         return "cusparse invalid value"
27 |     end
28 |     if status == CUSPARSE_STATUS_ARCH_MISMATCH
29 |         return "cusparse architecture mismatch"
30 |     end
31 |     if status == CUSPARSE_STATUS_MAPPING_ERROR
32 |         return "cusparse mapping error"
33 |     end
34 |     if status == CUSPARSE_STATUS_EXECUTION_FAILED
35 |         return "cusparse execution failed"
36 |     end
37 |     if status == CUSPARSE_STATUS_INTERNAL_ERROR
38 |         return "cusparse internal error"
39 |     end
40 |     if status == CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED
41 |         return "cusparse matrix type not supported"
42 |     end
43 |     return "cusparse unknown status"
44 | end
45 | 
46 | macro check(sparse_func)
47 |     quote
48 |         local err = $(esc(sparse_func::Expr))
49 |         if err != CUSPARSE_STATUS_SUCCESS
50 |             throw(CUSPARSEError(cusparseStatus_t(err)))
51 |         end
52 |         err
53 |     end
54 | end
55 | 
56 | 
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | variables:
2 |   CI_IMAGE_TAG: 'cuda'
3 | 
4 | include:
5 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/common.yml'
6 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.0.yml'
7 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_v1.1.yml'
8 |   - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/test_dev.yml'
9 |   - 
'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/coverage_v1.1.yml' 10 | - 'https://raw.githubusercontent.com/JuliaGPU/gitlab-ci/master/templates/v2/documentation_v1.1.yml' 11 | 12 | test:v1.0: 13 | only: 14 | - master 15 | - staging 16 | - trying 17 | 18 | test:v1.1: 19 | only: 20 | - master 21 | - staging 22 | - trying 23 | 24 | test:dev: 25 | allow_failure: true 26 | only: 27 | - master 28 | - staging 29 | - trying 30 | 31 | coverage: 32 | allow_failure: true 33 | only: 34 | - master 35 | - staging 36 | - trying 37 | 38 | documentation: 39 | only: 40 | - master 41 | - staging 42 | - trying 43 | 44 | pages: 45 | stage: deploy 46 | script: 47 | - mv docs/build public 48 | artifacts: 49 | paths: 50 | - public 51 | only: 52 | - master 53 | 54 | flux: 55 | stage: test 56 | image: "juliagpu/julia:v1.1-cuda" 57 | script: 58 | - mkdir $JULIA_DEPOT_PATH # Pkg.jl#325 59 | - julia -e 'using Pkg; 60 | Pkg.develop(PackageSpec(path=pwd())); 61 | Pkg.build(); 62 | Pkg.add(PackageSpec(name="Flux", rev="master")); 63 | Pkg.test("Flux");' 64 | allow_failure: true 65 | only: 66 | - master 67 | - staging 68 | - trying 69 | -------------------------------------------------------------------------------- /src/fft/genericfft.jl: -------------------------------------------------------------------------------- 1 | cufftfloat(x) = _cufftfloat(float(x)) 2 | _cufftfloat(::Type{T}) where {T<:cufftReals} = T 3 | _cufftfloat(::Type{Float16}) = Float32 4 | _cufftfloat(::Type{Complex{T}}) where {T} = Complex{_cufftfloat(T)} 5 | _cufftfloat(::Type{T}) where {T} = error("type $T not supported") 6 | _cufftfloat(x::T) where {T} = _cufftfloat(T)(x) 7 | 8 | complexfloat(x::CuArray{Complex{<:cufftReals}}) = x 9 | realfloat(x::CuArray{<:cufftReals}) = x 10 | 11 | complexfloat(x::CuArray{T}) where {T<:Complex} = copy1(typeof(cufftfloat(zero(T))), x) 12 | complexfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(complex(cufftfloat(zero(T)))), x) 13 | 14 | realfloat(x::CuArray{T}) where {T<:Real} = copy1(typeof(cufftfloat(zero(T))), x) 15 | 16 | function copy1(::Type{T}, x) where T 17 | y = CuArray{T}(undef, map(length, axes(x))) 18 | #copy!(y, x) 19 | y .= broadcast(xi->convert(T,xi),x) 20 | end 21 | 22 | # promote to a complex floating-point type (out-of-place only), 23 | # so implementations only need Complex{Float} methods 24 | for f in (:fft, :bfft, :ifft) 25 | pf = Symbol("plan_", f) 26 | @eval begin 27 | $f(x::CuArray{<:Real}, region=1:ndims(x)) = $f(complexfloat(x), region) 28 | $pf(x::CuArray{<:Real}, region) = $pf(complexfloat(x), region) 29 | $f(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region=1:ndims(x)) = $f(complexfloat(x), region) 30 | $pf(x::CuArray{<:Complex{<:Union{Integer,Rational}}}, region) = $pf(complexfloat(x), region) 31 | end 32 | end 33 | rfft(x::CuArray{<:Union{Integer,Rational}}, region=1:ndims(x)) = rfft(realfloat(x), region) 34 | plan_rfft(x::CuArray{<:Real}, region) = plan_rfft(realfloat(x), region) 35 | 36 | *(p::Plan{T}, x::CuArray) where {T} = p * copy1(T, x) 37 | *(p::ScaledPlan, x::CuArray) = rmul!(p.p * x, p.scale) 38 | -------------------------------------------------------------------------------- /src/blas/error.jl: -------------------------------------------------------------------------------- 1 | export CUBLASError 2 | 3 | struct CUBLASError <: Exception 4 | code::cublasStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUBLASError) = print(io, "CUBLASError(code $(err.code), $(err.msg))") 8 | 9 | function 
CUBLASError(code::cublasStatus_t) 10 | msg = status_message(code) 11 | return CUBLASError(code, msg) 12 | end 13 | 14 | function status_message(status) 15 | if status == CUBLAS_STATUS_SUCCESS 16 | return "the operation completed successfully" 17 | elseif status == CUBLAS_STATUS_NOT_INITIALIZED 18 | return "the library was not initialized" 19 | elseif status == CUBLAS_STATUS_ALLOC_FAILED 20 | return "the resource allocation failed" 21 | elseif status == CUBLAS_STATUS_INVALID_VALUE 22 | return "an invalid value was used as an argument" 23 | elseif status == CUBLAS_STATUS_ARCH_MISMATCH 24 | return "an absent device architectural feature is required" 25 | elseif status == CUBLAS_STATUS_MAPPING_ERROR 26 | return "an access to GPU memory space failed" 27 | elseif status == CUBLAS_STATUS_EXECUTION_FAILED 28 | return "the GPU program failed to execute" 29 | elseif status == CUBLAS_STATUS_INTERNAL_ERROR 30 | return "an internal operation failed" 31 | elseif status == CUBLAS_STATUS_NOT_SUPPORTED 32 | return "the requested feature is not supported" 33 | elseif status == CUBLAS_STATUS_LICENSE_ERROR 34 | return "error detected trying to check the license" 35 | else 36 | return "unknown status" 37 | end 38 | end 39 | 40 | macro check(blas_func) 41 | quote 42 | local err::cublasStatus_t 43 | err = $(esc(blas_func::Expr)) 44 | if err != CUBLAS_STATUS_SUCCESS 45 | throw(CUBLASError(err)) 46 | end 47 | err 48 | end 49 | end -------------------------------------------------------------------------------- /src/solver/CUSOLVER.jl: -------------------------------------------------------------------------------- 1 | module CUSOLVER 2 | 3 | import CUDAdrv: CUDAdrv, CuContext, CuStream_t, CuPtr, PtrOrCuPtr, CU_NULL 4 | import CUDAapi 5 | 6 | using ..CuArrays 7 | using ..CuArrays: libcusolver, active_context, _getindex, unsafe_free! 
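# NOTE: CUSOLVER wraps two library APIs with separate handles: cusolverDn for
# dense problems and cusolverSp for sparse ones; `dense_handle()` and
# `sparse_handle()` below create and cache them lazily, one per CUDA context.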
8 | 
9 | using LinearAlgebra
10 | using SparseArrays
11 | 
12 | import Base.one
13 | import Base.zero
14 | import CuArrays.CUSPARSE.CuSparseMatrixCSR
15 | import CuArrays.CUSPARSE.CuSparseMatrixCSC
16 | import CuArrays.CUSPARSE.cusparseMatDescr_t
17 | 
18 | include("libcusolver_types.jl")
19 | include("error.jl")
20 | 
21 | const _dense_handles = Dict{CuContext,cusolverDnHandle_t}()
22 | const _dense_handle = Ref{cusolverDnHandle_t}(C_NULL)
23 | const _sparse_handles = Dict{CuContext,cusolverSpHandle_t}()
24 | const _sparse_handle = Ref{cusolverSpHandle_t}(C_NULL)
25 | 
26 | function dense_handle()
27 |     if _dense_handle[] == C_NULL
28 |         @assert isassigned(active_context) # some other call should have initialized CUDA
29 |         _dense_handle[] = get!(_dense_handles, active_context[]) do
30 |             context = active_context[]
31 |             handle = cusolverDnCreate()
32 |             atexit(()->CUDAdrv.isvalid(context) && cusolverDnDestroy(handle))
33 |             handle
34 |         end
35 |     end
36 |     return _dense_handle[]
37 | end
38 | 
39 | function sparse_handle()
40 |     if _sparse_handle[] == C_NULL
41 |         @assert isassigned(active_context) # some other call should have initialized CUDA
42 |         _sparse_handle[] = get!(_sparse_handles, active_context[]) do
43 |             context = active_context[]
44 |             handle = cusolverSpCreate()
45 |             atexit(()->CUDAdrv.isvalid(context) && cusolverSpDestroy(handle))
46 |             handle
47 |         end
48 |     end
49 |     return _sparse_handle[]
50 | end
51 | 
52 | include("libcusolver.jl")
53 | include("sparse.jl")
54 | include("dense.jl")
55 | include("highlevel.jl")
56 | 
57 | version() = VersionNumber(cusolverGetProperty(CUDAapi.MAJOR_VERSION),
58 |                           cusolverGetProperty(CUDAapi.MINOR_VERSION),
59 |                           cusolverGetProperty(CUDAapi.PATCH_LEVEL))
60 | 
61 | end
62 | 
--------------------------------------------------------------------------------
/src/rand/error.jl:
--------------------------------------------------------------------------------
1 | export CURANDError
2 | 
3 | struct CURANDError <: Exception
4 |     code::curandStatus_t
5 |     msg::AbstractString
6 | end
7 | Base.show(io::IO, err::CURANDError) = print(io, "CURANDError(code $(err.code), $(err.msg))")
8 | 
9 | function CURANDError(code::curandStatus_t)
10 |     msg = status_message(code)
11 |     return CURANDError(code, msg)
12 | end
13 | 
14 | function status_message(status)
15 |     if status == CURAND_STATUS_SUCCESS
16 |         return "generator was created successfully"
17 |     elseif status == CURAND_STATUS_VERSION_MISMATCH
18 |         return "Header file and linked library version do not match"
19 |     elseif status == CURAND_STATUS_NOT_INITIALIZED
20 |         return "Generator not initialized"
21 |     elseif status == CURAND_STATUS_ALLOCATION_FAILED
22 |         return "Memory allocation failed"
23 |     elseif status == CURAND_STATUS_TYPE_ERROR
24 |         return "Generator is wrong type"
25 |     elseif status == CURAND_STATUS_OUT_OF_RANGE
26 |         return "Argument out of range"
27 |     elseif status == CURAND_STATUS_LENGTH_NOT_MULTIPLE
28 |         return "Length requested is not a multiple of dimension"
29 |     elseif status == CURAND_STATUS_DOUBLE_PRECISION_REQUIRED
30 |         return "GPU does not have double precision required by MRG32k3a"
31 |     elseif status == CURAND_STATUS_LAUNCH_FAILURE
32 |         return "Kernel launch failure"
33 |     elseif status == CURAND_STATUS_PREEXISTING_FAILURE
34 |         return "Preexisting failure on library entry"
35 |     elseif status == CURAND_STATUS_INITIALIZATION_FAILED
36 |         return "Initialization of CUDA failed"
37 |     elseif status == CURAND_STATUS_ARCH_MISMATCH
38 |         return "Architecture mismatch, GPU does not support requested feature"
39 |     elseif status == 
CURAND_STATUS_INTERNAL_ERROR 40 | return "Internal library error" 41 | else 42 | return "unknown status" 43 | end 44 | end 45 | 46 | macro check(func) 47 | quote 48 | local err::curandStatus_t 49 | err = $(esc(func::Expr)) 50 | if err != CURAND_STATUS_SUCCESS 51 | throw(CURANDError(err)) 52 | end 53 | err 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /src/blas/README.md: -------------------------------------------------------------------------------- 1 | # CUBLAS implementation progress 2 | 3 | The following sections list the CUBLAS functions shown on the CUBLAS 4 | documentation page: 5 | 6 | http://docs.nvidia.com/cuda/cublas/index.html 7 | 8 | ## Level 1 (13 functions) 9 | 10 | CUBLAS functions: 11 | 12 | * [x] amax 13 | * [x] amin 14 | * [x] asum 15 | * [x] axpy 16 | * [x] copy 17 | * [x] dot, dotc, dotu 18 | * [x] nrm2 19 | * [ ] rot (not implemented in julia blas.jl) 20 | * [ ] rotg (not implemented in julia blas.jl) 21 | * [ ] rotm (not implemented in julia blas.jl) 22 | * [ ] rotmg (not implemented in julia blas.jl) 23 | * [x] scal 24 | * [ ] swap (not implemented in julia blas.jl) 25 | 26 | ## Level 2 27 | 28 | Key: 29 | * `ge`: general 30 | * `gb`: general banded 31 | * `sy`: symmetric 32 | * `sb`: symmetric banded 33 | * `sp`: symmetric packed 34 | * `tr`: triangular 35 | * `tb`: triangular banded 36 | * `tp`: triangular packed 37 | * `he`: hermitian 38 | * `hb`: hermitian banded 39 | * `hp`: hermitian packed 40 | 41 | CUBLAS functions: 42 | 43 | * [x] gbmv (in julia/blas.jl) 44 | * [x] gemv (in julia/blas.jl) 45 | * [x] ger (in julia/blas.jl) 46 | * [x] sbmv (in julia/blas.jl) 47 | * [ ] spmv 48 | * [ ] spr 49 | * [ ] spr2 50 | * [x] symv (in julia/blas.jl) 51 | * [x] syr (in julia/blas.jl) 52 | * [ ] syr2 53 | * [x] tbmv 54 | * [x] tbsv 55 | * [ ] tpmv 56 | * [ ] tpsv 57 | * [x] trmv (in julia/blas.jl) 58 | * [x] trsv (in julia/blas.jl) 59 | * [x] hemv (in julia/blas.jl) 60 | * [x] hbmv 61 | * [ ] hpmv 62 | * [x] her (in julia/blas.jl) 63 | * [x] her2 64 | * [ ] hpr 65 | * [ ] hpr2 66 | 67 | ## Level 3 68 | 69 | CUBLAS functions: 70 | 71 | * [x] gemm (in julia/blas.jl) 72 | * [x] gemmBatched 73 | * [x] symm (in julia/blas.jl) 74 | * [x] syrk (in julia/blas.jl) 75 | * [x] syr2k (in julia/blas.jl) 76 | * [ ] syrkx 77 | * [x] trmm (in julia/blas.jl) 78 | * [x] trsm (in julia/blas.jl) 79 | * [x] trsmBatched 80 | * [x] hemm 81 | * [x] herk (in julia/blas.jl) 82 | * [x] her2k (in julia/blas.jl) 83 | * [ ] herkx 84 | 85 | ## BLAS-like extensions 86 | 87 | * [x] geam 88 | * [x] dgmm 89 | * [x] getrfBatched 90 | * [x] getriBatched 91 | * [x] geqrfBatched 92 | * [x] gelsBatched 93 | * [ ] tpttr 94 | * [ ] trttp 95 | -------------------------------------------------------------------------------- /src/utils.jl: -------------------------------------------------------------------------------- 1 | using Base.Cartesian 2 | 3 | function cudims(n::Integer) 4 | threads = min(n, 256) 5 | ceil(Int, n / threads), threads 6 | end 7 | 8 | cudims(a::AbstractArray) = cudims(length(a)) 9 | 10 | @inline ind2sub_(a::AbstractArray{T,0}, i) where T = () 11 | @inline ind2sub_(a, i) = Tuple(CartesianIndices(a)[i]) 12 | 13 | macro cuindex(A) 14 | quote 15 | A = $(esc(A)) 16 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 17 | i > length(A) && return 18 | ind2sub_(A, i) 19 | end 20 | end 21 | 22 | 23 | @generated function nindex(i::T, ls::NTuple{N,T}) where {N,T} 24 | na = one(i) 25 | quote 26 | Base.@_inline_meta 27 | $(foldr((n, els) -> :(i ≤ 
ls[$n] ? ($n, i) : (i -= ls[$n]; $els)), one(i):i(N); init=:($na, $na)))
28 |   end
29 | end
30 | 
31 | @inline function catindex(dim, I::NTuple{N}, shapes) where N
32 |     @inbounds x, i = nindex(I[dim], getindex.(shapes, dim))
33 |     x, ntuple(n -> n == dim ? i : I[n], Val(N))
34 | end
35 | 
36 | function growdims(dim, x)
37 |     if ndims(x) >= dim
38 |         x
39 |     else
40 |         reshape(x, size.((x,), 1:dim)...)
41 |     end
42 | end
43 | 
44 | function _cat(dim, dest, xs...)
45 |     function kernel(dim, dest, xs)
46 |         I = @cuindex dest
47 |         @inbounds n, I′ = catindex(dim, Int.(I), size.(xs))
48 |         @inbounds dest[I...] = xs[n][I′...]
49 |         return
50 |     end
51 |     xs = growdims.(dim, xs)
52 |     blk, thr = cudims(dest)
53 |     @cuda blocks=blk threads=thr kernel(dim, dest, xs)
54 |     return dest
55 | end
56 | 
57 | function Base.cat_t(dims::Integer, T::Type, x::CuArray, xs::CuArray...)
58 |     catdims = Base.dims2cat(dims)
59 |     shape = Base.cat_shape(catdims, (), size.((x, xs...))...)
60 |     dest = Base.cat_similar(x, T, shape)
61 |     _cat(dims, dest, x, xs...)
62 | end
63 | 
64 | Base.vcat(xs::CuArray...) = cat(xs..., dims=1)
65 | Base.hcat(xs::CuArray...) = cat(xs..., dims=2)
66 | 
67 | 
68 | """
69 |     @sync ex
70 | 
71 | Run expression `ex` and synchronize the GPU afterwards. This is a CPU-friendly
72 | synchronization, i.e. it performs a blocking synchronization without increasing CPU load. As
73 | such, this operation is preferred over implicit synchronization (e.g. when performing a
74 | memory copy) for high-performance applications.
75 | 
76 | It is also useful for timing code that executes asynchronously.
77 | """
78 | macro sync(ex)
79 |     quote
80 |         local e = CuEvent(CUDAdrv.EVENT_BLOCKING_SYNC | CUDAdrv.EVENT_DISABLE_TIMING)
81 |         local ret = $(esc(ex))
82 |         CUDAdrv.record(e)
83 |         CUDAdrv.synchronize(e)
84 |         ret
85 |     end
86 | end
87 | 
--------------------------------------------------------------------------------
/src/broadcast.jl:
--------------------------------------------------------------------------------
1 | import Base.Broadcast: Broadcasted, Extruded, BroadcastStyle, ArrayStyle
2 | 
3 | BroadcastStyle(::Type{<:CuArray}) = ArrayStyle{CuArray}()
4 | 
5 | function Base.similar(bc::Broadcasted{ArrayStyle{CuArray}}, ::Type{T}) where T
6 |     similar(CuArray{T}, axes(bc))
7 | end
8 | 
9 | 
10 | # replace base functions with libdevice alternatives
11 | # TODO: do this with Cassette.jl
12 | 
13 | cufunc(f) = f
14 | cufunc(::Type{T}) where T = (x...) -> T(x...) # broadcasting type ctors isn't GPU compatible
15 | 
16 | Broadcast.broadcasted(::ArrayStyle{CuArray}, f, args...) =
17 |     Broadcasted{ArrayStyle{CuArray}}(cufunc(f), args, nothing)
18 | 
19 | libdevice = :[
20 |     cos, cospi, sin, sinpi, tan, acos, asin, atan,
21 |     cosh, sinh, tanh, acosh, asinh, atanh,
22 |     log, log10, log1p, log2, logb, ilogb,
23 |     exp, exp2, exp10, expm1, ldexp,
24 |     erf, erfinv, erfc, erfcinv, erfcx,
25 |     brev, clz, ffs, byte_perm, popc,
26 |     isfinite, isinf, isnan, nearbyint,
27 |     nextafter, signbit, copysign, abs,
28 |     sqrt, rsqrt, cbrt, rcbrt, pow,
29 |     ceil, floor, saturate,
30 |     lgamma, tgamma,
31 |     j0, j1, jn, y0, y1, yn,
32 |     normcdf, normcdfinv, hypot,
33 |     fma, sad, dim, mul24, mul64hi, hadd, rhadd, scalbn].args
34 | 
35 | for f in libdevice
36 |     isdefined(Base, f) || continue
37 |     @eval cufunc(::typeof(Base.$f)) = CUDAnative.$f
38 | end
39 | 
40 | using MacroTools
41 | 
42 | const _cufuncs = copy(libdevice)
43 | cufuncs() = (global _cufuncs; _cufuncs)
44 | 
45 | function replace_device(ex)
46 |     global _cufuncs
47 |     MacroTools.postwalk(ex) do x
48 |         x in _cufuncs ? 
:(CuArrays.cufunc($x)) : x 49 | end 50 | end 51 | 52 | macro cufunc(ex) 53 | global _cufuncs 54 | def = MacroTools.splitdef(ex) 55 | f = def[:name] 56 | def[:name] = Symbol(:cu, f) 57 | def[:body] = replace_device(def[:body]) 58 | push!(_cufuncs, f) 59 | quote 60 | $(esc(MacroTools.combinedef(def))) 61 | CuArrays.cufunc(::typeof($(esc(f)))) = $(esc(def[:name])) 62 | end 63 | end 64 | 65 | # ForwardDiff Integration 66 | using ForwardDiff: Dual, value, partials, unary_dual_definition 67 | using DiffRules 68 | 69 | for f in libdevice 70 | if haskey(DiffRules.DEFINED_DIFFRULES, (:Base,f,1)) 71 | f == :tanh && continue 72 | diffrule = DiffRules.DEFINED_DIFFRULES[(:Base,f,1)] 73 | DiffRules.DEFINED_DIFFRULES[(:CUDAnative,f,1)] = 74 | (args...) -> replace_device(diffrule(args...)) 75 | eval(unary_dual_definition(:CUDAnative, f)) 76 | end 77 | end 78 | 79 | DiffRules.DEFINED_DIFFRULES[(:CUDAnative, :tanh, 1)] = x -> 80 | replace_device(:(1-tanh(x)^2)) 81 | eval(unary_dual_definition(:CUDAnative, :tanh)) 82 | -------------------------------------------------------------------------------- /deps/build.jl: -------------------------------------------------------------------------------- 1 | using CUDAapi 2 | using CUDAdrv 3 | using CUDAnative 4 | 5 | 6 | ## auxiliary routines 7 | 8 | status = 0 9 | function build_warning(reason) 10 | println("$reason.") 11 | global status 12 | status = 1 13 | # NOTE: it's annoying that we have to `exit(1)`, but otherwise messages are hidden 14 | end 15 | 16 | function build_error(reason) 17 | println(reason) 18 | exit(1) 19 | end 20 | 21 | 22 | ## main 23 | 24 | config_path = joinpath(@__DIR__, "ext.jl") 25 | const previous_config_path = config_path * ".bak" 26 | 27 | function write_ext(config) 28 | open(config_path, "w") do io 29 | println(io, "# autogenerated file, do not edit") 30 | for (key,val) in config 31 | println(io, "const $key = $(repr(val))") 32 | end 33 | end 34 | end 35 | 36 | function main() 37 | ispath(config_path) && mv(config_path, previous_config_path; force=true) 38 | config = Dict{Symbol,Any}(:configured => false) 39 | write_ext(config) 40 | 41 | 42 | ## discover stuff 43 | 44 | CUDAdrv.configured || build_error("Dependent package CUDAdrv.jl has not been built successfully") 45 | CUDAnative.configured || build_error("Dependent package CUDAnative.jl has not been built successfully") 46 | 47 | toolkit = find_toolkit() 48 | 49 | for name in ("cublas", "cusparse", "cusolver", "cufft", "curand", "cudnn") 50 | lib = Symbol("lib$name") 51 | config[lib] = find_cuda_library(name, toolkit) 52 | if config[lib] == nothing 53 | build_warning("Could not find library '$name'") 54 | end 55 | end 56 | 57 | 58 | ## (re)generate ext.jl 59 | 60 | function globals(mod) 61 | all_names = names(mod, all=true) 62 | filter(name-> !any(name .== [nameof(mod), Symbol("#eval"), :eval]), all_names) 63 | end 64 | 65 | if isfile(previous_config_path) 66 | @eval module Previous; include($previous_config_path); end 67 | previous_config = Dict{Symbol,Any}(name => getfield(Previous, name) 68 | for name in globals(Previous)) 69 | 70 | if config == previous_config 71 | mv(previous_config_path, config_path; force=true) 72 | return 73 | end 74 | end 75 | 76 | config[:configured] = true 77 | write_ext(config) 78 | 79 | if status != 0 80 | # we got here, so the status is non-fatal 81 | build_error(""" 82 | 83 | CuArrays.jl has been built successfully, but there were warnings. 
84 | Some functionality may be unavailable.""") 85 | end 86 | end 87 | 88 | main() 89 | -------------------------------------------------------------------------------- /src/gpuarray_interface.jl: -------------------------------------------------------------------------------- 1 | import GPUArrays 2 | 3 | struct CuArrayBackend <: GPUArrays.GPUBackend end 4 | GPUArrays.backend(::Type{<:CuArray}) = CuArrayBackend() 5 | 6 | 7 | #Abstract GPU interface 8 | struct CuKernelState end 9 | 10 | @inline function GPUArrays.LocalMemory(::CuKernelState, ::Type{T}, ::Val{N}, ::Val{id}) where {T, N, id} 11 | ptr = CUDAnative._shmem(Val(id), T, Val(N)) 12 | CuDeviceArray(N, DevicePtr{T, CUDAnative.AS.Shared}(ptr)) 13 | end 14 | 15 | GPUArrays.AbstractDeviceArray(A::CUDAnative.CuDeviceArray, shape) = CUDAnative.CuDeviceArray(shape, pointer(A)) 16 | 17 | @inline GPUArrays.synchronize_threads(::CuKernelState) = CUDAnative.sync_threads() 18 | 19 | GPUArrays.blas_module(::CuArray) = CuArrays.CUBLAS 20 | GPUArrays.blasbuffer(x::CuArray) = x 21 | 22 | """ 23 | Blocks until all operations are finished on `A` 24 | """ 25 | GPUArrays.synchronize(A::CuArray) = 26 | CUDAdrv.synchronize() 27 | 28 | for (i, sym) in enumerate((:x, :y, :z)) 29 | for (f, fcu) in ( 30 | (:blockidx, :blockIdx), 31 | (:blockdim, :blockDim), 32 | (:threadidx, :threadIdx), 33 | (:griddim, :gridDim) 34 | ) 35 | fname = Symbol(string(f, '_', sym)) 36 | cufun = Symbol(string(fcu, '_', sym)) 37 | @eval GPUArrays.$fname(::CuKernelState) = CUDAnative.$cufun() 38 | end 39 | end 40 | 41 | # devices() = CUDAdrv.devices() 42 | GPUArrays.device(A::CuArray) = CUDAdrv.device(CUDAdrv.CuCurrentContext()) 43 | GPUArrays.is_gpu(dev::CUDAdrv.CuDevice) = true 44 | GPUArrays.name(dev::CUDAdrv.CuDevice) = string("CU ", CUDAdrv.name(dev)) 45 | GPUArrays.threads(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK) 46 | 47 | GPUArrays.blocks(dev::CUDAdrv.CuDevice) = 48 | (CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 49 | CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 50 | CUDAdrv.attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Z)) 51 | 52 | GPUArrays.free_global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.Mem.info()[1] 53 | GPUArrays.global_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.totalmem(dev) 54 | GPUArrays.local_memory(dev::CUDAdrv.CuDevice) = CUDAdrv.attribute(dev, CUDAdrv.TOTAL_CONSTANT_MEMORY) 55 | 56 | function GPUArrays._gpu_call(::CuArrayBackend, f, A, args::Tuple, 57 | blocks_threads::Tuple{T, T}) where {N, T <: NTuple{N, Integer}} 58 | blk, thr = blocks_threads 59 | @cuda blocks=blk threads=thr f(CuKernelState(), args...) 
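    # the block/thread configuration is supplied by GPUArrays' launch heuristics;
    # passing a CuKernelState lets generic GPUArrays kernels reach the index and
    # shared-memory functions defined above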
60 | end 61 | 62 | # Save reinterpret and reshape implementation use this in GPUArrays 63 | GPUArrays.unsafe_reinterpret(::Type{T}, A::CuArray, size::NTuple{N, Integer}) where {T, N} = 64 | CuArray{T, N}(A.buf, size) 65 | -------------------------------------------------------------------------------- /src/fft/error.jl: -------------------------------------------------------------------------------- 1 | export CUFFTError 2 | 3 | struct CUFFTError <: Exception 4 | code::cufftStatus_t 5 | msg::AbstractString 6 | end 7 | Base.show(io::IO, err::CUFFTError) = print(io, "CUFFTError(code $(err.code), $(err.msg))") 8 | 9 | function CUFFTError(code::cufftStatus_t) 10 | msg = status_message(code) 11 | return CUFFTError(code, msg) 12 | end 13 | 14 | function status_message(status) 15 | if status == CUFFT_STATUS_SUCCESS 16 | return "the operation completed successfully" 17 | elseif status == CUFFT_STATUS_INVALID_PLAN 18 | return "cuFFT was passed an invalid plan handle" 19 | elseif status == CUFFT_STATUS_ALLOC_FAILED 20 | return "cuFFT failed to allocate GPU or CPU memory" 21 | elseif status == CUFFT_STATUS_INVALID_TYPE 22 | return "cuFFT invalid type " # No longer used 23 | elseif status == CUFFT_STATUS_INVALID_VALUE 24 | return "User specified an invalid pointer or parameter" 25 | elseif status == CUFFT_STATUS_INTERNAL_ERROR 26 | return "Driver or internal cuFFT library error" 27 | elseif status == CUFFT_STATUS_EXEC_FAILED 28 | return "Failed to execute an FFT on the GPU" 29 | elseif status == CUFFT_STATUS_SETUP_FAILED 30 | return "The cuFFT library failed to initialize" 31 | elseif status == CUFFT_STATUS_INVALID_SIZE 32 | return "User specified an invalid transform size" 33 | elseif status == CUFFT_STATUS_UNALIGNED_DATA 34 | return "cuFFT unaligned data" # No longer used 35 | elseif status == CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST 36 | return "Missing parameters in call" 37 | elseif status == CUFFT_STATUS_INVALID_DEVICE 38 | return "Execution of a plan was on different GPU than plan creation" 39 | elseif status == CUFFT_STATUS_PARSE_ERROR 40 | return "Internal plan database error" 41 | elseif status == CUFFT_STATUS_NO_WORKSPACE 42 | return "No workspace has been provided prior to plan execution" 43 | elseif status == CUFFT_STATUS_NOT_IMPLEMENTED 44 | return "Function does not implement functionality for parameters given." 45 | elseif status == CUFFT_STATUS_LICENSE_ERROR 46 | return "cuFFT license error" # Used in previous versions. 47 | elseif status == CUFFT_STATUS_NOT_SUPPORTED 48 | return "Operation is not supported for parameters given." 
49 | else 50 | return "unknown status" 51 | end 52 | end 53 | 54 | macro check(fft_func) 55 | quote 56 | local err::cufftStatus_t 57 | err = $(esc(fft_func::Expr)) 58 | if err != CUFFT_STATUS_SUCCESS 59 | throw(CUFFTError(err)) 60 | end 61 | err 62 | end 63 | end 64 | -------------------------------------------------------------------------------- /src/rand/libcurand_types.jl: -------------------------------------------------------------------------------- 1 | const curandGenerator_t = Ptr{Cvoid} 2 | 3 | mutable struct RNG <: Random.AbstractRNG 4 | ptr::curandGenerator_t 5 | typ::Int 6 | end 7 | 8 | Base.unsafe_convert(::Type{curandGenerator_t}, rng::RNG) = rng.ptr 9 | 10 | 11 | const curandDiscreteDistribution_t = Ptr{Cvoid} 12 | 13 | mutable struct DiscreteDistribution 14 | ptr::curandDiscreteDistribution_t 15 | end 16 | 17 | Base.unsafe_convert(::Type{curandDiscreteDistribution_t}, dist::DiscreteDistribution) = dist.ptr 18 | 19 | 20 | # CURAND status codes 21 | const curandStatus_t = UInt32 22 | const CURAND_STATUS_SUCCESS = 0 23 | const CURAND_STATUS_VERSION_MISMATCH = 100 24 | const CURAND_STATUS_NOT_INITIALIZED = 101 25 | const CURAND_STATUS_ALLOCATION_FAILED = 102 26 | const CURAND_STATUS_TYPE_ERROR = 103 27 | const CURAND_STATUS_OUT_OF_RANGE = 104 28 | const CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105 29 | const CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106 30 | const CURAND_STATUS_LAUNCH_FAILURE = 201 31 | const CURAND_STATUS_PREEXISTING_FAILURE = 202 32 | const CURAND_STATUS_INITIALIZATION_FAILED = 203 33 | const CURAND_STATUS_ARCH_MISMATCH = 204 34 | const CURAND_STATUS_INTERNAL_ERROR = 999 35 | 36 | # CURAND RNG types (curandRngType) 37 | const CURAND_RNG_TEST = 0 38 | const CURAND_RNG_PSEUDO_DEFAULT = 100 39 | const CURAND_RNG_PSEUDO_XORWOW = 101 40 | const CURAND_RNG_PSEUDO_MRG32K3A = 121 41 | const CURAND_RNG_PSEUDO_MTGP32 = 141 42 | const CURAND_RNG_PSEUDO_MT19937 = 142 43 | const CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161 44 | const CURAND_RNG_QUASI_DEFAULT = 200 45 | const CURAND_RNG_QUASI_SOBOL32 = 201 46 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202 47 | const CURAND_RNG_QUASI_SOBOL64 = 203 48 | const CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 49 | 50 | # CURAND ordering of results in memory 51 | const CURAND_ORDERING_PSEUDO_BEST = 100 52 | const CURAND_ORDERING_PSEUDO_DEFAULT = 101 53 | const CURAND_ORDERING_PSEUDO_SEEDED = 102 54 | const CURAND_ORDERING_QUASI_DEFAULT = 201 55 | 56 | # CURAND choice of direction vector set 57 | const CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101 58 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102 59 | const CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103 60 | const CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 61 | 62 | # CURAND method 63 | const CURAND_CHOOSE_BEST = 0 64 | const CURAND_ITR = 1 65 | const CURAND_KNUTH = 2 66 | const CURAND_HITR = 3 67 | const CURAND_M1 = 4 68 | const CURAND_M2 = 5 69 | const CURAND_BINARY_SEARCH = 6 70 | const CURAND_DISCRETE_GAUSS = 7 71 | const CURAND_REJECTION = 8 72 | const CURAND_DEVICE_API = 9 73 | const CURAND_FAST_REJECTION = 10 74 | const CURAND_3RD = 11 75 | const CURAND_DEFINITION = 12 76 | const CURAND_POISSON = 13 77 | -------------------------------------------------------------------------------- /src/solver/libcusolver_types.jl: -------------------------------------------------------------------------------- 1 | import ..CUBLAS: cublasfill, cublasop, cublasside, cublasFillMode_t, cublasOperation_t, cublasSideMode_t 2 | 3 | #enum cusolverStatus_t 4 | #error messages from CUSOLVER 5 | 6 | 
const cusolverStatus_t = UInt32 7 | const CUSOLVER_STATUS_SUCCESS = 0 8 | const CUSOLVER_STATUS_NOT_INITIALIZED = 1 9 | const CUSOLVER_STATUS_ALLOC_FAILED = 2 10 | const CUSOLVER_STATUS_INVALID_VALUE = 3 11 | const CUSOLVER_STATUS_ARCH_MISMATCH = 4 12 | const CUSOLVER_STATUS_EXECUTION_FAILED = 5 13 | const CUSOLVER_STATUS_INTERNAL_ERROR = 6 14 | const CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 7 15 | 16 | const csrqrInfo_t = Ptr{Nothing} 17 | const gesvdjInfo_t = Ptr{Cvoid} 18 | const syevjInfo_t = Ptr{Cvoid} 19 | 20 | const cusolverEigMode_t = UInt32 21 | const CUSOLVER_EIG_MODE_NOVECTOR = 0 22 | const CUSOLVER_EIG_MODE_VECTOR = 1 23 | 24 | const cusolverEigType_t = UInt32 25 | const CUSOLVER_EIG_TYPE_1 = 1 26 | const CUSOLVER_EIG_TYPE_2 = 2 27 | const CUSOLVER_EIG_TYPE_3 = 3 28 | 29 | # refactorization types 30 | 31 | const cusolverRfNumericBoostReport_t = UInt32 32 | const CUSOLVER_NUMERIC_BOOST_NOT_USED = 0 33 | const CUSOLVER_NUMERIC_BOOST_USED = 1 34 | 35 | const cusolverRfResetValuesFastMode_t = UInt32 36 | const CUSOLVER_RESET_VALUES_FAST_MODE_OFF = 0 37 | const CUSOLVER_RESET_VALUES_FAST_MODE_ON = 1 38 | 39 | const cusolverRfFactorization_t = UInt32 40 | const CUSOLVER_FACTORIZATION_ALG0 = 0 41 | const CUSOLVER_FACTORIZATION_ALG1 = 1 42 | const CUSOLVER_FACTORIZATION_ALG2 = 2 43 | 44 | const cusolverRfTriangularSolve_t = UInt32 45 | const CUSOLVER_TRIANGULAR_SOLVE_ALG0 = 0 46 | const CUSOLVER_TRIANGULAR_SOLVE_ALG1 = 1 47 | const CUSOLVER_TRIANGULAR_SOLVE_ALG2 = 2 48 | const CUSOLVER_TRIANGULAR_SOLVE_ALG3 = 3 49 | 50 | const cusolverRfUnitDiagonal_t = UInt32 51 | const CUSOLVER_UNIT_DIAGONAL_STORED_L = 0 52 | const CUSOLVER_UNIT_DIAGONAL_STORED_U = 1 53 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_L = 2 54 | const CUSOLVER_UNIT_DIAGONAL_ASSUMED_U = 3 55 | 56 | const cusolverDnContext = Nothing 57 | const cusolverDnHandle_t = Ptr{cusolverDnContext} 58 | const cusolverSpContext = Nothing 59 | const cusolverSpHandle_t = Ptr{cusolverSpContext} 60 | const cusolverRfContext = Nothing 61 | const cusolverRfHandle_t = Ptr{cusolverRfContext} 62 | 63 | #complex numbers 64 | 65 | const cuComplex = Complex{Float32} 66 | const cuDoubleComplex = Complex{Float64} 67 | 68 | const CusolverFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 69 | const CusolverReal = Union{Float64,Float32} 70 | const CusolverComplex = Union{ComplexF64,ComplexF32} 71 | -------------------------------------------------------------------------------- /src/fft/fft.jl: -------------------------------------------------------------------------------- 1 | # K is a flag for forward/backward 2 | # also used as an alias for r2c/c2r 3 | 4 | abstract type CuFFTPlan{T<:cufftNumber, K, inplace} <: Plan{T} end 5 | 6 | mutable struct cCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 7 | plan::cufftHandle_t 8 | sz::NTuple{N,Int} # Julia size of input array 9 | osz::NTuple{N,Int} # Julia size of output array 10 | xtype::Int 11 | region::Any 12 | pinv::ScaledPlan # required by AbstractFFT API 13 | 14 | function cCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 15 | sizey::Tuple, region, xtype::Integer 16 | ) where {T<:cufftNumber,K,inplace,N} 17 | # maybe enforce consistency of sizey 18 | p = new(plan, size(X), sizey, xtype, region) 19 | finalizer(destroy_plan, p) 20 | p 21 | end 22 | end 23 | 24 | cCuFFTPlan(plan,X,region,xtype::Integer) = cCuFFTPlan(plan,X,size(X),region,xtype) 25 | 26 | mutable struct rCuFFTPlan{T<:cufftNumber,K,inplace,N} <: CuFFTPlan{T,K,inplace} 27 | plan::cufftHandle_t 28 | 
sz::NTuple{N,Int} # Julia size of input array 29 | osz::NTuple{N,Int} # Julia size of output array 30 | xtype::Int 31 | region::Any 32 | pinv::ScaledPlan # required by AbstractFFT API 33 | 34 | function rCuFFTPlan{T,K,inplace,N}(plan::cufftHandle_t, X::CuArray{T,N}, 35 | sizey::Tuple, region, xtype::Integer 36 | ) where {T<:cufftNumber,K,inplace,N} 37 | # maybe enforce consistency of sizey 38 | p = new(plan, size(X), sizey, xtype, region) 39 | finalizer(destroy_plan, p) 40 | p 41 | end 42 | end 43 | 44 | rCuFFTPlan(plan,X,region,xtype::Integer) = rCuFFTPlan(plan,X,size(X),region,xtype) 45 | 46 | const xtypenames = Dict{cufftType,String}(CUFFT_R2C => "real-to-complex", 47 | CUFFT_C2R => "complex-to-real", 48 | CUFFT_C2C => "complex", 49 | CUFFT_D2Z => "d.p. real-to-complex", 50 | CUFFT_Z2D => "d.p. complex-to-real", 51 | CUFFT_Z2Z => "d.p. complex") 52 | 53 | function showfftdims(io, sz, T) 54 | if isempty(sz) 55 | print(io,"0-dimensional") 56 | elseif length(sz) == 1 57 | print(io, sz[1], "-element") 58 | else 59 | print(io, join(sz, "×")) 60 | end 61 | print(io, " CuArray of ", T) 62 | end 63 | 64 | function show(io::IO, p::CuFFTPlan{T,K,inplace}) where {T,K,inplace} 65 | print(io, inplace ? "CUFFT in-place " : "CUFFT ", 66 | xtypenames[p.xtype], 67 | K == CUFFT_FORWARD ? " forward" : " backward", 68 | " plan for ") 69 | showfftdims(io, p.sz, T) 70 | end 71 | -------------------------------------------------------------------------------- /src/fft/libcufft_types.jl: -------------------------------------------------------------------------------- 1 | # CUFFT API function return values 2 | const cufftStatus_t = UInt32 3 | const CUFFT_STATUS_SUCCESS = 0 # The cuFFT operation was successful 4 | const CUFFT_STATUS_INVALID_PLAN = 1 # cuFFT was passed an invalid plan handle 5 | const CUFFT_STATUS_ALLOC_FAILED = 2 # cuFFT failed to allocate GPU or CPU memory 6 | const CUFFT_STATUS_INVALID_TYPE = 3 # No longer used 7 | const CUFFT_STATUS_INVALID_VALUE = 4 # User specified an invalid pointer or parameter 8 | const CUFFT_STATUS_INTERNAL_ERROR = 5 # Driver or internal cuFFT library error 9 | const CUFFT_STATUS_EXEC_FAILED = 6 # Failed to execute an FFT on the GPU 10 | const CUFFT_STATUS_SETUP_FAILED = 7 # The cuFFT library failed to initialize 11 | const CUFFT_STATUS_INVALID_SIZE = 8 # User specified an invalid transform size 12 | const CUFFT_STATUS_UNALIGNED_DATA = 9 # No longer used 13 | const CUFFT_STATUS_INCOMPLETE_PARAMETER_LIST = 10 # Missing parameters in call 14 | const CUFFT_STATUS_INVALID_DEVICE = 11 # Execution of a plan was on different GPU than plan creation 15 | const CUFFT_STATUS_PARSE_ERROR = 12 # Internal plan database error 16 | const CUFFT_STATUS_NO_WORKSPACE = 13 # No workspace has been provided prior to plan execution 17 | const CUFFT_STATUS_NOT_IMPLEMENTED = 14 # Function does not implement functionality for parameters given. 18 | const CUFFT_STATUS_LICENSE_ERROR = 15 # Used in previous versions. 19 | const CUFFT_STATUS_NOT_SUPPORTED = 16 # Operation is not supported for parameters given. 
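# Editorial sketch, not part of the original file: the status codes above are
# what `@check` in src/fft/error.jl surfaces through `CUFFTError`; a lookup
# table like this hypothetical one recovers the descriptions recorded in the
# comments above (only a subset is shown).
const _cufft_status_messages = Dict{cufftStatus_t,String}(
    CUFFT_STATUS_SUCCESS        => "the cuFFT operation was successful",
    CUFFT_STATUS_INVALID_PLAN   => "cuFFT was passed an invalid plan handle",
    CUFFT_STATUS_ALLOC_FAILED   => "cuFFT failed to allocate GPU or CPU memory",
    CUFFT_STATUS_INVALID_VALUE  => "user specified an invalid pointer or parameter",
    CUFFT_STATUS_INTERNAL_ERROR => "driver or internal cuFFT library error",
    CUFFT_STATUS_EXEC_FAILED    => "failed to execute an FFT on the GPU",
    CUFFT_STATUS_INVALID_SIZE   => "user specified an invalid transform size")

statusmessage(status::cufftStatus_t) =
    get(_cufft_status_messages, status, "unknown status")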
20 | 21 | 22 | const cufftReal = Float32 23 | const cufftDoubleReal = Float64 24 | 25 | const cufftComplex = ComplexF32 26 | const cufftDoubleComplex = ComplexF64 27 | 28 | # CUFFT transform directions 29 | const CUFFT_FORWARD = -1 # Forward FFT 30 | const CUFFT_INVERSE = 1 # Inverse FFT 31 | 32 | # CUFFT supports the following transform types 33 | const cufftType = Cint 34 | const CUFFT_R2C = 0x2a # Real to Complex 35 | const CUFFT_C2R = 0x2c # Complex to Real 36 | const CUFFT_C2C = 0x29 # Complex to Complex 37 | const CUFFT_D2Z = 0x6a # Double to Double-Complex 38 | const CUFFT_Z2D = 0x6c # Double-Complex to Double 39 | const CUFFT_Z2Z = 0x69 # Double-Complex to Double-Complex 40 | 41 | const cufftCompatibility = Cint 42 | const CUFFT_COMPATIBILITY_NATIVE = 0x00 43 | const CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 44 | const CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC = 0x02 45 | const CUFFT_COMPATIBILITY_FFTW_ALL = 0x03 46 | 47 | const cufftHandle_t = Cint 48 | 49 | const cufftNumber = Union{cufftDoubleReal,cufftReal,cufftDoubleComplex,cufftComplex} 50 | # note trailing s to deconflict w/ header file 51 | const cufftReals = Union{cufftDoubleReal,cufftReal} 52 | const cufftComplexes = Union{cufftDoubleComplex,cufftComplex} 53 | const cufftDouble = Union{cufftDoubleReal,cufftDoubleComplex} 54 | const cufftSingle = Union{cufftReal,cufftComplex} 55 | const cufftTypeDouble = Union{Type{cufftDoubleReal},Type{cufftDoubleComplex}} 56 | const cufftTypeSingle = Union{Type{cufftReal},Type{cufftComplex}} 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CuArrays 2 | 3 | [![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] 4 | 5 | [codecov-img]: https://codecov.io/gh/JuliaGPU/CuArrays.jl/branch/master/graph/badge.svg 6 | [codecov-url]: https://codecov.io/gh/JuliaGPU/CuArrays.jl 7 | 8 | [docs-latest-img]: https://img.shields.io/badge/docs-latest-blue.svg 9 | [docs-latest-url]: https://juliagpu.gitlab.io/CuArrays.jl/ 10 | 11 | CuArrays provides a fully-functional GPU array, which can give significant speedups over 12 | normal arrays without code changes. CuArrays are implemented fully in Julia, making the 13 | implementation [elegant and extremely 14 | generic](http://mikeinnes.github.io/2017/08/24/cudanative.html). 15 | 16 | Documentation for this package is sparse, and for many of the array operations you should 17 | refer to the official Julia documentation. The following resources can be useful to get a 18 | better understanding of the characteristics and performance trade offs that come with GPU 19 | arrays: 20 | 21 | - Introductory tutorial on [GPU programming with Julia](https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/) 22 | - Slide deck on [effectively using GPUs with Julia](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/) 23 | 24 | ## Installation 25 | 26 | CuArrays should work **out-of-the-box** on Julia 1.0. You only need to have a 27 | proper set-up of CUDA, meaning the rest of the Julia CUDA stack should work 28 | (notably CUDAapi.jl, CUDAdrv.jl and CUDAnative.jl). If you encounter any issues 29 | with CuArrays.jl, please make sure those other packages are working as expected. 30 | 31 | Some parts of CuArrays.jl depend on **optional libraries**, such as 32 | [cuDNN](https://developer.nvidia.com/cudnn). The build process should notify 33 | about missing dependencies, i.e. 
inspect the output of `Pkg.build("CuArrays")` 34 | to see whether your installation is complete. 35 | 36 | 37 | ## Features 38 | 39 | ```julia 40 | xs = cu(rand(5, 5)) 41 | ys = cu[1, 2, 3] 42 | xs_cpu = collect(xs) 43 | ``` 44 | 45 | Because `CuArray` is an `AbstractArray`, it doesn't have much of a learning curve; just use your favourite array ops as usual. The following are supported (on arbitrary numbers of arguments, dimensions etc): 46 | 47 | * Conversions and `copy!` with CPU arrays 48 | * General indexing (`xs[1:2, 5, :]`) 49 | * `permutedims` 50 | * Concatenation (`vcat(x, y)`, `cat(3, xs, ys, zs)`) 51 | * `map`, fused broadcast (`zs .= xs.^2 .+ ys .* 2`) 52 | * `fill!(xs, 0)` 53 | * Reduction over dimensions (`reducedim(+, xs, 3)`, `sum(x -> x^2, xs, 1)` etc) 54 | * Reduction to scalar (`reduce(*, 1, xs)`, `sum(xs)` etc) 55 | * Various BLAS operations (matrix\*matrix, matrix\*vector) 56 | * FFTs, using the AbstractFFTs API 57 | 58 | We welcome issues or PRs for functionality not on this list. 59 | 60 | Note that some operations not on this list will work, but be slow, due to Base's generic 61 | implementations. This is intentional, to enable a "make it work, then make it fast" 62 | workflow. When you're ready you can disable slow fallback methods: 63 | 64 | ```julia 65 | julia> CuArrays.allowscalar(false) 66 | julia> xs[5] 67 | ERROR: getindex is disabled 68 | ``` 69 | -------------------------------------------------------------------------------- /src/fft/libcufft.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUFFT library 2 | 3 | cufftGetVersion() = ccall((:cufftGetVersion,libcufft), Cint, ()) 4 | 5 | function cufftGetProperty(property::CUDAapi.libraryPropertyType) 6 | value_ref = Ref{Cint}() 7 | @check ccall((:cufftGetProperty, libcufft), cufftStatus_t, 8 | (Cint, Ptr{Cint}), 9 | property, value_ref) 10 | value_ref[] 11 | end 12 | 13 | cufftDestroy(plan) = ccall((:cufftDestroy,libcufft), Nothing, (cufftHandle_t,), plan) 14 | 15 | function cufftPlan1d(plan, nx, type, batch) 16 | @check ccall((:cufftPlan1d,libcufft),cufftStatus_t, 17 | (Ptr{cufftHandle_t}, Cint, cufftType, Cint), 18 | plan, nx, type, batch) 19 | end 20 | 21 | function cufftPlan2d(plan, nx, ny, type) 22 | @check ccall((:cufftPlan2d,libcufft),cufftStatus_t, 23 | (Ptr{cufftHandle_t}, Cint, Cint, cufftType), 24 | plan, nx, ny, type) 25 | end 26 | 27 | function cufftPlan3d(plan, nx, ny, nz, type) 28 | @check ccall((:cufftPlan3d,libcufft),cufftStatus_t, 29 | (Ptr{cufftHandle_t}, Cint, Cint, Cint, cufftType), 30 | plan, nx, ny, nz, type) 31 | end 32 | 33 | function cufftPlanMany(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch) 34 | @check ccall((:cufftPlanMany,libcufft),cufftStatus_t, 35 | (Ptr{cufftHandle_t}, Cint, Ptr{Cint}, 36 | Ptr{Cint}, Cint, Cint, 37 | Ptr{Cint}, Cint, Cint, 38 | cufftType, Cint), 39 | plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch) 40 | end 41 | 42 | function cufftExecC2C(plan, idata, odata, direction) 43 | @check ccall((:cufftExecC2C,libcufft), cufftStatus_t, 44 | (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}, Cint), 45 | plan, idata, odata, direction) 46 | end 47 | 48 | function cufftExecC2R(plan, idata, odata) 49 | @check ccall((:cufftExecC2R,libcufft), cufftStatus_t, 50 | (cufftHandle_t, CuPtr{cufftComplex}, CuPtr{cufftComplex}), 51 | plan, idata, odata) 52 | end 53 | 54 | function cufftExecR2C(plan, idata, odata) 55 | @check 
ccall((:cufftExecR2C,libcufft), cufftStatus_t, 56 | (cufftHandle_t, CuPtr{cufftReal}, CuPtr{cufftComplex}), 57 | plan, idata, odata) 58 | end 59 | 60 | function cufftExecZ2Z(plan, idata, odata, direction) 61 | @check ccall((:cufftExecZ2Z,libcufft), cufftStatus_t, 62 | (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex}, 63 | Cint), 64 | plan, idata, odata, direction) 65 | end 66 | 67 | function cufftExecZ2D(plan, idata, odata) 68 | @check ccall((:cufftExecZ2D,libcufft), cufftStatus_t, 69 | (cufftHandle_t, CuPtr{cufftDoubleComplex}, CuPtr{cufftDoubleComplex}), 70 | plan, idata, odata) 71 | end 72 | 73 | function cufftExecD2Z(plan, idata, odata) 74 | @check ccall((:cufftExecD2Z,libcufft), cufftStatus_t, 75 | (cufftHandle_t, CuPtr{cufftDoubleReal}, CuPtr{cufftDoubleComplex}), 76 | plan, idata, odata) 77 | end 78 | -------------------------------------------------------------------------------- /src/sparse/highlevel.jl: -------------------------------------------------------------------------------- 1 | (\)(A::AbstractTriangular{<:CuSparseMatrix},B::CuMatrix) = sm('N',A,B,'O') 2 | (\)(transA::Transpose{<:Any, <:AbstractTriangular{<:CuSparseMatrix}}, B::CuMatrix) = sm('T',parent(transA),B,'O') 3 | (\)(adjA::Adjoint{<:Any, <:AbstractTriangular{<:CuSparseMatrix}},B::CuMatrix) = sm('C',parent(adjA),B,'O') 4 | 5 | mul!(C::CuVector{T},A::CuSparseMatrix,B::CuVector) where {T} = mv!('N',one(T),A,B,zero(T),C,'O') 6 | mul!(C::CuVector{T},transA::Transpose{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O') 7 | mul!(C::CuVector{T},adjA::Adjoint{<:Any,<:CuSparseMatrix},B::CuVector) where {T} = mv!('C',one(T),parent(transA),B,zero(T),C,'O') 8 | mul!(C::CuVector{T},A::HermOrSym{T,<:CuSparseMatrix{T}},B::CuVector{T}) where T = mv!('N',one(T),A,B,zero(T),C,'O') 9 | mul!(C::CuVector{T},transA::Transpose{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('T',one(T),parent(transA),B,zero(T),C,'O') 10 | mul!(C::CuVector{T},adjA::Adjoint{<:Any, <:HermOrSym{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where {T} = mv!('C',one(T),parent(adjA),B,zero(T),C,'O') 11 | 12 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},B::CuMatrix{T}) where {T} = mm2!('N','N',one(T),A,B,zero(T),C,'O') 13 | mul!(C::CuMatrix{T},A::CuSparseMatrix{T},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('N','T',one(T),A,parent(transB),zero(T),C,'O') 14 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('T','N',one(T),parent(transA),B,zero(T),C,'O') 15 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:CuSparseMatrix{T}},transB::Transpose{<:Any, CuMatrix{T}}) where {T} = mm2!('T','T',one(T),parent(transA),parent(transB),zero(T),C,'O') 16 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:CuSparseMatrix{T}},B::CuMatrix{T}) where {T} = mm2!('C','N',one(T),parent(adjA),B,zero(T),C,'O') 17 | 18 | mul!(C::CuMatrix{T},A::HermOrSym{<:Number, <:CuSparseMatrix},B::CuMatrix) where {T} = mm!('N',one(T),A,B,zero(T),C,'O') 19 | mul!(C::CuMatrix{T},transA::Transpose{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('T',one(T),parent(transA),B,zero(T),C,'O') 20 | mul!(C::CuMatrix{T},adjA::Adjoint{<:Any, <:HermOrSym{<:Number, <:CuSparseMatrix}},B::CuMatrix) where {T} = mm!('C',one(T),parent(adjA),B,zero(T),C,'O') 21 | 22 | (\)(A::AbstractTriangular{<:CuSparseMatrix},B::CuVector) = sv2('N',A,B,'O') 23 | (\)(transA::Transpose{<:Any, <:AbstractTriangular{<:CuSparseMatrix}},B::CuVector) = sv2('T',parent(transA),B,'O') 
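# Editorial note, not part of the original file: throughout this file the first
# Char argument selects the operation ('N' none, 'T' transpose, 'C' conjugate
# transpose) and the trailing 'O' selects one-based indexing, matching the
# CUSPARSE wrappers. A hedged usage sketch, kept as a comment because this sits
# between method definitions:
#
#   A = CuSparseMatrixCSR(sparse(tril(rand(4, 4) .+ 4)))
#   b = CuArray(rand(4))
#   x = LowerTriangular(A) \ b    # intended to lower to sv2('N', ..., 'O')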
24 | (\)(adjA::Adjoint{<:Any, <:AbstractTriangular{T,<:CuSparseMatrix{T}}},B::CuVector{T}) where T = sv2('C',parent(adjA),B,'O') 25 | (\)(A::AbstractTriangular{T,CuSparseMatrixHYB{T}},B::CuVector{T}) where T = sv('N',A,B,'O') 26 | (\)(transA::Transpose{<:Any, AbstractTriangular{T,CuSparseMatrixHYB{T}}},B::CuVector{T}) where T = sv('T',parent(transA),B,'O') 27 | (\)(adjA::Adjoint{<:Any, AbstractTriangular{T,CuSparseMatrixHYB{T}}},B::CuVector{T}) where T = sv('C',parent(adjA),B,'O') 28 | 29 | (+)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,B,'O','O','O') 30 | (-)(A::Union{CuSparseMatrixCSR,CuSparseMatrixCSC},B::Union{CuSparseMatrixCSR,CuSparseMatrixCSC}) = geam(A,-one(eltype(A)),B,'O','O','O') 31 | -------------------------------------------------------------------------------- /src/CuArrays.jl: -------------------------------------------------------------------------------- 1 | __precompile__() 2 | 3 | module CuArrays 4 | 5 | using CUDAdrv, CUDAnative 6 | 7 | using GPUArrays 8 | 9 | export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, cuzeros, cuones, cufill 10 | 11 | import LinearAlgebra 12 | 13 | using Adapt 14 | 15 | const ext = joinpath(dirname(@__DIR__), "deps", "ext.jl") 16 | isfile(ext) || error("CuArrays.jl has not been built, please run Pkg.build(\"CuArrays\").") 17 | include(ext) 18 | if !configured 19 | # default (non-functional) values for critical variables, 20 | # making it possible to _load_ the package at all times. 21 | const libcublas = nothing 22 | const libcusparse = nothing 23 | const libcusolver = nothing 24 | const libcufft = nothing 25 | const libcurand = nothing 26 | const libcudnn = nothing 27 | end 28 | 29 | include("memory.jl") 30 | include("array.jl") 31 | include("subarray.jl") 32 | include("utils.jl") 33 | include("indexing.jl") 34 | include("broadcast.jl") 35 | include("matmul.jl") 36 | include("mapreduce.jl") 37 | 38 | include("gpuarray_interface.jl") 39 | 40 | # many libraries need to be initialized per-device (per-context, really, but we assume users 41 | # of CuArrays and/or CUDAnative only use a single context), so keep track of the active one. 42 | const active_context = Ref{CuContext}() 43 | 44 | libcublas !== nothing && include("blas/CUBLAS.jl") 45 | libcusparse !== nothing && include("sparse/CUSPARSE.jl") 46 | libcusolver !== nothing && include("solver/CUSOLVER.jl") 47 | libcufft !== nothing && include("fft/CUFFT.jl") 48 | libcurand !== nothing && include("rand/CURAND.jl") 49 | libcudnn !== nothing && include("dnn/CUDNN.jl") 50 | 51 | include("nnlib.jl") 52 | 53 | include("deprecated.jl") 54 | 55 | function __init__() 56 | if !configured 57 | @warn("CuArrays.jl has not been successfully built, and will not work properly.") 58 | @warn("Please run Pkg.build(\"CuArrays\") and restart Julia.") 59 | return 60 | end 61 | 62 | function check_library(name, path) 63 | path === nothing && return 64 | if !ispath(path) 65 | error("$name library has changed. 
Please run Pkg.build(\"CuArrays\") and restart Julia.") 66 | end 67 | end 68 | check_library("CUBLAS", libcublas) 69 | check_library("CUSPARSE", libcusparse) 70 | check_library("CUSOLVER", libcusolver) 71 | check_library("CUFFT", libcufft) 72 | check_library("CURAND", libcurand) 73 | check_library("CUDNN", libcudnn) 74 | 75 | # update the active context when we switch devices 76 | callback = (::CuDevice, ctx::CuContext) -> begin 77 | active_context[] = ctx 78 | 79 | # wipe the active handles 80 | isdefined(CuArrays, :CUBLAS) && (CUBLAS._handle[] = C_NULL) 81 | isdefined(CuArrays, :CUSOLVER) && (CUSOLVER._dense_handle[] = C_NULL) 82 | isdefined(CuArrays, :CURAND) && (CURAND._generator[] = nothing) 83 | isdefined(CuArrays, :CUDNN) && (CUDNN._handle[] = C_NULL) 84 | end 85 | push!(CUDAnative.device!_listeners, callback) 86 | 87 | # a device might be active already 88 | existing_ctx = CUDAdrv.CuCurrentContext() 89 | if existing_ctx !== nothing 90 | active_context[] = existing_ctx 91 | end 92 | 93 | __init_memory__() 94 | end 95 | 96 | end # module 97 | -------------------------------------------------------------------------------- /src/matmul.jl: -------------------------------------------------------------------------------- 1 | using LinearAlgebra 2 | 3 | 4 | function generic_matmatmul!(C::AbstractVecOrMat{R}, A::AbstractVecOrMat{T}, B::AbstractVecOrMat{S}) where {T,S,R} 5 | if size(A,2) != size(B,1) 6 | throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) 7 | end 8 | if size(C,1) != size(A,1) || size(C,2) != size(B,2) 9 | throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((size(A,1),size(B,2)))")) 10 | end 11 | if isempty(A) || isempty(B) 12 | return fill!(C, zero(R)) 13 | end 14 | 15 | function kernel(C, A, B) 16 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 17 | j = (blockIdx().y-1) * blockDim().y + threadIdx().y 18 | 19 | if i <= size(A,1) && j <= size(B,2) 20 | z2 = zero(A[i, 1]*B[1, j] + A[i, 1]*B[1, j]) 21 | Ctmp = convert(promote_type(R, typeof(z2)), z2) 22 | for k in 1:size(A,2) 23 | Ctmp += A[i, k]*B[k, j] 24 | end 25 | C[i,j] = Ctmp 26 | end 27 | 28 | return 29 | end 30 | 31 | max_threads = 256 32 | threads_x = min(max_threads, size(C,1)) 33 | threads_y = min(max_threads ÷ threads_x, size(C,2)) 34 | threads = (threads_x, threads_y) 35 | blocks = ceil.(Int, (size(C,1), size(C,2)) ./ threads) 36 | 37 | @cuda threads=threads blocks=blocks kernel(C, A, B) 38 | 39 | C 40 | end 41 | 42 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 43 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 44 | LinearAlgebra.mul!(C::CuVecOrMat, A::CuVecOrMat, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 45 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 46 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::CuVecOrMat) = generic_matmatmul!(C, A, B) 47 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 48 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 49 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}, 
B::LinearAlgebra.Adjoint{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 50 | LinearAlgebra.mul!(C::CuVecOrMat, A::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}, B::LinearAlgebra.Transpose{<:Any, <:CuVecOrMat}) = generic_matmatmul!(C, A, B) 51 | 52 | 53 | function generic_rmul!(X::CuArray, s::Number) 54 | function kernel(X, s) 55 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 56 | @inbounds X[i] *= s 57 | return 58 | end 59 | @cuda blocks=length(X) kernel(X, s) 60 | X 61 | end 62 | 63 | LinearAlgebra.rmul!(A::CuArray, b::Number) = generic_rmul!(A, b) 64 | 65 | 66 | function generic_lmul!(s::Number, X::CuArray) 67 | function kernel(s, X) 68 | i = (blockIdx().x-1) * blockDim().x + threadIdx().x 69 | @inbounds X[i] = s*X[i] 70 | return 71 | end 72 | @cuda blocks=length(X) kernel(s, X) 73 | X 74 | end 75 | 76 | LinearAlgebra.lmul!(a::Number, B::CuArray) = generic_lmul!(a, B) 77 | -------------------------------------------------------------------------------- /src/blas/libcublas_types.jl: -------------------------------------------------------------------------------- 1 | # libcublas_types.jl 2 | # 3 | # Initially generated with wrap_c from Clang.jl. Modified to remove anonymous 4 | # enums and add cublasContext. 5 | # 6 | # Author: Nick Henderson 7 | # Created: 2014-08-27 8 | # License: MIT 9 | # 10 | 11 | # begin enum cublasStatus_t 12 | const cublasStatus_t = UInt32 13 | const CUBLAS_STATUS_SUCCESS = 0 14 | const CUBLAS_STATUS_NOT_INITIALIZED = 1 15 | const CUBLAS_STATUS_ALLOC_FAILED = 3 16 | const CUBLAS_STATUS_INVALID_VALUE = 7 17 | const CUBLAS_STATUS_ARCH_MISMATCH = 8 18 | const CUBLAS_STATUS_MAPPING_ERROR = 11 19 | const CUBLAS_STATUS_EXECUTION_FAILED = 13 20 | const CUBLAS_STATUS_INTERNAL_ERROR = 14 21 | const CUBLAS_STATUS_NOT_SUPPORTED = 15 22 | const CUBLAS_STATUS_LICENSE_ERROR = 16 23 | # end enum cublasStatus_t 24 | # begin enum cublasFillMode_t 25 | const cublasFillMode_t = UInt32 26 | const CUBLAS_FILL_MODE_LOWER = 0 27 | const CUBLAS_FILL_MODE_UPPER = 1 28 | # end enum cublasFillMode_t 29 | # begin enum cublasDiagType_t 30 | const cublasDiagType_t = UInt32 31 | const CUBLAS_DIAG_NON_UNIT = 0 32 | const CUBLAS_DIAG_UNIT = 1 33 | # end enum cublasDiagType_t 34 | # begin enum cublasSideMode_t 35 | const cublasSideMode_t = UInt32 36 | const CUBLAS_SIDE_LEFT = 0 37 | const CUBLAS_SIDE_RIGHT = 1 38 | # end enum cublasSideMode_t 39 | # begin enum cublasOperation_t 40 | const cublasOperation_t = UInt32 41 | const CUBLAS_OP_N = 0 42 | const CUBLAS_OP_T = 1 43 | const CUBLAS_OP_C = 2 44 | # end enum cublasOperation_t 45 | # begin enum cublasPointerMode_t 46 | const cublasPointerMode_t = UInt32 47 | const CUBLAS_POINTER_MODE_HOST = 0 48 | const CUBLAS_POINTER_MODE_DEVICE = 1 49 | # end enum cublasPointerMode_t 50 | # begin enum cublasAtomicsMode_t 51 | const cublasAtomicsMode_t = UInt32 52 | const CUBLAS_ATOMICS_NOT_ALLOWED = 0 53 | const CUBLAS_ATOMICS_ALLOWED = 1 54 | # end enum cublasAtomicsMode_t 55 | const cublasContext = Nothing 56 | const cublasHandle_t = Ptr{cublasContext} 57 | # complex numbers in cuda 58 | const cuComplex = Complex{Float32} 59 | const cuDoubleComplex = Complex{Float64} 60 | # complex types from Base/linalg.jl 61 | const CublasFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 62 | const CublasReal = Union{Float64,Float32} 63 | const CublasComplex = Union{ComplexF64,ComplexF32} 64 | # FP16 (cuda_fp16.h) in cuda 65 | const __half = Float16 66 | struct __half2 67 | x1::__half 68 | x2::__half 69 | end 70 | 71 | if CUDAdrv.version() >= v"0.7.5" 72 | # 
specify which GEMM algorithm to use in cublasGemmEx() (CUDA 7.5+) 73 | const cublasGemmAlgo_t = Int32 74 | const CUBLAS_GEMM_DFALT = -1 75 | const CUBLAS_GEMM_ALGO0 = 0 76 | const CUBLAS_GEMM_ALGO1 = 1 77 | const CUBLAS_GEMM_ALGO2 = 2 78 | const CUBLAS_GEMM_ALGO3 = 3 79 | const CUBLAS_GEMM_ALGO4 = 4 80 | const CUBLAS_GEMM_ALGO5 = 5 81 | const CUBLAS_GEMM_ALGO6 = 6 82 | const CUBLAS_GEMM_ALGO7 = 7 83 | # specify which DataType to use with cublasgemmEx() and cublasGemmEx() (CUDA 7.5+) functions 84 | const cudaDataType_t = UInt32 85 | const CUDA_R_16F = UInt32(2) 86 | const CUDA_C_16F = UInt32(6) 87 | const CUDA_R_32F = UInt32(0) 88 | const CUDA_C_32F = UInt32(4) 89 | const CUDA_R_64F = UInt32(1) 90 | const CUDA_C_64F = UInt32(5) 91 | const CUDA_R_8I = UInt32(3) 92 | const CUDA_C_8I = UInt32(7) 93 | const CUDA_R_8U = UInt32(8) 94 | const CUDA_C_8U = UInt32(9) 95 | const CUDA_R_32I = UInt32(10) 96 | const CUDA_C_32I = UInt32(11) 97 | const CUDA_R_32U = UInt32(12) 98 | const CUDA_C_32U = UInt32(13) 99 | end 100 | 101 | @enum CUBLASMathMode::Cint begin 102 | CUBLAS_DEFAULT_MATH = 0 103 | CUBLAS_TENSOR_OP_MATH = 1 104 | end 105 | -------------------------------------------------------------------------------- /src/mapreduce.jl: -------------------------------------------------------------------------------- 1 | using CuArrays: @cuindex, cudims 2 | 3 | function mapreducedim_kernel_serial(f, op, R, A, range) 4 | I = @cuindex R 5 | newrange = map((r, i) -> r === nothing ? i : r, range, I) 6 | for I′ in CartesianIndices(newrange) 7 | @inbounds R[I...] = op(R[I...], f(A[I′])) 8 | end 9 | return 10 | end 11 | 12 | @inline function reduce_block(arr::CuDeviceArray, op) 13 | sync_threads() 14 | len = blockDim().x 15 | while len != 1 16 | sync_threads() 17 | skip = (len + 1) >> 1 18 | reduce_to = threadIdx().x - skip 19 | if 0 < reduce_to <= (len >> 1) 20 | arr[reduce_to] = op(arr[reduce_to], arr[threadIdx().x]) 21 | end 22 | len = skip 23 | end 24 | sync_threads() 25 | end 26 | 27 | function mapreducedim_kernel_parallel(f, op, R::CuDeviceArray{T}, A::CuDeviceArray{T}, 28 | CIS, Rlength, Slength) where {T} 29 | for Ri_base in 0:(gridDim().x * blockDim().y):(Rlength-1) 30 | Ri = Ri_base + (blockIdx().x - 1) * blockDim().y + threadIdx().y 31 | Ri > Rlength && return 32 | RI = Tuple(CartesianIndices(R)[Ri]) 33 | S = @cuStaticSharedMem(T, 512) 34 | Si_folded_base = (threadIdx().y - 1) * blockDim().x 35 | Si_folded = Si_folded_base + threadIdx().x 36 | # serial reduction of A into S by Slength ÷ xthreads 37 | for Si_base in 0:blockDim().x:(Slength-1) 38 | Si = Si_base + threadIdx().x 39 | Si > Slength && break 40 | SI = Tuple(CIS[Si]) 41 | AI = ifelse.(size(R) .== 1, SI, RI) 42 | if Si_base == 0 43 | S[Si_folded] = f(A[AI...]) 44 | else 45 | S[Si_folded] = op(S[Si_folded], f(A[AI...])) 46 | end 47 | end 48 | # block-parallel reduction of S to S[1] by xthreads 49 | reduce_block(view(S, (Si_folded_base + 1):512), op) 50 | # reduce S[1] into R 51 | threadIdx().x == 1 && (R[Ri] = op(R[Ri], S[Si_folded])) 52 | end 53 | return 54 | end 55 | 56 | function Base._mapreducedim!(f, op, R::CuArray{T}, A::CuArray{T}) where {T} 57 | # the kernel as generated from `f` and `op` can require lots of registers (eg. #160), 58 | # so we need to be careful about how many threads we launch not to run out of them. 
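    # Editorial note expanding on the comment above: the arithmetic below keeps
    # x_thr * y_thr <= 512, so the 512-element shared-memory buffer S in
    # mapreducedim_kernel_parallel is never indexed past its end. Worked example,
    # assuming the kernel- and device-imposed limits are not binding: reducing a
    # 1024x1000 CuArray over the second dimension gives Rlength = 1024 and
    # Slength = 1000, hence y_thr = nextpow(2, 1024 ÷ 512 + 1) = 4 rows of R per
    # block, x_thr = 512 ÷ 4 = 128 serial reducers per row, and
    # blk = (1024 - 1) ÷ 4 + 1 = 256 blocks for the parallel kernel.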
59 | Rlength = length(R) 60 | Ssize = ifelse.(size(R) .== 1, size(A), 1) 61 | Slength = prod(Ssize) 62 | CIS = CartesianIndices(Ssize) 63 | 64 | parallel_args = (f, op, R, A, CIS, Rlength, Slength) 65 | GC.@preserve parallel_args begin 66 | parallel_kargs = cudaconvert.(parallel_args) 67 | parallel_tt = Tuple{Core.Typeof.(parallel_kargs)...} 68 | parallel_kernel = cufunction(mapreducedim_kernel_parallel, parallel_tt) 69 | 70 | # we are limited in how many threads we can launch... 71 | ## by the kernel 72 | kernel_threads = CUDAnative.maxthreads(parallel_kernel) 73 | ## by the device 74 | dev = CUDAdrv.device() 75 | block_threads = (x=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_X), 76 | y=attribute(dev, CUDAdrv.MAX_BLOCK_DIM_Y), 77 | total=attribute(dev, CUDAdrv.MAX_THREADS_PER_BLOCK)) 78 | 79 | # figure out a legal launch configuration 80 | y_thr = min(nextpow(2, Rlength ÷ 512 + 1), 512, block_threads.y, kernel_threads) 81 | x_thr = min(512 ÷ y_thr, Slength, block_threads.x, 82 | ceil(Int, block_threads.total/y_thr), 83 | ceil(Int, kernel_threads/y_thr)) 84 | 85 | if x_thr >= 8 86 | blk, thr = (Rlength - 1) ÷ y_thr + 1, (x_thr, y_thr, 1) 87 | parallel_kernel(parallel_kargs...; threads=thr, blocks=blk) 88 | else 89 | # not enough work, fall back to serial reduction 90 | range = ifelse.(length.(axes(R)) .== 1, axes(A), nothing) 91 | blk, thr = cudims(R) 92 | @cuda(blocks=blk, threads=thr, mapreducedim_kernel_serial(f, op, R, A, range)) 93 | end 94 | end 95 | 96 | return R 97 | end 98 | -------------------------------------------------------------------------------- /test/dnn.jl: -------------------------------------------------------------------------------- 1 | @testset "CUDNN" begin 2 | 3 | if !isdefined(CuArrays, :CUDNN) 4 | @warn "Not testing CUDNN" 5 | else 6 | using CuArrays.CUDNN 7 | @info "Testing CUDNN $(CUDNN.version())" 8 | 9 | @testset "NNlib" begin 10 | using NNlib 11 | using NNlib: ∇conv_data, ∇conv_filter, 12 | maxpool, meanpool, ∇maxpool, ∇meanpool, 13 | softmax, ∇softmax, logsoftmax, ∇logsoftmax 14 | 15 | @test testf(NNlib.conv, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4)) 16 | @test testf(∇conv_data, rand(Float64, 9, 9, 4, 1), rand(Float64, 2, 2, 3, 4)) 17 | @test testf(∇conv_filter, rand(Float64, 9, 9, 4, 1), rand(Float64, 10, 10, 3, 1)) 18 | @test testf(CuArrays.CUDNN.∇conv_bias!, cu(rand(Float64, 1, 1, 10, 1)), cu(rand(Float64, 10, 10, 10, 1))) 19 | 20 | @test testf(NNlib.conv, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4); dilation=2) 21 | @test testf(∇conv_data, rand(Float64, 8, 8, 4, 1), rand(Float64, 2, 2, 3, 4); dilation=2) 22 | @test testf(∇conv_filter, rand(Float64, 8, 8, 4, 1), rand(Float64, 10, 10, 3, 1); dilation=2) 23 | 24 | @test testf(NNlib.crosscor, rand(Float64, 10, 10, 3, 1), rand(Float64, 2, 2, 3, 4)) 25 | @test testf(∇conv_data, rand(Float64, 9, 9, 4, 1), rand(Float64, 2, 2, 3, 4); flipkernel=1) 26 | @test testf(∇conv_filter, rand(Float64, 9, 9, 4, 1), rand(Float64, 10, 10, 3, 1); flipkernel=1) 27 | 28 | @test_nowarn NNlib.conv!(cu(zeros(Float64, 9, 9, 3, 1)), cu(rand(Float64, 10, 10, 1, 1)), cu(rand(Float64, 2, 2, 1, 3)), algo=1) 29 | @test_nowarn NNlib.∇conv_data!(cu(zeros(Float64, 10, 10, 1, 1)), cu(ones(Float64, 9, 9, 3, 1)), cu(rand(Float64, 2, 2, 1, 3)), algo=1) 30 | @test_nowarn NNlib.∇conv_filter!(cu(zeros(Float64, 2, 2, 1, 3)), cu(ones(Float64, 9, 9, 3, 1)), cu(rand(Float64, 10, 10, 1, 1)), algo=1) 31 | 32 | @test testf(NNlib.conv, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4)) 33 | @test testf(∇conv_data, 
rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 2, 2, 2, 3, 4)) 34 | @test testf(∇conv_filter, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 10, 10, 10, 3, 1)) 35 | 36 | @test testf(NNlib.conv, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4); dilation=2) 37 | @test testf(∇conv_data, rand(Float64, 8, 8, 8, 4, 1), rand(Float64, 2, 2, 2, 3, 4); dilation=2) 38 | @test testf(∇conv_filter, rand(Float64, 8, 8, 8, 4, 1), rand(Float64, 10, 10, 10, 3, 1); dilation=2) 39 | 40 | @test testf(NNlib.crosscor, rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 2, 2, 2, 3, 4)) 41 | @test testf(∇conv_data, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 2, 2, 2, 3, 4); flipkernel=1) 42 | @test testf(∇conv_filter, rand(Float64, 9, 9, 9, 4, 1), rand(Float64, 10, 10, 10, 3, 1); flipkernel=1) 43 | 44 | @test testf(x -> maxpool(x, (2,2)), rand(Float64, 10, 10, 3, 1)) 45 | @test testf(x -> meanpool(x, (2,2)), rand(Float64, 10, 10, 3, 1)) 46 | @test testf((x, dy) -> ∇maxpool(dy, maxpool(x, (2,2)), x, (2,2)), rand(Float64, 10, 10, 3, 1), rand(Float64, 5, 5, 3, 1)) 47 | @test testf((x, dy) -> ∇meanpool(dy, meanpool(x, (2,2)), x, (2,2)), rand(Float64, 10, 10, 3, 1), rand(Float64, 5, 5, 3, 1)) 48 | 49 | @test testf(x -> maxpool(x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1)) 50 | @test testf(x -> meanpool(x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1)) 51 | @test testf((x, dy) -> ∇maxpool(dy, maxpool(x, (2,2,2)), x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 5, 5, 5, 3, 1)) 52 | @test testf((x, dy) -> ∇meanpool(dy, meanpool(x, (2,2,2)), x, (2,2,2)), rand(Float64, 10, 10, 10, 3, 1), rand(Float64, 5, 5, 5, 3, 1)) 53 | 54 | for dims in [(5,5), (5,)] 55 | @test testf(softmax, rand(Float64, dims)) 56 | @test testf(∇softmax, rand(Float64, dims), rand(Float64, dims)) 57 | @test testf(logsoftmax, rand(Float64, dims)) 58 | @test testf(∇logsoftmax, rand(Float64, dims), rand(Float64, dims)) 59 | end 60 | end 61 | 62 | @testset "Activations and Other Ops" begin 63 | @test testf(CuArrays.CUDNN.cudnnAddTensor, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 64 | @test testf(CuArrays.CUDNN.cudnnActivationForward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 65 | @test testf(CuArrays.CUDNN.cudnnActivationBackward, cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1)), cu(rand(Float64, 10, 10, 3, 1))) 66 | end 67 | 68 | end 69 | 70 | end 71 | -------------------------------------------------------------------------------- /src/solver/highlevel.jl: -------------------------------------------------------------------------------- 1 | # QR factorization 2 | 3 | struct CuQR{T,S<:AbstractMatrix} <: LinearAlgebra.Factorization{T} 4 | factors::S 5 | τ::CuVector{T} 6 | CuQR{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ) 7 | end 8 | 9 | struct CuQRPackedQ{T,S<:AbstractMatrix} <: LinearAlgebra.AbstractQ{T} 10 | factors::CuMatrix{T} 11 | τ::CuVector{T} 12 | CuQRPackedQ{T,S}(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T,S<:AbstractMatrix} = new(factors, τ) 13 | end 14 | 15 | CuQR(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQR{T,typeof(factors)}(factors, τ) 16 | CuQRPackedQ(factors::AbstractMatrix{T}, τ::CuVector{T}) where {T} = CuQRPackedQ{T,typeof(factors)}(factors, τ) 17 | 18 | LinearAlgebra.qr!(A::CuMatrix{T}) where T = CuQR(geqrf!(A::CuMatrix{T})...) 19 | Base.size(A::CuQR) = size(A.factors) 20 | Base.size(A::CuQRPackedQ, dim::Integer) = 0 < dim ? (dim <= 2 ? 
size(A.factors, 1) : 1) : throw(BoundsError()) 21 | CuArrays.CuMatrix(A::CuQRPackedQ) = orgqr!(copy(A.factors), A.τ) 22 | CuArrays.CuArray(A::CuQRPackedQ) = convert(CuMatrix, A) 23 | Base.Matrix(A::CuQRPackedQ) = Matrix(CuMatrix(A)) 24 | 25 | function Base.getproperty(A::CuQR, d::Symbol) 26 | m, n = size(getfield(A, :factors)) 27 | if d == :R 28 | return triu!(A.factors[1:min(m, n), 1:n]) 29 | elseif d == :Q 30 | return CuQRPackedQ(A.factors, A.τ) 31 | else 32 | getfield(A, d) 33 | end 34 | end 35 | 36 | # iteration for destructuring into components 37 | Base.iterate(S::CuQR) = (S.Q, Val(:R)) 38 | Base.iterate(S::CuQR, ::Val{:R}) = (S.R, Val(:done)) 39 | Base.iterate(S::CuQR, ::Val{:done}) = nothing 40 | 41 | # Apply changes Q from the left 42 | LinearAlgebra.lmul!(A::CuQRPackedQ{T,S}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 43 | ormqr!('L', 'N', A.factors, A.τ, B) 44 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Real, S<:CuMatrix} = 45 | ormqr!('L', 'T', parent(adjA).factors, parent(adjA).τ, B) 46 | LinearAlgebra.lmul!(adjA::Adjoint{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Complex, S<:CuMatrix} = 47 | ormqr!('L', 'C', parent(adjA).factors, parent(adjA).τ, B) 48 | LinearAlgebra.lmul!(trA::Transpose{T,<:CuQRPackedQ{T,S}}, B::CuVecOrMat{T}) where {T<:Number, S<:CuMatrix} = 49 | ormqr!('L', 'T', parent(trA).factors, parent(trA).τ, B) 50 | 51 | function Base.getindex(A::CuQRPackedQ{T, S}, i::Integer, j::Integer) where {T, S} 52 | x = CuArray{T}(undef, size(A, 2)) .= 0 53 | x[j] = 1 54 | lmul!(A, x) 55 | return _getindex(x, i) 56 | end 57 | 58 | function Base.show(io::IO, F::CuQR) 59 | println(io, "$(typeof(F)) with factors Q and R:") 60 | show(io, F.Q) 61 | println(io) 62 | show(io, F.R) 63 | end 64 | 65 | # Singular Value Decomposition 66 | 67 | struct CuSVD{T,Tr,A<:AbstractMatrix{T}} <: LinearAlgebra.Factorization{T} 68 | U::CuMatrix{T} 69 | S::CuVector{Tr} 70 | V::A 71 | end 72 | 73 | # iteration for destructuring into components 74 | Base.iterate(S::CuSVD) = (S.U, Val(:S)) 75 | Base.iterate(S::CuSVD, ::Val{:S}) = (S.S, Val(:V)) 76 | Base.iterate(S::CuSVD, ::Val{:V}) = (S.V, Val(:done)) 77 | Base.iterate(S::CuSVD, ::Val{:done}) = nothing 78 | 79 | @inline function Base.getproperty(S::CuSVD, s::Symbol) 80 | if s === :Vt 81 | return getfield(S, :V)' 82 | else 83 | return getfield(S, s) 84 | end 85 | end 86 | 87 | @enum SVDAlgorithm QRAlgorithm JacobiAlgorithm 88 | function LinearAlgebra.svd!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm; full::Bool=false) where T 89 | if method === QRAlgorithm 90 | U, s, Vt = gesvd!(full ? 'A' : 'S', full ? 'A' : 'S', A::CuMatrix{T}) 91 | return CuSVD(U, s, Vt') 92 | elseif method === JacobiAlgorithm 93 | return CuSVD(gesvdj!('V', Int(!full), A::CuMatrix{T})...) 
94 | end 95 | end 96 | # Once LinearAlgebra.svd(::AbstractMatrix) accepts kwargs this method can be deleted 97 | LinearAlgebra.svd(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm; full=false) = svd!(copy(A), method, full=full) 98 | 99 | function LinearAlgebra.svdvals!(A::CuMatrix{T}, method::SVDAlgorithm=JacobiAlgorithm) where T 100 | if method === QRAlgorithm 101 | return gesvd!('N', 'N', A::CuMatrix{T})[2] 102 | elseif method === JacobiAlgorithm 103 | return gesvdj!('N', 1, A::CuMatrix{T})[2] 104 | end 105 | end 106 | # Once LinearAlgebra.svdvals(::AbstractMatrix) accepts kwargs this method can be deleted 107 | LinearAlgebra.svdvals(A::CuMatrix, method::SVDAlgorithm=JacobiAlgorithm) = svdvals!(copy(A), method) 108 | -------------------------------------------------------------------------------- /src/rand/highlevel.jl: -------------------------------------------------------------------------------- 1 | # high-level interface for CURAND 2 | # 3 | # the interface is split in two levels: 4 | # - functions that extend the Random standard library, and take an RNG as first argument, 5 | # will only ever dispatch to CURAND and as a result are limited in the types they support. 6 | # - functions that take an array will dispatch to either CURAND or GPUArrays 7 | # - `cu`-prefixed functions are provided for constructing GPU arrays from only an eltype 8 | 9 | 10 | ## seeding 11 | 12 | seed!(rng::RNG=generator()) = generate_seeds(rng) 13 | 14 | 15 | ## in-place 16 | 17 | # uniform 18 | Random.rand!(rng::RNG, A::CuArray{Float32}) = generate_uniform(rng, A) 19 | Random.rand!(rng::RNG, A::CuArray{Float64}) = generate_uniform_double(rng, A) 20 | 21 | # normal 22 | Random.randn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_normal(rng, A, mean, stddev) 23 | Random.randn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_normal_double(rng, A, mean, stddev) 24 | 25 | # log-normal 26 | rand_logn!(rng::RNG, A::CuArray{Float32}; mean=0, stddev=1) = generate_log_normal(rng, A, mean, stddev) 27 | rand_logn!(rng::RNG, A::CuArray{Float64}; mean=0, stddev=1) = generate_log_normal_double(rng, A, mean, stddev) 28 | 29 | # log-normal 30 | rand_poisson!(rng::RNG, A::CuArray{Cuint}; lambda=1) = generate_poisson(rng, A, lambda) 31 | 32 | 33 | ## out of place 34 | 35 | Random.rand(rng::RNG, ::Type{X}, dims::Dims) where {X} = rand!(rng, CuArray{X}(undef, dims)) 36 | Random.randn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = randn!(rng, CuArray{X}(undef, dims); kwargs...) 37 | rand_logn(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_logn!(rng, CuArray{X}(undef, dims); kwargs...) 38 | rand_poisson(rng::RNG, ::Type{X}, dims::Dims; kwargs...) where {X} = rand_poisson!(rng, CuArray{X}(undef, dims); kwargs...) 39 | 40 | # specify default types 41 | Random.rand(rng::RNG, dims::Integer...; kwargs...) = rand(rng, Float32, dims...; kwargs...) 42 | Random.randn(rng::RNG, dims::Integer...; kwargs...) = randn(rng, Float32, dims...; kwargs...) 43 | rand_logn(rng::RNG, dims...; kwargs...) = rand_logn(rng, Float32, dims...; kwargs...) 44 | rand_poisson(rng::RNG, dims...; kwargs...) = rand_poisson(rng, Cuint, dims...; kwargs...) 45 | 46 | # convenience 47 | Random.randn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 48 | randn(rng, X, Dims((dim1, dims...)); kwargs...) 49 | rand_logn(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 50 | rand_logn(rng, X, Dims((dim1, dims...)); kwargs...) 
51 | rand_poisson(rng::RNG, ::Type{X}, dim1::Integer, dims::Integer...; kwargs...) where {X} = 52 | rand_poisson(rng, X, Dims((dim1, dims...)); kwargs...) 53 | 54 | 55 | ## functions that dispatch to either CURAND or GPUArrays 56 | 57 | uniform_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 58 | uniform_rng(A::CuArray) = GPUArrays.global_rng(A) 59 | 60 | normal_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 61 | normal_rng(::CuArray{T}) where {T} = 62 | error("CuArrays does not support generating normally distributed numbers of type $T") 63 | 64 | logn_rng(::CuArray{<:Union{Float32,Float64}}) = generator() 65 | logn_rng(::CuArray{T}) where {T} = 66 | error("CuArrays does not support generating lognormally distributed numbers of type $T") 67 | 68 | poisson_rng(::CuArray{Cuint}) = generator() 69 | poisson_rng(::CuArray{T}) where {T} = 70 | error("CuArrays does not support generating Poisson distributed numbers of type $T") 71 | 72 | 73 | Random.rand!(A::CuArray; kwargs...) = rand!(uniform_rng(A), A; kwargs...) 74 | Random.randn!(A::CuArray; kwargs...) = randn!(normal_rng(A), A; kwargs...) 75 | rand_logn!(A::CuArray; kwargs...) = rand_logn!(logn_rng(A), A; kwargs...) 76 | rand_poisson!(A::CuArray; kwargs...) = rand_poisson!(poisson_rng(A), A; kwargs...) 77 | 78 | 79 | # need to prefix with `cu` to disambiguate from Random functions that return an Array 80 | # TODO: `@gpu rand` with Cassette 81 | curand(::Type{X}, args...; kwargs...) where {X} = rand!(CuArray{X}(undef, args...); kwargs...) 82 | curandn(::Type{X}, args...; kwargs...) where {X} = randn!(CuArray{X}(undef, args...); kwargs...) 83 | curand_logn(::Type{X}, args...; kwargs...) where {X} = rand_logn!(CuArray{X}(undef, args...); kwargs...) 84 | curand_poisson(::Type{X}, args...; kwargs...) where {X} = rand_poisson!(CuArray{X}(undef, args...); kwargs...) 85 | 86 | # specify default types 87 | curand(args...; kwargs...) where {X} = curand(Float32, args...; kwargs...) 88 | curandn(args...; kwargs...) where {X} = curandn(Float32, args...; kwargs...) 89 | curand_logn(args...; kwargs...) where {X} = curand_logn(Float32, args...; kwargs...) 90 | curand_poisson(args...; kwargs...) where {X} = curand_poisson(Cuint, args...; kwargs...) 
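# Editorial sketch, not part of the original file: a hypothetical smoke test
# touching the three API layers described at the top of this file. It assumes a
# working CURAND setup and is only defined, never run, at include time.
function _curand_demo()
    A = CuArray{Float32}(undef, 4, 4)
    Random.rand!(A)                    # array method; dispatches via uniform_rng(A)
    B = curandn(Float32, 4, 4)         # `cu`-prefixed constructor, standard normal
    C = rand_poisson(generator(), Cuint, (4, 4); lambda=5)  # RNG-first method
    return A, B, C
end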
91 | -------------------------------------------------------------------------------- /src/sparse/libcusparse.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSPARSE library 2 | 3 | #helper functions 4 | function cusparseCreate() 5 | handle = Ref{cusparseHandle_t}() 6 | @check ccall( (:cusparseCreate, libcusparse), cusparseStatus_t, (Ptr{cusparseHandle_t},), handle) 7 | handle[] 8 | end 9 | 10 | function cusparseDestroy(handle) 11 | @check ccall( (:cusparseDestroy, libcusparse), cusparseStatus_t, (cusparseHandle_t,), handle) 12 | end 13 | 14 | function cusparseGetVersion(handle, version) 15 | @check ccall( (:cusparseGetVersion, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{Cint}), handle, version) 16 | end 17 | 18 | function cusparseSetStream(handle, streamId) 19 | @check ccall( (:cusparseSetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, CuStream_t), handle, streamId) 20 | end 21 | 22 | function cusparseGetStream(handle, streamId) 23 | @check ccall( (:cusparseGetStream, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{CuStream_t}), handle, streamId) 24 | end 25 | 26 | function cusparseGetPointerMode(handle, mode) 27 | @check ccall( (:cusparseGetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, Ptr{cusparsePointerMode_t}), handle, mode) 28 | end 29 | 30 | function cusparseSetPointerMode(handle, mode) 31 | @check ccall( (:cusparseSetPointerMode, libcusparse), cusparseStatus_t, (cusparseHandle_t, cusparsePointerMode_t), handle, mode) 32 | end 33 | 34 | function cusparseCreateHybMat(hybA) 35 | @check ccall( (:cusparseCreateHybMat, libcusparse), cusparseStatus_t, (Ptr{cusparseHybMat_t},), hybA) 36 | end 37 | 38 | function cusparseDestroyHybMat(hybA) 39 | @check ccall( (:cusparseDestroyHybMat, libcusparse), cusparseStatus_t, (cusparseHybMat_t,), hybA) 40 | end 41 | 42 | function cusparseCreateSolveAnalysisInfo(info) 43 | @check ccall( (:cusparseCreateSolveAnalysisInfo, libcusparse), cusparseStatus_t, (Ptr{cusparseSolveAnalysisInfo_t},), info) 44 | end 45 | 46 | function cusparseDestroySolveAnalysisInfo(info) 47 | @check ccall( (:cusparseDestroySolveAnalysisInfo, libcusparse), cusparseStatus_t, (cusparseSolveAnalysisInfo_t,), info) 48 | end 49 | 50 | function cusparseCreateBsrsm2Info(info) 51 | @check ccall( (:cusparseCreateBsrsm2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsm2Info_t},), info) 52 | end 53 | 54 | function cusparseDestroyBsrsm2Info(info) 55 | @check ccall( (:cusparseDestroyBsrsm2Info, libcusparse), cusparseStatus_t, (bsrsm2Info_t,), info) 56 | end 57 | 58 | function cusparseCreateBsrsv2Info(info) 59 | @check ccall( (:cusparseCreateBsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{bsrsv2Info_t},), info) 60 | end 61 | 62 | function cusparseDestroyBsrsv2Info(info) 63 | @check ccall( (:cusparseDestroyBsrsv2Info, libcusparse), cusparseStatus_t, (bsrsv2Info_t,), info) 64 | end 65 | 66 | function cusparseCreateCsrsv2Info(info) 67 | @check ccall( (:cusparseCreateCsrsv2Info, libcusparse), cusparseStatus_t, (Ptr{csrsv2Info_t},), info) 68 | end 69 | 70 | function cusparseDestroyCsrsv2Info(info) 71 | @check ccall( (:cusparseDestroyCsrsv2Info, libcusparse), cusparseStatus_t, (csrsv2Info_t,), info) 72 | end 73 | 74 | function cusparseCreateCsric02Info(info) 75 | @check ccall( (:cusparseCreateCsric02Info, libcusparse), cusparseStatus_t, (Ptr{csric02Info_t},), info) 76 | end 77 | 78 | function cusparseDestroyCsric02Info(info) 79 | @check ccall( (:cusparseDestroyCsric02Info, libcusparse), 
cusparseStatus_t, (csric02Info_t,), info) 80 | end 81 | 82 | function cusparseCreateCsrilu02Info(info) 83 | @check ccall( (:cusparseCreateCsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{csrilu02Info_t},), info) 84 | end 85 | 86 | function cusparseDestroyCsrilu02Info(info) 87 | @check ccall( (:cusparseDestroyCsrilu02Info, libcusparse), cusparseStatus_t, (csrilu02Info_t,), info) 88 | end 89 | 90 | function cusparseCreateBsric02Info(info) 91 | @check ccall( (:cusparseCreateBsric02Info, libcusparse), cusparseStatus_t, (Ptr{bsric02Info_t},), info) 92 | end 93 | 94 | function cusparseDestroyBsric02Info(info) 95 | @check ccall( (:cusparseDestroyBsric02Info, libcusparse), cusparseStatus_t, (bsric02Info_t,), info) 96 | end 97 | 98 | function cusparseCreateBsrilu02Info(info) 99 | @check ccall( (:cusparseCreateBsrilu02Info, libcusparse), cusparseStatus_t, (Ptr{bsrilu02Info_t},), info) 100 | end 101 | 102 | function cusparseDestroyBsrilu02Info(info) 103 | @check ccall( (:cusparseDestroyBsrilu02Info, libcusparse), cusparseStatus_t, (bsrilu02Info_t,), info) 104 | end 105 | 106 | function cusparseGetProperty(property::CUDAapi.libraryPropertyType) 107 | value_ref = Ref{Cint}() 108 | @check ccall((:cusparseGetProperty, libcusparse), 109 | cusparseStatus_t, 110 | (Cint, Ptr{Cint}), 111 | property, value_ref) 112 | value_ref[] 113 | end 114 | -------------------------------------------------------------------------------- /src/dnn/nnlib.jl: -------------------------------------------------------------------------------- 1 | using NNlib 2 | import NNlib: conv!, ∇conv_filter!, ∇conv_data!, 3 | maxpool!, meanpool!, ∇maxpool!, ∇meanpool!, 4 | softmax, softmax!, ∇softmax!, logsoftmax, logsoftmax!, ∇logsoftmax 5 | import ..CuArrays: CuVecOrMat, CuVector 6 | using CUDAnative 7 | 8 | 9 | # Softmax 10 | 11 | const CUDNNFloat = Union{Float16,Float32,Float64} 12 | 13 | reshape4D(x::AbstractVector) = reshape(x, 1, 1, length(x), 1) 14 | reshape4D(x::AbstractMatrix) = reshape(x, 1, 1, size(x)...) 
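# Editorial note, not part of the original file: cuDNN's softmax entry points
# operate on 4-D NCHW tensors, so the helpers above lift vectors and matrices
# into that layout: features land in the C dimension and batch columns in N.
#
#   size(reshape4D(rand(Float32, 3)))     # (1, 1, 3, 1)
#   size(reshape4D(rand(Float32, 3, 5)))  # (1, 1, 3, 5)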
15 | 16 | function softmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 17 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out)) 18 | return out 19 | end 20 | 21 | function ∇softmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 22 | cudnnSoftmaxBackward(reshape4D(softmax(xs)), reshape4D(Δ), reshape4D(out)) 23 | return out 24 | end 25 | 26 | function logsoftmax!(out::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 27 | cudnnSoftmaxForward(reshape4D(xs), reshape4D(out), algorithm=CUDNN_SOFTMAX_LOG) 28 | return out 29 | end 30 | 31 | function ∇logsoftmax!(out::CuVecOrMat{T}, Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat 32 | cudnnSoftmaxBackward(reshape4D(logsoftmax(xs)), reshape4D(Δ), reshape4D(out); 33 | algorithm=CUDNN_SOFTMAX_LOG) 34 | return out 35 | end 36 | 37 | ∇logsoftmax(Δ::CuVecOrMat{T}, xs::CuVecOrMat{T}) where T<:CUDNNFloat = 38 | ∇logsoftmax!(similar(xs), Δ, xs) 39 | 40 | 41 | # Convolution 42 | 43 | function conv!(y::CuArray{T}, x::CuArray{T}, w::CuArray{T}; 44 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 45 | if version() < v"6" 46 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 47 | end 48 | 49 | workspace_size = 50 | cudnnGetConvolutionForwardWorkspaceSize(y, x, w, padding=pad, stride=stride, dilation=dilation, 51 | algo=algo, mode=flipkernel) 52 | 53 | CuVector{UInt8}(undef, workspace_size) do workspace 54 | cudnnConvolutionForward(y, x, w, padding=pad, stride=stride, dilation=dilation, mode=flipkernel, 55 | alpha=alpha, algo=algo, workspace=workspace, workspace_size=workspace_size) 56 | end 57 | end 58 | 59 | function ∇conv_filter!(dw::CuArray{T}, dy::CuArray{T}, x::CuArray{T}; 60 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 61 | if version() < v"6" 62 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 63 | end 64 | 65 | workspace_size = 66 | cudnnGetConvolutionBackwardFilterWorkspaceSize(dw, x, dy, padding=pad, stride=stride, 67 | dilation=dilation, algo=algo, mode=flipkernel) 68 | 69 | CuVector{UInt8}(undef, workspace_size) do workspace 70 | cudnnConvolutionBackwardFilter(dw, x, dy, padding=pad, stride=stride, dilation=dilation, 71 | mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace, 72 | workspace_size=workspace_size) 73 | end 74 | end 75 | 76 | function ∇conv_data!(dx::CuArray{T}, dy::CuArray{T}, w::CuArray{T}; 77 | pad=0, stride=1, flipkernel=0, alpha=1, dilation=1, algo=0) where T<:CUDNNFloat 78 | if version() < v"6" 79 | all(x -> x == 1, dilation) || error("Only dilation = 1 is supported in cuDNN version < 6") 80 | end 81 | 82 | workspace_size = 83 | cudnnGetConvolutionBackwardDataWorkspaceSize(dx, w, dy, padding=pad, stride=stride, 84 | dilation=dilation, algo=algo, mode=flipkernel) 85 | CuVector{UInt8}(undef, workspace_size) do workspace 86 | cudnnConvolutionBackwardData(dx, w, dy, padding=pad, stride=stride, dilation=dilation, 87 | mode=flipkernel, alpha=alpha, algo=algo, workspace=workspace, 88 | workspace_size=workspace_size) 89 | end 90 | end 91 | 92 | ∇conv_bias!(db::CuArray{T}, dy::CuArray{T}; alpha=1, beta=0) where T<:CUDNNFloat = 93 | cudnnConvolutionBackwardBias(db, dy, alpha=alpha, beta=beta) 94 | 95 | maxpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 96 | cudnnPoolingForward(y, x, window=k, padding=pad, stride=stride, mode=0) 97 | 98 | ∇maxpool!(dx::CuArray{T}, dy::CuArray{T}, 
y::CuArray{T}, x::CuArray{T}, k; 99 | pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 100 | cudnnPoolingBackward(dx, dy, x, y, window=k, padding=pad, stride=stride, mode=0) 101 | 102 | meanpool!(y::CuArray{T}, x::CuArray{T}, k; pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 103 | cudnnPoolingForward(y, x, window=k, padding=pad, stride=stride, mode=1) 104 | 105 | ∇meanpool!(dx::CuArray{T}, dy::CuArray{T}, y::CuArray{T}, x::CuArray{T}, k; 106 | pad=map(_->0,k), stride=k) where T<:CUDNNFloat = 107 | cudnnPoolingBackward(dx, dy, x, y, window=k, padding=pad, stride=stride, mode=1) 108 | -------------------------------------------------------------------------------- /test/sparse_solver.jl: -------------------------------------------------------------------------------- 1 | @testset "CUSPARSE + CUSOLVER" begin 2 | 3 | if isdefined(CuArrays, :CUSPARSE) && isdefined(CuArrays, :CUSOLVER) 4 | using CuArrays.CUSOLVER 5 | using CuArrays.CUSPARSE 6 | 7 | using LinearAlgebra 8 | using SparseArrays 9 | 10 | m = 15 11 | n = 10 12 | l = 13 13 | k = 1 14 | 15 | @testset for elty in [Float32, Float64, ComplexF32, ComplexF64] 16 | @testset "csrlsvlu!" begin 17 | A = sparse(rand(elty,n,n)) 18 | b = rand(elty,n) 19 | x = zeros(elty,n) 20 | tol = convert(real(elty),1e-6) 21 | x = CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 22 | @test x ≈ Array(A)\b 23 | A = sparse(rand(elty,m,n)) 24 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 25 | A = sparse(rand(elty,n,n)) 26 | b = rand(elty,m) 27 | x = zeros(elty,n) 28 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 29 | b = rand(elty,n) 30 | x = zeros(elty,m) 31 | @test_throws DimensionMismatch CUSOLVER.csrlsvlu!(A,b,x,tol,one(Cint),'O') 32 | end 33 | 34 | @testset "csrlsvqr!" begin 35 | A = sparse(rand(elty,n,n)) 36 | d_A = CuSparseMatrixCSR(A) 37 | b = rand(elty,n) 38 | d_b = CuArray(b) 39 | x = zeros(elty,n) 40 | d_x = CuArray(x) 41 | tol = convert(real(elty),1e-4) 42 | d_x = CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 43 | h_x = collect(d_x) 44 | @test h_x ≈ Array(A)\b 45 | A = sparse(rand(elty,m,n)) 46 | d_A = CuSparseMatrixCSR(A) 47 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 48 | A = sparse(rand(elty,n,n)) 49 | b = rand(elty,m) 50 | x = zeros(elty,n) 51 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 52 | b = rand(elty,n) 53 | x = zeros(elty,m) 54 | @test_throws DimensionMismatch CUSOLVER.csrlsvqr!(d_A,d_b,d_x,tol,one(Cint),'O') 55 | end 56 | 57 | @testset "csrlsvchol!" 
begin 58 | A = rand(elty,n,n) 59 | A = sparse(A*A') #posdef 60 | d_A = CuSparseMatrixCSR(A) 61 | b = rand(elty,n) 62 | d_b = CuArray(b) 63 | x = zeros(elty,n) 64 | d_x = CuArray(x) 65 | tol = 10^2*eps(real(elty)) 66 | d_x = CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 67 | h_x = collect(d_x) 68 | @test h_x ≈ Array(A)\b 69 | b = rand(elty,m) 70 | d_b = CuArray(b) 71 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 72 | b = rand(elty,n) 73 | d_b = CuArray(b) 74 | x = rand(elty,m) 75 | d_x = CuArray(x) 76 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 77 | A = sparse(rand(elty,m,n)) 78 | d_A = CuSparseMatrixCSR(A) 79 | @test_throws DimensionMismatch CUSOLVER.csrlsvchol!(d_A,d_b,d_x,tol,zero(Cint),'O') 80 | end 81 | 82 | @testset "csreigvsi" begin 83 | A = sparse(rand(elty,n,n)) 84 | d_A = CuSparseMatrixCSR(A) 85 | evs = eigvals(Array(A)) 86 | x_0 = CuArray(rand(elty,n)) 87 | μ,x = CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 88 | @test μ ≈ evs[1] 89 | A = sparse(rand(elty,m,n)) 90 | d_A = CuSparseMatrixCSR(A) 91 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 92 | A = sparse(rand(elty,n,n)) 93 | d_A = CuSparseMatrixCSR(A) 94 | x_0 = CuArray(rand(elty,m)) 95 | @test_throws DimensionMismatch CUSOLVER.csreigvsi(d_A,convert(elty,evs[1]),x_0,convert(real(elty),1e-6),convert(Cint,1000),'O') 96 | end 97 | @testset "csreigs" begin 98 | celty = complex(elty) 99 | A = rand(real(elty),n,n) 100 | A = sparse(A + A') 101 | num = CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 102 | @test num <= n 103 | A = sparse(rand(celty,m,n)) 104 | d_A = CuSparseMatrixCSR(A) 105 | @test_throws DimensionMismatch CUSOLVER.csreigs(A,convert(celty,complex(-100,-100)),convert(celty,complex(100,100)),'O') 106 | end 107 | @testset "csrlsqvqr!" begin 108 | A = sparse(rand(elty,n,n)) 109 | b = rand(elty,n) 110 | x = zeros(elty,n) 111 | tol = convert(real(elty),1e-4) 112 | x = CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 113 | @test x[1] ≈ Array(A)\b 114 | A = sparse(rand(elty,n,m)) 115 | x = zeros(elty,n) 116 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 117 | A = sparse(rand(elty,n,n)) 118 | b = rand(elty,m) 119 | x = zeros(elty,n) 120 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 121 | b = rand(elty,n) 122 | x = zeros(elty,m) 123 | @test_throws DimensionMismatch CUSOLVER.csrlsqvqr!(A,b,x,tol,'O') 124 | end 125 | end 126 | 127 | end 128 | 129 | end 130 | -------------------------------------------------------------------------------- /src/fft/highlevel.jl: -------------------------------------------------------------------------------- 1 | # region is an iterable subset of dimensions 2 | # spec. an integer, range, tuple, or array 3 | 4 | # inplace complex 5 | function plan_fft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 6 | K = CUFFT_FORWARD 7 | inplace = true 8 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 9 | 10 | pp = _mkplan(xtype, size(X), region) 11 | 12 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 13 | end 14 | 15 | function plan_bfft!(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 16 | K = CUFFT_INVERSE 17 | inplace = true 18 | xtype = (T == cufftComplex) ? 
CUFFT_C2C : CUFFT_Z2Z 19 | 20 | pp = _mkplan(xtype, size(X), region) 21 | 22 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 23 | end 24 | 25 | # out-of-place complex 26 | function plan_fft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 27 | K = CUFFT_FORWARD 28 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 29 | inplace = false 30 | 31 | pp = _mkplan(xtype, size(X), region) 32 | 33 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 34 | end 35 | 36 | function plan_bfft(X::CuArray{T,N}, region) where {T<:cufftComplexes,N} 37 | K = CUFFT_INVERSE 38 | inplace = false 39 | xtype = (T == cufftComplex) ? CUFFT_C2C : CUFFT_Z2Z 40 | 41 | pp = _mkplan(xtype, size(X), region) 42 | 43 | cCuFFTPlan{T,K,inplace,N}(pp, X, size(X), region, xtype) 44 | end 45 | 46 | # out-of-place real-to-complex 47 | function plan_rfft(X::CuArray{T,N}, region) where {T<:cufftReals,N} 48 | K = CUFFT_FORWARD 49 | inplace = false 50 | xtype = (T == cufftReal) ? CUFFT_R2C : CUFFT_D2Z 51 | 52 | pp = _mkplan(xtype, size(X), region) 53 | 54 | ydims = collect(size(X)) 55 | ydims[region[1]] = div(ydims[region[1]],2)+1 56 | 57 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 58 | end 59 | 60 | function plan_brfft(X::CuArray{T,N}, d::Integer, region::Any) where {T<:cufftComplexes,N} 61 | K = CUFFT_INVERSE 62 | inplace = false 63 | xtype = (T == cufftComplex) ? CUFFT_C2R : CUFFT_Z2D 64 | ydims = collect(size(X)) 65 | ydims[region[1]] = d 66 | 67 | pp = _mkplan(xtype, (ydims...,), region) 68 | 69 | rCuFFTPlan{T,K,inplace,N}(pp, X, (ydims...,), region, xtype) 70 | end 71 | 72 | # FIXME: plan_inv methods allocate needlessly (to provide type parameters) 73 | # Perhaps use FakeArray types to avoid this. 74 | 75 | function plan_inv(p::cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}) where {T,N,inplace} 76 | X = CuArray{T}(undef, p.sz) 77 | pp = _mkplan(p.xtype, p.sz, p.region) 78 | ScaledPlan(cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}(pp, X, p.sz, p.region, 79 | p.xtype), 80 | normalization(X, p.region)) 81 | end 82 | 83 | function plan_inv(p::cCuFFTPlan{T,CUFFT_INVERSE,inplace,N}) where {T,N,inplace} 84 | X = CuArray{T}(undef, p.sz) 85 | pp = _mkplan(p.xtype, p.sz, p.region) 86 | ScaledPlan(cCuFFTPlan{T,CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 87 | p.xtype), 88 | normalization(X, p.region)) 89 | end 90 | 91 | function plan_inv(p::rCuFFTPlan{T,CUFFT_INVERSE,inplace,N} 92 | ) where {T<:cufftComplexes,N,inplace} 93 | X = CuArray{real(T)}(undef, p.osz) 94 | Y = CuArray{T}(undef, p.sz) 95 | xtype = p.xtype == CUFFT_C2R ? CUFFT_R2C : CUFFT_D2Z 96 | pp = _mkplan(xtype, p.osz, p.region) 97 | ScaledPlan(rCuFFTPlan{real(T),CUFFT_FORWARD,inplace,N}(pp, X, p.sz, p.region, 98 | xtype), 99 | normalization(X, p.region)) 100 | end 101 | 102 | function plan_inv(p::rCuFFTPlan{T,CUFFT_FORWARD,inplace,N} 103 | ) where {T<:cufftReals,N,inplace} 104 | X = CuArray{complex(T)}(undef, p.osz) 105 | Y = CuArray{T}(undef, p.sz) 106 | xtype = p.xtype == CUFFT_R2C ? 
CUFFT_C2R : CUFFT_Z2D 107 | pp = _mkplan(xtype, p.sz, p.region) 108 | ScaledPlan(rCuFFTPlan{complex(T),CUFFT_INVERSE,inplace,N}(pp, X, p.sz, 109 | p.region, xtype), 110 | normalization(Y, p.region)) 111 | end 112 | 113 | 114 | # The rest of the standard API 115 | 116 | size(p::CuFFTPlan) = p.sz 117 | 118 | function mul!(y::CuArray{Ty}, p::CuFFTPlan{T,K,false}, x::CuArray{T} 119 | ) where {Ty,T,K} 120 | assert_applicable(p,x,y) 121 | unsafe_execute!(p,x,y) 122 | return y 123 | end 124 | 125 | function *(p::cCuFFTPlan{T,K,true,N}, x::CuArray{T,N}) where {T,K,N} 126 | assert_applicable(p,x) 127 | unsafe_execute!(p,x) 128 | x 129 | end 130 | 131 | function *(p::rCuFFTPlan{T,CUFFT_FORWARD,false,N}, x::CuArray{T,N} 132 | ) where {T<:cufftReals,N} 133 | @assert p.xtype ∈ [CUFFT_R2C,CUFFT_D2Z] 134 | y = CuArray{complex(T),N}(undef, p.osz) 135 | mul!(y,p,x) 136 | y 137 | end 138 | 139 | function *(p::rCuFFTPlan{T,CUFFT_INVERSE,false,N}, x::CuArray{T,N} 140 | ) where {T<:cufftComplexes,N} 141 | @assert p.xtype ∈ [CUFFT_C2R,CUFFT_Z2D] 142 | y = CuArray{real(T),N}(undef, p.osz) 143 | mul!(y,p,x) 144 | y 145 | end 146 | 147 | function *(p::cCuFFTPlan{T,K,false,N}, x::CuArray{T,N}) where {T,K,N} 148 | y = CuArray{T,N}(undef, p.osz) 149 | mul!(y,p,x) 150 | y 151 | end -------------------------------------------------------------------------------- /src/dnn/helpers.jl: -------------------------------------------------------------------------------- 1 | # For low level cudnn functions that require a pointer to a number 2 | cptr(x,a::CuArray{Float64})=Float64[x] 3 | cptr(x,a::CuArray{Float32})=Float32[x] 4 | cptr(x,a::CuArray{Float16})=Float32[x] 5 | 6 | # Conversion between Julia and CUDNN datatypes 7 | cudnnDataType(::Type{Float16})=CUDNN_DATA_HALF 8 | cudnnDataType(::Type{Float32})=CUDNN_DATA_FLOAT 9 | cudnnDataType(::Type{Float64})=CUDNN_DATA_DOUBLE 10 | juliaDataType(a)=(a==CUDNN_DATA_HALF ? Float16 : 11 | a==CUDNN_DATA_FLOAT ? Float32 : 12 | a==CUDNN_DATA_DOUBLE ? 
Float64 : error()) 13 | 14 | tuple_strides(A::Tuple) = _strides((1,), A) 15 | _strides(out::Tuple{Int}, A::Tuple{}) = () 16 | _strides(out::NTuple{N,Int}, A::NTuple{N}) where {N} = out 17 | function _strides(out::NTuple{M,Int}, A::Tuple) where M 18 | Base.@_inline_meta 19 | _strides((out..., out[M]*A[M]), A) 20 | end 21 | 22 | # Descriptors 23 | 24 | mutable struct TensorDesc; ptr; end 25 | free(td::TensorDesc) = cudnnDestroyTensorDescriptor(td.ptr) 26 | Base.unsafe_convert(::Type{cudnnTensorDescriptor_t}, td::TensorDesc) = td.ptr 27 | Base.unsafe_convert(::Type{Ptr{Nothing}}, td::TensorDesc) = convert(Ptr{Nothing}, td.ptr) 28 | 29 | function TensorDesc(T::Type, size::NTuple{N,Integer}, strides::NTuple{N,Integer} = tuple_strides(size)) where N 30 | sz = Cint.(size) |> reverse |> collect 31 | st = Cint.(strides) |> reverse |> collect 32 | d = Ref{cudnnTensorDescriptor_t}() 33 | cudnnCreateTensorDescriptor(d) 34 | cudnnSetTensorNdDescriptor(d[], cudnnDataType(T), length(sz), sz, st) 35 | this = TensorDesc(d[]) 36 | finalizer(free, this) 37 | return this 38 | end 39 | 40 | TensorDesc(a::CuArray) = TensorDesc(eltype(a), size(a), strides(a)) 41 | 42 | mutable struct FilterDesc 43 | ptr 44 | end 45 | free(fd::FilterDesc)=cudnnDestroyFilterDescriptor(fd.ptr) 46 | Base.unsafe_convert(::Type{cudnnFilterDescriptor_t}, fd::FilterDesc)=fd.ptr 47 | Base.unsafe_convert(::Type{Ptr{Nothing}}, fd::FilterDesc)=fd.ptr 48 | 49 | function createFilterDesc() 50 | d = Ref{cudnnFilterDescriptor_t}() 51 | @check cudnnCreateFilterDescriptor(d) 52 | return d[] 53 | end 54 | 55 | function FilterDesc(T::Type, size::Tuple; format = CUDNN_TENSOR_NCHW) 56 | # The only difference of a FilterDescriptor is no strides. 57 | sz = Cint.(size) |> reverse |> collect 58 | d = createFilterDesc() 59 | version() >= v"5" ? 60 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), format, length(sz), sz) : 61 | version() >= v"4" ? 62 | cudnnSetFilterNdDescriptor_v4(d, cudnnDataType(T), format, length(sz), sz) : 63 | cudnnSetFilterNdDescriptor(d, cudnnDataType(T), length(sz), sz) 64 | this = FilterDesc(d) 65 | finalizer(free, this) 66 | return this 67 | end 68 | 69 | FilterDesc(a::CuArray; format = CUDNN_TENSOR_NCHW) = FilterDesc(eltype(a), size(a), format = format) 70 | 71 | function Base.size(f::FilterDesc) 72 | typ = Ref{Cuint}() 73 | format = Ref{Cuint}() 74 | ndims = Ref{Cint}() 75 | dims = Vector{Cint}(undef, 8) 76 | cudnnGetFilterNdDescriptor(f, 8, typ, format, ndims, dims) 77 | @assert ndims[] ≤ 8 78 | return (dims[1:ndims[]]...,) |> reverse 79 | end 80 | 81 | mutable struct ConvDesc; ptr; end 82 | free(cd::ConvDesc) = cudnnDestroyConvolutionDescriptor(cd.ptr) 83 | Base.unsafe_convert(::Type{cudnnConvolutionDescriptor_t}, cd::ConvDesc)=cd.ptr 84 | 85 | function cdsize(w, nd) 86 | isa(w, Integer) ? Cint[fill(w,nd)...] : 87 | length(w)!=nd ? error("Dimension mismatch") : 88 | Cint[reverse(w)...] 89 | end 90 | 91 | pdsize(w, nd)=Cint[reverse(psize(w,nd))...] 92 | psize(w, nd)=(isa(w,Integer) ? fill(w,nd) : length(w) != nd ? error("Dimension mismatch") : w) 93 | 94 | function ConvDesc(T, N, padding, stride, dilation, mode) 95 | cd = Ref{cudnnConvolutionDescriptor_t}() 96 | cudnnCreateConvolutionDescriptor(cd) 97 | version() >= v"4" ? cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 98 | version() >= v"3" ? 
cudnnSetConvolutionNdDescriptor_v3(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode,cudnnDataType(T)) : 99 | cudnnSetConvolutionNdDescriptor(cd[],N,cdsize(padding,N),cdsize(stride,N),cdsize(dilation,N),mode) 100 | this = ConvDesc(cd[]) 101 | finalizer(free, this) 102 | return this 103 | end 104 | 105 | mutable struct PoolDesc; ptr; end 106 | free(pd::PoolDesc)=cudnnDestroyPoolingDescriptor(pd.ptr) 107 | Base.unsafe_convert(::Type{cudnnPoolingDescriptor_t}, pd::PoolDesc)=pd.ptr 108 | 109 | function PoolDesc(nd, window, padding, stride, mode, maxpoolingNanOpt=CUDNN_NOT_PROPAGATE_NAN) 110 | pd = Ref{cudnnPoolingDescriptor_t}() 111 | cudnnCreatePoolingDescriptor(pd) 112 | cudnnSetPoolingNdDescriptor(pd[],mode,maxpoolingNanOpt,nd,pdsize(window,nd),pdsize(padding,nd),pdsize(stride,nd)) 113 | this = PoolDesc(pd[]) 114 | finalizer(free, this) 115 | return this 116 | end 117 | 118 | mutable struct ActivationDesc; ptr; end 119 | free(ad::ActivationDesc)=cudnnDestroyActivationDescriptor(ad.ptr) 120 | Base.unsafe_convert(::Type{cudnnActivationDescriptor_t}, ad::ActivationDesc)=ad.ptr 121 | 122 | function ActivationDesc(mode, coeff, reluNanOpt=CUDNN_NOT_PROPAGATE_NAN) 123 | ad = Ref{cudnnActivationDescriptor_t}() 124 | cudnnCreateActivationDescriptor(ad) 125 | cudnnSetActivationDescriptor(ad[],mode,reluNanOpt,coeff) 126 | this = ActivationDesc(ad[]) 127 | finalizer(free, this) 128 | return this 129 | end 130 | -------------------------------------------------------------------------------- /src/solver/libcusolver.jl: -------------------------------------------------------------------------------- 1 | # low-level wrappers of the CUSOLVER library 2 | 3 | #helper functions 4 | function cusolverDnCreate() 5 | handle = Ref{cusolverDnHandle_t}() 6 | @check ccall((:cusolverDnCreate, libcusolver), 7 | cusolverStatus_t, 8 | (Ptr{cusolverDnHandle_t},), 9 | handle) 10 | return handle[] 11 | end 12 | 13 | function cusolverDnDestroy(handle) 14 | @check ccall((:cusolverDnDestroy, libcusolver), 15 | cusolverStatus_t, 16 | (cusolverDnHandle_t,), 17 | handle) 18 | end 19 | 20 | function cusolverDnSetStream(handle, streamId) 21 | @check ccall((:cusolverDnSetStream, libcusolver), 22 | cusolverStatus_t, 23 | (cusolverDnHandle_t, CuStream_t), 24 | handle, streamId) 25 | end 26 | 27 | function cusolverDnGetStream(handle, streamId) 28 | @check ccall((:cusolverDnGetStream, libcusolver), 29 | cusolverStatus_t, 30 | (cusolverDnHandle_t, Ptr{CuStream_t}), 31 | handle, streamId) 32 | end 33 | 34 | function cusolverSpCreate() 35 | handle = Ref{cusolverSpHandle_t}() 36 | @check ccall((:cusolverSpCreate, libcusolver), 37 | cusolverStatus_t, 38 | (Ptr{cusolverSpHandle_t},), 39 | handle) 40 | return handle[] 41 | end 42 | 43 | function cusolverSpDestroy(handle) 44 | @check ccall((:cusolverSpDestroy, libcusolver), 45 | cusolverStatus_t, 46 | (cusolverSpHandle_t,), 47 | handle) 48 | end 49 | 50 | function cusolverSpSetStream(handle, streamId) 51 | @check ccall((:cusolverSpSetStream, libcusolver), 52 | cusolverStatus_t, 53 | (cusolverSpHandle_t, CuStream_t), 54 | handle, streamId) 55 | end 56 | 57 | function cusolverSpGetStream(handle, streamId) 58 | @check ccall((:cusolverSpGetStream, libcusolver), 59 | cusolverStatus_t, 60 | (cusolverSpHandle_t, Ptr{CuStream_t}), 61 | handle, streamId) 62 | end 63 | 64 | function cusolverSpCreateCsrqrInfo(info) 65 | @check ccall((:cusolverSpCreateCsrqrInfo, libcusolver), 66 | cusolverStatus_t, 67 | (Ptr{csrqrInfo_t},), 68 | info) 69 | end 70 | 71 | function 
cusolverSpDestroyCsrqrInfo(info) 72 | @check ccall((:cusolverSpDestroyCsrqrInfo, libcusolver), 73 | cusolverStatus_t, 74 | (csrqrInfo_t,), 75 | info) 76 | end 77 | 78 | function cusolverDnCreateGesvdjInfo(info) 79 | @check ccall((:cusolverDnCreateGesvdjInfo, libcusolver), 80 | cusolverStatus_t, 81 | (Ptr{gesvdjInfo_t},), 82 | info) 83 | end 84 | 85 | function cusolverDnDestroyGesvdjInfo(info) 86 | @check ccall((:cusolverDnDestroyGesvdjInfo, libcusolver), 87 | cusolverStatus_t, 88 | (gesvdjInfo_t,), 89 | info) 90 | end 91 | 92 | function cusolverDnXgesvdjSetTolerance(info, tolerance) 93 | @check ccall((:cusolverDnXgesvdjSetTolerance, libcusolver), 94 | cusolverStatus_t, 95 | (gesvdjInfo_t, Float64), 96 | info, Float64(tolerance)) 97 | end 98 | 99 | function cusolverDnXgesvdjSetMaxSweeps(info, max_sweeps) 100 | @check ccall((:cusolverDnXgesvdjSetMaxSweeps, libcusolver), 101 | cusolverStatus_t, 102 | (gesvdjInfo_t, Cint), 103 | info, Cint(max_sweeps)) 104 | end 105 | 106 | function cusolverDnCreateSyevjInfo(info) 107 | @check ccall((:cusolverDnCreateSyevjInfo, libcusolver), 108 | cusolverStatus_t, 109 | (Ptr{syevjInfo_t},), 110 | info) 111 | end 112 | 113 | function cusolverDnDestroySyevjInfo(info) 114 | @check ccall((:cusolverDnDestroySyevjInfo, libcusolver), 115 | cusolverStatus_t, 116 | (syevjInfo_t,), 117 | info) 118 | end 119 | 120 | function cusolverDnXsyevjSetTolerance(info, tolerance) 121 | @check ccall((:cusolverDnXsyevjSetTolerance, libcusolver), 122 | cusolverStatus_t, 123 | (syevjInfo_t, Float64), 124 | info, Float64(tolerance)) 125 | end 126 | 127 | function cusolverDnXsyevjSetMaxSweeps(info, max_sweeps) 128 | @check ccall((:cusolverDnXsyevjSetMaxSweeps, libcusolver), 129 | cusolverStatus_t, 130 | (syevjInfo_t, Cint), 131 | info, Cint(max_sweeps)) 132 | end 133 | 134 | function cusolverRfCreate(handle) 135 | @check ccall((:cusolverRfCreate, libcusolver), 136 | cusolverStatus_t, 137 | (Ptr{cusolverRfHandle_t},), 138 | handle) 139 | end 140 | 141 | function cusolverRfDestroy(handle) 142 | @check ccall((:cusolverRfDestroy, libcusolver), 143 | cusolverStatus_t, 144 | (cusolverRfHandle_t,), 145 | handle) 146 | end 147 | 148 | function cusolverRfSetStream(handle, streamId) 149 | @check ccall((:cusolverRfSetStream, libcusolver), 150 | cusolverStatus_t, 151 | (cusolverRfHandle_t, CuStream_t), 152 | handle, streamId) 153 | end 154 | 155 | function cusolverRfGetStream(handle, streamId) 156 | @check ccall((:cusolverRfGetStream, libcusolver), 157 | cusolverStatus_t, 158 | (cusolverRfHandle_t, Ptr{CuStream_t}), 159 | handle, streamId) 160 | end 161 | 162 | function cusolverGetProperty(property::CUDAapi.libraryPropertyType) 163 | value_ref = Ref{Cint}() 164 | @check ccall((:cusolverGetProperty, libcusolver), 165 | cusolverStatus_t, 166 | (Cint, Ptr{Cint}), 167 | property, value_ref) 168 | value_ref[] 169 | end 170 | -------------------------------------------------------------------------------- /src/sparse/libcusparse_types.jl: -------------------------------------------------------------------------------- 1 | #enum cusparseStatus_t 2 | #error messages from CUSPARSE 3 | 4 | """ 5 | Status messages from CUSPARSE's C API. 
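A wrapper typically compares a call's return value against `CUSPARSE_STATUS_SUCCESS`; a minimal sketch (this `check_status` helper is illustrative, not an existing function in the package):

    function check_status(status::cusparseStatus_t)
        # hypothetical helper: raise a Julia error for any non-success status code
        status == CUSPARSE_STATUS_SUCCESS || error("CUSPARSE error: status code ", status)
        return status
    end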
6 | """ 7 | const cusparseStatus_t = UInt32 8 | const CUSPARSE_STATUS_SUCCESS = 0 9 | const CUSPARSE_STATUS_NOT_INITIALIZED = 1 10 | const CUSPARSE_STATUS_ALLOC_FAILED = 2 11 | const CUSPARSE_STATUS_INVALID_VALUE = 3 12 | const CUSPARSE_STATUS_ARCH_MISMATCH = 4 13 | const CUSPARSE_STATUS_MAPPING_ERROR = 5 14 | const CUSPARSE_STATUS_EXECUTION_FAILED = 6 15 | const CUSPARSE_STATUS_INTERNAL_ERROR = 7 16 | const CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8 17 | 18 | #enum cusparseAction_t 19 | """ 20 | Perform an operation on indices only (`CUSPARSE_ACTION_SYMBOLIC`) or 21 | on both data and indices (`CUSPARSE_ACTION_NUMERIC`). Used in 22 | conversion routines. 23 | """ 24 | const cusparseAction_t = UInt32 25 | const CUSPARSE_ACTION_SYMBOLIC = 0 26 | const CUSPARSE_ACTION_NUMERIC = 1 27 | 28 | #enum cusparseDirection_t 29 | """ 30 | Parse a dense matrix by rows (`CUSPARSE_DIRECTION_ROW`) or columns 31 | (`CUSPARSE_DIRECTION_COL`) to compute its number of non-zeros. 32 | """ 33 | const cusparseDirection_t = UInt32 34 | const CUSPARSE_DIRECTION_ROW = 0 35 | const CUSPARSE_DIRECTION_COL = 1 36 | 37 | #enum cusparseHybPartition_t 38 | """ 39 | How to partition the HYB matrix in a [`CuSparseMatrixHYB`](@ref). 40 | There are three choices: 41 | * `CUSPARSE_HYB_PARTITION_AUTO` - let CUSPARSE decide internally for best performance. 42 | * `CUSPARSE_HYB_PARTITION_USER` - set the partition manually in the conversion function. 43 | * `CUSPARSE_HYB_PARTITION_MAX` - use the maximum partition, putting the matrix in ELL format. 44 | """ 45 | const cusparseHybPartition_t = UInt32 46 | const CUSPARSE_HYB_PARTITION_AUTO = 0 47 | const CUSPARSE_HYB_PARTITION_USER = 1 48 | const CUSPARSE_HYB_PARTITION_MAX = 2 49 | 50 | #enum cusparseFillMode_t 51 | """ 52 | Determines if a symmetric/Hermitian/triangular matrix has its upper 53 | (`CUSPARSE_FILL_MODE_UPPER`) or lower (`CUSPARSE_FILL_MODE_LOWER`) 54 | triangle filled. 55 | """ 56 | const cusparseFillMode_t = UInt32 57 | const CUSPARSE_FILL_MODE_LOWER = 0 58 | const CUSPARSE_FILL_MODE_UPPER = 1 59 | 60 | #enum cusparseDiagType_t 61 | """ 62 | Determines if the diagonal of a matrix is all ones (`CUSPARSE_DIAG_TYPE_UNIT`) 63 | or not all ones (`CUSPARSE_DIAG_TYPE_NON_UNIT`). 64 | """ 65 | const cusparseDiagType_t = UInt32 66 | const CUSPARSE_DIAG_TYPE_NON_UNIT = 0 67 | const CUSPARSE_DIAG_TYPE_UNIT = 1 68 | 69 | #enum cusparsePointerMode_t 70 | """ 71 | Determines if scalar arguments to a function are present on the host CPU 72 | (`CUSPARSE_POINTER_MODE_HOST`) or on the GPU (`CUSPARSE_POINTER_MODE_DEVICE`). 73 | """ 74 | const cusparsePointerMode_t = UInt32 75 | const CUSPARSE_POINTER_MODE_HOST = 0 76 | const CUSPARSE_POINTER_MODE_DEVICE = 1 77 | 78 | #enum cusparseOperation_t 79 | """ 80 | Determines whether to perform an operation, such as a matrix multiplication 81 | or solve, on the matrix as-is (`CUSPARSE_OPERATION_NON_TRANSPOSE`), on the 82 | matrix's transpose (`CUSPARSE_OPERATION_TRANSPOSE`), or on its conjugate 83 | transpose (`CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE`). 84 | """ 85 | const cusparseOperation_t = UInt32 86 | const CUSPARSE_OPERATION_NON_TRANSPOSE = 0 87 | const CUSPARSE_OPERATION_TRANSPOSE = 1 88 | const CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 89 | 90 | #enum cusparseMatrixType_t 91 | """ 92 | Indicates whether a matrix is a general matrix (`CUSPARSE_MATRIX_TYPE_GENERAL`), 93 | symmetric (`CUSPARSE_MATRIX_TYPE_SYMMETRIC`), Hermitian 94 | (`CUSPARSE_MATRIX_TYPE_HERMITIAN`), or triangular 95 | (`CUSPARSE_MATRIX_TYPE_TRIANGULAR`). 
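For illustration, this flag is combined with the other enums in this file via a [`cusparseMatDescr_t`](@ref); a sketch describing a general, one-indexed matrix (the fill-mode and diag-type values here are chosen only for the example):

    descr = cusparseMatDescr_t(CUSPARSE_MATRIX_TYPE_GENERAL,
                               CUSPARSE_FILL_MODE_LOWER,
                               CUSPARSE_DIAG_TYPE_NON_UNIT,
                               CUSPARSE_INDEX_BASE_ONE)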
Note that for some matrix types 96 | (those in [`CompressedSparse`](@ref)), this can be inferred for some function 97 | calls. 98 | """ 99 | const cusparseMatrixType_t = UInt32 100 | const CUSPARSE_MATRIX_TYPE_GENERAL = 0 101 | const CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1 102 | const CUSPARSE_MATRIX_TYPE_HERMITIAN = 2 103 | const CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 104 | 105 | #enum cusparseSolvePolicy_t 106 | """ 107 | Indicates whether to keep level info in solvers (`CUSPARSE_SOLVE_POLICY_USE_LEVEL`) 108 | or whether to not use it (`CUSPARSE_SOLVE_POLICY_NO_LEVEL`). 109 | """ 110 | const cusparseSolvePolicy_t = UInt32 111 | const CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0 112 | const CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 113 | 114 | #enum cusparseIndexBase_t 115 | """ 116 | Indicates whether a sparse object is zero-indexed (`CUSPARSE_INDEX_BASE_ZERO`) 117 | or one-indexed (`CUSPARSE_INDEX_BASE_ONE`). CUSPARSE.jl supports both. Julia 118 | sparse matrices are one-indexed, but you may wish to pass matrices from other 119 | libraries which use zero-indexing (e.g. C language ODE solvers). 120 | """ 121 | const cusparseIndexBase_t = UInt32 122 | const CUSPARSE_INDEX_BASE_ZERO = 0 123 | const CUSPARSE_INDEX_BASE_ONE = 1 124 | 125 | #struct cusparseMatDescr_t 126 | """ 127 | Describes shape and properties of a CUSPARSE matrix. A convenience wrapper. 128 | 129 | Contains: 130 | * `MatrixType` - a [`cusparseMatrixType_t`](@ref) 131 | * `FillMode` - a [`cusparseFillMode_t`](@ref) 132 | * `DiagType` - a [`cusparseDiagType_t`](@ref) 133 | * `IndexBase` - a [`cusparseIndexBase_t`](@ref) 134 | """ 135 | struct cusparseMatDescr_t 136 | MatrixType::cusparseMatrixType_t 137 | FillMode::cusparseFillMode_t 138 | DiagType::cusparseDiagType_t 139 | IndexBase::cusparseIndexBase_t 140 | function cusparseMatDescr_t(MatrixType,FillMode,DiagType,IndexBase) 141 | new(MatrixType,FillMode,DiagType,IndexBase) 142 | end 143 | end 144 | 145 | """ 146 | An opaque struct containing information about the solution approach 147 | CUSPARSE will take. Generated by [`sv_analysis`](@ref) or 148 | [`sm_analysis`](@ref) and passed to [`sv_solve!`](@ref), [`sm_solve`](@ref), 149 | [`ic0!`](@ref), or [`ilu0!`](@ref). 
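The intended pattern is to run the analysis once and reuse it across repeated solves; schematically (the argument lists below are placeholders, not the actual signatures — see the linked functions):

    info = sv_analysis(...)   # one-time structural analysis of the matrix
    sv_solve!(..., info)      # repeated triangular solves reusing the cached analysis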
150 | """ 151 | const cusparseSolveAnalysisInfo_t = Ptr{Cvoid} 152 | const bsrsm2Info_t = Ptr{Cvoid} 153 | const bsrsv2Info_t = Ptr{Cvoid} 154 | const csrsv2Info_t = Ptr{Cvoid} 155 | const csric02Info_t = Ptr{Cvoid} 156 | const csrilu02Info_t = Ptr{Cvoid} 157 | const bsric02Info_t = Ptr{Cvoid} 158 | const bsrilu02Info_t = Ptr{Cvoid} 159 | 160 | const cusparseContext = Cvoid 161 | const cusparseHandle_t = Ptr{cusparseContext} 162 | 163 | #complex numbers 164 | 165 | const cuComplex = Complex{Float32} 166 | const cuDoubleComplex = Complex{Float64} 167 | 168 | const CusparseFloat = Union{Float64,Float32,ComplexF64,ComplexF32} 169 | const CusparseReal = Union{Float64,Float32} 170 | const CusparseComplex = Union{ComplexF64,ComplexF32} 171 | 172 | const cusparseHybMat_t = Ptr{Cvoid} 173 | -------------------------------------------------------------------------------- /src/rand/libcurand.jl: -------------------------------------------------------------------------------- 1 | function create_generator(typ::Int=CURAND_RNG_PSEUDO_DEFAULT) 2 | ptr = Ref{curandGenerator_t}() 3 | @check ccall((:curandCreateGenerator, libcurand), 4 | curandStatus_t, 5 | (Ptr{curandGenerator_t}, Cint), ptr, typ) 6 | r = RNG(ptr[], typ) 7 | finalizer(destroy_generator, r) 8 | return r 9 | end 10 | 11 | function destroy_generator(rng::RNG) 12 | @check ccall((:curandDestroyGenerator, libcurand), 13 | curandStatus_t, 14 | (curandGenerator_t,), rng) 15 | end 16 | 17 | function get_version() 18 | ver = Ref{Cint}() 19 | @check ccall((:curandGetVersion, libcurand), 20 | curandStatus_t, 21 | (Ref{Cint},), ver) 22 | return ver[] 23 | end 24 | 25 | # TODO: curandSetStream 26 | 27 | function set_pseudo_random_generator_seed(rng::RNG, seed::Int64) 28 | @check ccall((:curandSetPseudoRandomGeneratorSeed, libcurand), 29 | curandStatus_t, 30 | (curandGenerator_t, Clonglong), rng, seed) 31 | end 32 | 33 | function set_generator_offset(rng::RNG, offset::Int64) 34 | @check ccall((:curandSetGeneratorOffset, libcurand), 35 | curandStatus_t, 36 | (curandGenerator_t, Clonglong), rng, offset) 37 | end 38 | 39 | function set_generator_ordering(rng::RNG, order::Int) 40 | @check ccall((:curandSetGeneratorOrdering, libcurand), 41 | curandStatus_t, 42 | (curandGenerator_t, Cint), rng, order) 43 | end 44 | 45 | function set_quasi_random_generator_dimensions(rng::RNG, num_dimensions::UInt) 46 | @check ccall((:curandSetQuasiRandomGeneratorDimensions, libcurand), 47 | curandStatus_t, 48 | (curandGenerator_t, Cuint), 49 | rng, num_dimensions) 50 | end 51 | 52 | 53 | """ 54 | Generate 32-bit random unsigned integers. 55 | """ 56 | function generate(rng::RNG, arr::CuArray, n::UInt) 57 | @check ccall((:curandGenerate, libcurand), 58 | curandStatus_t, 59 | (curandGenerator_t, CuPtr{UInt32}, Csize_t), 60 | rng, arr, n) 61 | return arr 62 | end 63 | 64 | 65 | """ 66 | Generate 64-bit quasirandom unsigned integers. 
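For example (a sketch; it assumes the generator was created with one of the 64-bit quasirandom types listed below):

    rng = create_generator(CURAND_RNG_QUASI_SOBOL64)
    arr = CuArray{Culonglong}(undef, 1024)
    generate_long_long(rng, arr)   # fill `arr` with 64-bit quasirandom values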
67 | 68 | Valid RNG types are: 69 | - CURAND_RNG_QUASI_SOBOL64 70 | - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 71 | """ 72 | function generate_long_long(rng::RNG, arr::CuArray) 73 | @check ccall((:curandGenerateLongLong, libcurand), 74 | curandStatus_t, 75 | (curandGenerator_t, CuPtr{Culonglong}, Csize_t), 76 | rng, arr, length(arr)) 77 | return arr 78 | end 79 | 80 | # uniform 81 | function generate_uniform(rng::RNG, arr::CuArray) 82 | @check ccall((:curandGenerateUniform, libcurand), 83 | curandStatus_t, 84 | (curandGenerator_t, CuPtr{Float32}, Csize_t), 85 | rng, arr, length(arr)) 86 | return arr 87 | end 88 | 89 | function generate_uniform_double(rng::RNG, arr::CuArray) 90 | @check ccall((:curandGenerateUniformDouble, libcurand), 91 | curandStatus_t, 92 | (curandGenerator_t, CuPtr{Float64}, Csize_t), 93 | rng, arr, length(arr)) 94 | return arr 95 | end 96 | 97 | # normal 98 | function generate_normal(rng::RNG, arr::CuArray, mean, stddev) 99 | @check ccall((:curandGenerateNormal, libcurand), 100 | curandStatus_t, 101 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 102 | rng, arr, length(arr), mean, stddev) 103 | return arr 104 | end 105 | 106 | function generate_normal_double(rng::RNG, arr::CuArray, mean, stddev) 107 | @check ccall((:curandGenerateNormalDouble, libcurand), 108 | curandStatus_t, 109 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 110 | rng, arr, length(arr), mean, stddev) 111 | return arr 112 | end 113 | 114 | 115 | # lognormal 116 | function generate_log_normal(rng::RNG, arr::CuArray, mean, stddev) 117 | @check ccall((:curandGenerateLogNormal, libcurand), 118 | curandStatus_t, 119 | (curandGenerator_t, CuPtr{Cfloat}, Csize_t, Cfloat, Cfloat), 120 | rng, arr, length(arr), mean, stddev) 121 | return arr 122 | end 123 | 124 | function generate_log_normal_double(rng::RNG, arr::CuArray, mean, stddev) 125 | @check ccall((:curandGenerateLogNormalDouble, libcurand), 126 | curandStatus_t, 127 | (curandGenerator_t, CuPtr{Cdouble}, Csize_t, Cdouble, Cdouble), 128 | rng, arr, length(arr), mean, stddev) 129 | return arr 130 | end 131 | 132 | # Poisson 133 | """Construct the histogram array for a Poisson distribution.""" 134 | function create_poisson_distribtion(lambda) 135 | ptr = Ref{curandDiscreteDistribution_t}() 136 | @check ccall((:curandCreatePoissonDistribution, libcurand), 137 | curandStatus_t, 138 | (Cdouble, Ptr{curandDiscreteDistribution_t}), 139 | lambda, ptr) 140 | dist = DiscreteDistribution(ptr[]) 141 | finalizer(destroy_distribution, dist) 142 | return dist 143 | end 144 | 145 | """Destroy the histogram array for a discrete distribution (e.g. Poisson).""" 146 | function destroy_distribution(dist::DiscreteDistribution) 147 | @check ccall((:curandDestroyDistribution, libcurand), 148 | curandStatus_t, 149 | (curandDiscreteDistribution_t,), 150 | dist) 151 | end 152 | 153 | """Generate Poisson-distributed unsigned ints.""" 154 | function generate_poisson(rng::RNG, arr::CuArray, lambda) 155 | @check ccall((:curandGeneratePoisson, libcurand), 156 | curandStatus_t, 157 | (curandGenerator_t, CuPtr{Cuint}, Csize_t, Cdouble), 158 | rng, arr, length(arr), lambda) 159 | return arr 160 | end 161 | 162 | # seeds 163 | """Generate the starting state of the generator. 
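For instance, after seeding a pseudorandom generator its state can be initialized eagerly (a small sketch using only the wrappers defined above):

    rng = create_generator(CURAND_RNG_PSEUDO_DEFAULT)
    set_pseudo_random_generator_seed(rng, Int64(42))
    generate_seeds(rng)   # set up the starting state up front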
""" 164 | function generate_seeds(rng::RNG) 165 | @check ccall((:curandGenerateSeeds, libcurand), 166 | curandStatus_t, 167 | (curandGenerator_t,), rng) 168 | end 169 | 170 | # TODO: curandGetDirectionVectors32 171 | # TODO: curandGetScrambleConstants32 172 | # TODO: curandGetDirectionVectors64 173 | # TODO: curandGetScrambleConstants64 174 | 175 | function curandGetProperty(property::CUDAapi.libraryPropertyType) 176 | value_ref = Ref{Cint}() 177 | @check ccall((:curandGetProperty, libcurand), 178 | curandStatus_t, 179 | (Cint, Ptr{Cint}), 180 | property, value_ref) 181 | value_ref[] 182 | end 183 | -------------------------------------------------------------------------------- /test/base.jl: -------------------------------------------------------------------------------- 1 | using ForwardDiff: Dual 2 | using LinearAlgebra 3 | using Adapt: adapt 4 | 5 | import CUDAdrv 6 | import CUDAdrv: CuPtr, CU_NULL 7 | 8 | @testset "GPUArrays test suite" begin 9 | GPUArrays.test(CuArray) 10 | end 11 | 12 | @testset "Memory" begin 13 | CuArrays.alloc(0) 14 | 15 | @test (CuArrays.@allocated CuArray{Int32}(undef,1)) == 4 16 | 17 | ret, out = @grab_output CuArrays.@time CuArray{Int32}(undef, 1) 18 | @test isa(ret, CuArray{Int32}) 19 | @test occursin("1 GPU allocation: 4 bytes", out) 20 | 21 | ret, out = @grab_output CuArrays.@time Base.unsafe_wrap(CuArray, CuPtr{Int32}(12345678), (2, 3)) 22 | @test isa(ret, CuArray{Int32}) 23 | @test !occursin("GPU allocation", out) 24 | end 25 | 26 | @testset "Array" begin 27 | xs = CuArray{Int}(undef, 2, 3) 28 | @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] 29 | @test collect(cu[1, 2, 3]) == [1, 2, 3] 30 | @test collect(cu([1, 2, 3])) == [1, 2, 3] 31 | @test testf(vec, rand(5,3)) 32 | @test cu(1:3) === 1:3 33 | 34 | # Check that allowscalar works 35 | @test_throws ErrorException xs[1] 36 | @test_throws ErrorException xs[1] = 1 37 | 38 | # unsafe_wrap 39 | buf = CUDAdrv.Mem.Buffer(CU_NULL, 2, CUDAdrv.CuCurrentContext()) 40 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; own=false).own == false 41 | @test Base.unsafe_wrap(CuArray, CU_NULL, 1; ctx=CUDAdrv.CuCurrentContext()).buf.ctx == CUDAdrv.CuCurrentContext() 42 | @test Base.unsafe_wrap(CuArray, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 43 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 44 | @test Base.unsafe_wrap(CuArray{Nothing,1}, CU_NULL, 2) == CuArray{Nothing,1}(buf, (2,)) 45 | @test Base.unsafe_wrap(CuArray, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 46 | @test Base.unsafe_wrap(CuArray{Nothing}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 47 | @test Base.unsafe_wrap(CuArray{Nothing,2}, CU_NULL, (1,2)) == CuArray{Nothing,2}(buf, (1,2)) 48 | 49 | @test collect(cuzeros(2, 2)) == zeros(Float32, 2, 2) 50 | @test collect(cuones(2, 2)) == ones(Float32, 2, 2) 51 | 52 | @test collect(cufill(0, 2, 2)) == zeros(Float32, 2, 2) 53 | @test collect(cufill(1, 2, 2)) == ones(Float32, 2, 2) 54 | end 55 | 56 | @testset "Adapt" begin 57 | A = rand(Float32, 3, 3) 58 | dA = CuArray(A) 59 | @test adapt(Array, dA) ≈ A 60 | @test adapt(CuArray, A) ≈ dA 61 | end 62 | 63 | @testset "Broadcast" begin 64 | @test testf((x) -> fill!(x, 1), rand(3,3)) 65 | @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) 66 | @test testf((x) -> sin.(x), rand(2, 3)) 67 | @test testf((x) -> log.(x) .+ 1, rand(2, 3)) 68 | @test testf((x) -> 2x, rand(2, 3)) 69 | @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) 70 | @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) 71 | 
@test (CuArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == CuArray([C_NULL]) 72 | @test CuArray([1,2,3]) .+ CuArray([1.0,2.0,3.0]) == CuArray([2,4,6]) 73 | 74 | @eval struct Whatever{T} 75 | x::Int 76 | end 77 | @test Array(Whatever{Int}.(CuArray([1]))) == Whatever{Int}.([1]) 78 | end 79 | 80 | @testset "Cufunc" begin 81 | gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 82 | sig(x) = one(x) / (one(x) + exp(-x)) 83 | f(x) = gelu(log(x)) * sig(x) * tanh(x) 84 | 85 | CuArrays.@cufunc gelu(x) = oftype(x, 0.5) * x * (1 + tanh(oftype(x, √(2/π))*(x + oftype(x, 0.044715) * x^3))) 86 | CuArrays.@cufunc sig(x) = one(x) / (one(x) + exp(-x)) 87 | CuArrays.@cufunc f(x) = gelu(log(x)) * sig(x) * tanh(x) 88 | 89 | @test :gelu ∈ CuArrays.cufuncs() 90 | @test :sig ∈ CuArrays.cufuncs() 91 | @test :f ∈ CuArrays.cufuncs() 92 | @test testf((x) -> gelu.(x), rand(3,3)) 93 | @test testf((x) -> sig.(x), rand(3,3)) 94 | @test testf((x) -> f.(x), rand(3,3)) 95 | end 96 | 97 | # https://github.com/JuliaGPU/CUDAnative.jl/issues/223 98 | @testset "Ref Broadcast" begin 99 | foobar(idx, A) = A[idx] 100 | @test CuArray([42]) == foobar.(CuArray([1]), Base.RefValue(CuArray([42]))) 101 | end 102 | 103 | @testset "Broadcast Fix" begin 104 | @test testf(x -> log.(x), rand(3,3)) 105 | @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) 106 | 107 | if isdefined(CuArrays, :CUDNN) 108 | using NNlib 109 | 110 | @test testf(x -> logσ.(x), rand(5)) 111 | 112 | f(x) = logσ.(x) 113 | ds = Dual.(rand(5),1) 114 | @test f(ds) ≈ collect(f(CuArray(ds))) 115 | end 116 | end 117 | 118 | @testset "Reduce" begin 119 | @test testf(x -> sum(x, dims=1), rand(2, 3)) 120 | @test testf(x -> sum(x, dims=2), rand(2, 3)) 121 | @test testf(x -> sum(x -> x^2, x, dims=1), rand(2, 3)) 122 | @test testf(x -> prod(x, dims=2), rand(2, 3)) 123 | 124 | @test testf(x -> sum(x), rand(2, 3)) 125 | @test testf(x -> prod(x), rand(2, 3)) 126 | end 127 | 128 | @testset "0D" begin 129 | x = CuArray{Float64}(undef) 130 | x .= 1 131 | @test collect(x)[] == 1 132 | x /= 2 133 | @test collect(x)[] == 0.5 134 | end 135 | 136 | @testset "Slices" begin 137 | @test testf(rand(5)) do x 138 | y = x[2:4] 139 | y .= 1 140 | x 141 | end 142 | @test testf(rand(5)) do x 143 | y = view(x, 2:4) 144 | y .= 1 145 | x 146 | end 147 | @test testf(x->view(x, :, 1:4, 3), rand(Float32, 5, 4, 3)) 148 | @allowscalar let x = cu(rand(Float32, 5, 4, 3)) 149 | @test_throws BoundsError view(x, :, :, 1:10) 150 | 151 | # Contiguous views should return new CuArray 152 | @test typeof(view(x, :, 1, 2)) == CuVector{Float32} 153 | @test typeof(view(x, 1:4, 1, 2)) == CuVector{Float32} 154 | @test typeof(view(x, :, 1:4, 3)) == CuMatrix{Float32} 155 | @test typeof(view(x, :, :, 1)) == CuMatrix{Float32} 156 | @test typeof(view(x, :, :, :)) == CuArray{Float32,3} 157 | @test typeof(view(x, :)) == CuVector{Float32} 158 | @test typeof(view(x, 1:3)) == CuVector{Float32} 159 | 160 | # Non-contiguous views should fall back to base's SubArray 161 | @test typeof(view(x, 1:3, 1:3, 3)) <: SubArray 162 | @test typeof(view(x, 1, :, 3)) <: SubArray 163 | @test typeof(view(x, 1, 1:4, 3)) <: SubArray 164 | @test typeof(view(x, :, 1, 1:3)) <: SubArray 165 | @test typeof(view(x, :, 1:2:4, 1)) <: SubArray 166 | @test typeof(view(x, 1:2:5, 1, 1)) <: SubArray 167 | end 168 | end 169 | 170 | @testset "Reshape" begin 171 | A = [1 2 3 4 172 | 5 6 7 8] 173 | gA = reshape(CuArray(A),1,8) 174 | _A = reshape(A,1,8) 175 | _gA = Array(gA) 176 | @test all(_A .== _gA) 177 | A = [1,2,3,4] 178 | gA = 
reshape(CuArray(A),4) 179 | end 180 | 181 | @testset "$f! with diagonal $d" for (f, f!) in ((triu, triu!), (tril, tril!)), 182 | d in -2:2 183 | A = randn(10, 10) 184 | @test f(A, d) == Array(f!(CuArray(A), d)) 185 | end 186 | 187 | @testset "Utilities" begin 188 | t = @elapsed ret = CuArrays.@sync begin 189 | # TODO: do something that takes a while on the GPU 190 | # (need to wrap clock64 in CUDAnative for that) 191 | 42 192 | end 193 | @test t >= 0 194 | @test ret == 42 195 | end 196 | -------------------------------------------------------------------------------- /test/fft.jl: -------------------------------------------------------------------------------- 1 | @testset "CUFFT" begin 2 | 3 | if !isdefined(CuArrays, :CUFFT) 4 | @warn "Not testing CUFFT" 5 | else 6 | using CuArrays.CUFFT 7 | @info "Testing CUFFT $(CUFFT.version())" 8 | 9 | # notes: 10 | # plan_bfft does not need separate testing since it is used by plan_ifft 11 | 12 | using FFTW 13 | 14 | N1 = 8 15 | N2 = 32 16 | N3 = 64 17 | N4 = 8 18 | 19 | MYRTOL = 1e-5 20 | MYATOL = 1e-8 21 | 22 | # out-of-place 23 | function dotest1(X::AbstractArray{T,N}) where {T <: Complex,N} 24 | fftw_X = fft(X) 25 | d_X = CuArray(X) 26 | p = plan_fft(d_X) 27 | d_Y = p * d_X 28 | Y = collect(d_Y) 29 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 30 | 31 | pinv = plan_ifft(d_Y) 32 | d_Z = pinv * d_Y 33 | Z = collect(d_Z) 34 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 35 | 36 | pinv2 = inv(p) 37 | d_Z = pinv2 * d_Y 38 | Z = collect(d_Z) 39 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 40 | end 41 | 42 | function dotest1(X::AbstractArray{T,N}) where {T <: Real,N} 43 | fftw_X = rfft(X) 44 | d_X = CuArray(X) 45 | p = plan_rfft(d_X) 46 | d_Y = p * d_X 47 | Y = collect(d_Y) 48 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 49 | 50 | pinv = plan_irfft(d_Y,size(X,1)) 51 | d_Z = pinv * d_Y 52 | Z = collect(d_Z) 53 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 54 | 55 | pinv2 = inv(p) 56 | d_Z = pinv2 * d_Y 57 | Z = collect(d_Z) 58 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 59 | 60 | pinv3 = inv(pinv) 61 | d_W = pinv3 * d_X 62 | W = collect(d_W) 63 | @test isapprox(W, Y, rtol = MYRTOL, atol = MYATOL) 64 | end 65 | 66 | # in-place 67 | function dotest2(X::AbstractArray{T,N}) where {T <: Complex,N} 68 | fftw_X = fft(X) 69 | d_X = CuArray(X) 70 | p = plan_fft!(d_X) 71 | p * d_X 72 | Y = collect(d_X) 73 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 74 | 75 | pinv = plan_ifft!(d_X) 76 | pinv * d_X 77 | Z = collect(d_X) 78 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 79 | end 80 | 81 | # no inplace rfft for now 82 | 83 | # batch transforms 84 | function dotest3(X::AbstractArray{T,N},region) where {T <: Complex,N} 85 | fftw_X = fft(X,region) 86 | d_X = CuArray(X) 87 | p = plan_fft(d_X,region) 88 | d_Y = p * d_X 89 | Y = collect(d_Y) 90 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 91 | 92 | pinv = plan_ifft(d_Y,region) 93 | d_Z = pinv * d_Y 94 | Z = collect(d_Z) 95 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 96 | end 97 | 98 | function dotest3(X::AbstractArray{T,N},region) where {T <: Real,N} 99 | fftw_X = rfft(X,region) 100 | d_X = CuArray(X) 101 | p = plan_rfft(d_X,region) 102 | d_Y = p * d_X 103 | Y = collect(d_Y) 104 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 105 | 106 | pinv = plan_irfft(d_Y,size(X,region[1]),region) 107 | d_Z = pinv * d_Y 108 | Z = collect(d_Z) 109 | @test isapprox(Z, X, rtol = MYRTOL, atol = MYATOL) 110 | end 111 | 112 | 113 | 
@testset "FFT" for (rtype,ctype) in [(Float32,ComplexF32), (Float64,ComplexF64)] 114 | 115 | @testset "1D FFT" begin 116 | dims = (N1,) 117 | X = rand(ctype, dims) 118 | dotest1(X) 119 | end 120 | @testset "1D inplace FFT" begin 121 | dims = (N1,) 122 | X = rand(ctype, dims) 123 | dotest2(X) 124 | end 125 | 126 | @testset "2D FFT" begin 127 | dims = (N1,N2) 128 | X = rand(ctype, dims) 129 | dotest1(X) 130 | end 131 | @testset "2D inplace FFT" begin 132 | dims = (N1,N2) 133 | X = rand(ctype, dims) 134 | dotest2(X) 135 | end 136 | 137 | @testset "Batch 1D FFT" begin 138 | dims = (N1,N2) 139 | X = rand(ctype, dims) 140 | dotest3(X,1) 141 | 142 | dims = (N1,N2) 143 | X = rand(ctype, dims) 144 | dotest3(X,2) 145 | 146 | dims = (N1,N2) 147 | X = rand(ctype, dims) 148 | dotest3(X,(1,2)) 149 | end 150 | 151 | @testset "3D FFT" begin 152 | dims = (N1,N2,N3) 153 | X = rand(ctype, dims) 154 | dotest1(X) 155 | end 156 | @testset "3D inplace FFT" begin 157 | dims = (N1,N2,N3) 158 | X = rand(ctype, dims) 159 | dotest2(X) 160 | end 161 | 162 | @testset "Batch 2D FFT (in 3D)" begin 163 | dims = (N1,N2,N3) 164 | for region in [(1,2),(2,3),(1,3)] 165 | X = rand(ctype, dims) 166 | dotest3(X,region) 167 | end 168 | 169 | X = rand(ctype, dims) 170 | @test_throws ArgumentError dotest3(X,(3,1)) 171 | end 172 | 173 | @testset "Batch 2D FFT (in 4D)" begin 174 | dims = (N1,N2,N3,N4) 175 | for region in [(1,2),(1,4),(3,4)] 176 | X = rand(ctype, dims) 177 | dotest3(X,region) 178 | end 179 | for region in [(1,3),(2,3),(2,4)] 180 | X = rand(ctype, dims) 181 | @test_throws ArgumentError dotest3(X,region) 182 | end 183 | 184 | end 185 | 186 | @testset "1D real FFT" begin 187 | X = rand(rtype, N1) 188 | dotest1(X) 189 | end 190 | 191 | @testset "Batch 1D real FFT" begin 192 | dims = (N1,N2) 193 | X = rand(rtype, dims) 194 | dotest3(X,1) 195 | 196 | dims = (N1,N2) 197 | X = rand(rtype, dims) 198 | dotest3(X,2) 199 | 200 | dims = (N1,N2) 201 | X = rand(rtype, dims) 202 | dotest3(X,(1,2)) 203 | end 204 | 205 | @testset "2D real FFT" begin 206 | X = rand(rtype, N1,N2) 207 | dotest1(X) 208 | end 209 | 210 | @testset "Batch 2D real FFT (in 3D)" begin 211 | dims = (N1,N2,N3) 212 | for region in [(1,2),(2,3),(1,3)] 213 | X = rand(rtype, dims) 214 | dotest3(X,region) 215 | end 216 | 217 | X = rand(rtype, dims) 218 | @test_throws ArgumentError dotest3(X,(3,1)) 219 | end 220 | 221 | @testset "Batch 2D real FFT (in 4D)" begin 222 | dims = (N1,N2,N3,N4) 223 | for region in [(1,2),(1,4),(3,4)] 224 | X = rand(rtype, dims) 225 | dotest3(X,region) 226 | end 227 | for region in [(1,3),(2,3),(2,4)] 228 | X = rand(rtype, dims) 229 | @test_throws ArgumentError dotest3(X,region) 230 | end 231 | end 232 | 233 | @testset "3D real FFT" begin 234 | X = rand(rtype, N1, N2, N3) 235 | dotest1(X) 236 | end 237 | 238 | end # testset FFT 239 | 240 | # integer array arguments 241 | function dotest5(X::AbstractArray{T,N}) where {T <: Complex,N} 242 | fftw_X = fft(X) 243 | d_X = CuArray(X) 244 | p = plan_fft(d_X) 245 | d_Y = p * d_X 246 | Y = collect(d_Y) 247 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 248 | d_Y = fft(d_X) 249 | Y = collect(d_Y) 250 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 251 | end 252 | 253 | function dotest5(X::AbstractArray{T,N}) where {T <: Real,N} 254 | fftw_X = rfft(X) 255 | d_X = CuArray(X) 256 | p = plan_rfft(d_X) 257 | d_Y = p * d_X 258 | Y = collect(d_Y) 259 | @test isapprox(Y, fftw_X, rtol = MYRTOL, atol = MYATOL) 260 | d_Y = rfft(d_X) 261 | Y = collect(d_Y) 262 | @test isapprox(Y, fftw_X, rtol = 
MYRTOL, atol = MYATOL) 263 | end 264 | 265 | @testset "Int FFT" for (rtype,ctype) in [(Int32,Complex{Int32}), (Int64,Complex{Int64})] 266 | 267 | @testset "1D FFT" begin 268 | dims = (N1,) 269 | X = rand(ctype, dims) 270 | dotest5(X) 271 | end 272 | 273 | @testset "1D real FFT" begin 274 | X = rand(rtype, N1) 275 | dotest5(X) 276 | end 277 | 278 | 279 | end # testset int FFT 280 | 281 | end 282 | 283 | end 284 | -------------------------------------------------------------------------------- /src/dnn/libcudnn_types.jl: -------------------------------------------------------------------------------- 1 | const CUDNN_DIM_MAX = 8 2 | const CUDNN_LRN_MIN_N = 1 3 | const CUDNN_LRN_MAX_N = 16 4 | const CUDNN_LRN_MIN_K = 1.0e-5 5 | const CUDNN_LRN_MIN_BETA = 0.01 6 | const CUDNN_BN_MIN_EPSILON = 1.0e-5 7 | 8 | mutable struct cudnnContext 9 | end 10 | 11 | const cudnnHandle_t = Ptr{cudnnContext} 12 | 13 | # begin enum cudnnStatus_t 14 | const cudnnStatus_t = UInt32 15 | const CUDNN_STATUS_SUCCESS = (UInt32)(0) 16 | const CUDNN_STATUS_NOT_INITIALIZED = (UInt32)(1) 17 | const CUDNN_STATUS_ALLOC_FAILED = (UInt32)(2) 18 | const CUDNN_STATUS_BAD_PARAM = (UInt32)(3) 19 | const CUDNN_STATUS_INTERNAL_ERROR = (UInt32)(4) 20 | const CUDNN_STATUS_INVALID_VALUE = (UInt32)(5) 21 | const CUDNN_STATUS_ARCH_MISMATCH = (UInt32)(6) 22 | const CUDNN_STATUS_MAPPING_ERROR = (UInt32)(7) 23 | const CUDNN_STATUS_EXECUTION_FAILED = (UInt32)(8) 24 | const CUDNN_STATUS_NOT_SUPPORTED = (UInt32)(9) 25 | const CUDNN_STATUS_LICENSE_ERROR = (UInt32)(10) 26 | # end enum cudnnStatus_t 27 | 28 | mutable struct cudnnTensorStruct 29 | end 30 | 31 | const cudnnTensorDescriptor_t = Ptr{cudnnTensorStruct} 32 | 33 | mutable struct cudnnConvolutionStruct 34 | end 35 | 36 | const cudnnConvolutionDescriptor_t = Ptr{cudnnConvolutionStruct} 37 | 38 | mutable struct cudnnPoolingStruct 39 | end 40 | 41 | const cudnnPoolingDescriptor_t = Ptr{cudnnPoolingStruct} 42 | 43 | mutable struct cudnnFilterStruct 44 | end 45 | 46 | const cudnnFilterDescriptor_t = Ptr{cudnnFilterStruct} 47 | 48 | mutable struct cudnnLRNStruct 49 | end 50 | 51 | const cudnnLRNDescriptor_t = Ptr{cudnnLRNStruct} 52 | 53 | mutable struct cudnnActivationStruct 54 | end 55 | 56 | const cudnnActivationDescriptor_t = Ptr{cudnnActivationStruct} 57 | 58 | # begin enum cudnnDataType_t 59 | const cudnnDataType_t = UInt32 60 | const CUDNN_DATA_FLOAT = (UInt32)(0) 61 | const CUDNN_DATA_DOUBLE = (UInt32)(1) 62 | const CUDNN_DATA_HALF = (UInt32)(2) 63 | # end enum cudnnDataType_t 64 | 65 | # begin enum cudnnNanPropagation_t 66 | const cudnnNanPropagation_t = UInt32 67 | const CUDNN_NOT_PROPAGATE_NAN = (UInt32)(0) 68 | const CUDNN_PROPAGATE_NAN = (UInt32)(1) 69 | # end enum cudnnNanPropagation_t 70 | 71 | # begin enum cudnnTensorFormat_t 72 | const cudnnTensorFormat_t = UInt32 73 | const CUDNN_TENSOR_NCHW = (UInt32)(0) 74 | const CUDNN_TENSOR_NHWC = (UInt32)(1) 75 | # end enum cudnnTensorFormat_t 76 | 77 | # begin enum cudnnAddMode_t 78 | const cudnnAddMode_t = UInt32 79 | const CUDNN_ADD_IMAGE = (UInt32)(0) 80 | const CUDNN_ADD_SAME_HW = (UInt32)(0) 81 | const CUDNN_ADD_FEATURE_MAP = (UInt32)(1) 82 | const CUDNN_ADD_SAME_CHW = (UInt32)(1) 83 | const CUDNN_ADD_SAME_C = (UInt32)(2) 84 | const CUDNN_ADD_FULL_TENSOR = (UInt32)(3) 85 | # end enum cudnnAddMode_t 86 | 87 | # begin enum cudnnConvolutionMode_t 88 | const cudnnConvolutionMode_t = UInt32 89 | const CUDNN_CONVOLUTION = (UInt32)(0) 90 | const CUDNN_CROSS_CORRELATION = (UInt32)(1) 91 | # end enum cudnnConvolutionMode_t 92 | 93 | # begin 
enum cudnnConvolutionFwdPreference_t 94 | const cudnnConvolutionFwdPreference_t = UInt32 95 | const CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = (UInt32)(0) 96 | const CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = (UInt32)(1) 97 | const CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 98 | # end enum cudnnConvolutionFwdPreference_t 99 | 100 | # begin enum cudnnConvolutionFwdAlgo_t 101 | const cudnnConvolutionFwdAlgo_t = UInt32 102 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = (UInt32)(0) 103 | const CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = (UInt32)(1) 104 | const CUDNN_CONVOLUTION_FWD_ALGO_GEMM = (UInt32)(2) 105 | const CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = (UInt32)(3) 106 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT = (UInt32)(4) 107 | const CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = (UInt32)(5) 108 | # end enum cudnnConvolutionFwdAlgo_t 109 | 110 | mutable struct cudnnConvolutionFwdAlgoPerf_t 111 | algo::cudnnConvolutionFwdAlgo_t 112 | status::cudnnStatus_t 113 | time::Cfloat 114 | memory::Cint 115 | end 116 | 117 | # begin enum cudnnConvolutionBwdFilterPreference_t 118 | const cudnnConvolutionBwdFilterPreference_t = UInt32 119 | const CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = (UInt32)(0) 120 | const CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = (UInt32)(1) 121 | const CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 122 | # end enum cudnnConvolutionBwdFilterPreference_t 123 | 124 | # begin enum cudnnConvolutionBwdFilterAlgo_t 125 | const cudnnConvolutionBwdFilterAlgo_t = UInt32 126 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = (UInt32)(0) 127 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = (UInt32)(1) 128 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = (UInt32)(2) 129 | const CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = (UInt32)(3) 130 | # end enum cudnnConvolutionBwdFilterAlgo_t 131 | 132 | mutable struct cudnnConvolutionBwdFilterAlgoPerf_t 133 | algo::cudnnConvolutionBwdFilterAlgo_t 134 | status::cudnnStatus_t 135 | time::Cfloat 136 | memory::Cint 137 | end 138 | 139 | # begin enum cudnnConvolutionBwdDataPreference_t 140 | const cudnnConvolutionBwdDataPreference_t = UInt32 141 | const CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = (UInt32)(0) 142 | const CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = (UInt32)(1) 143 | const CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = (UInt32)(2) 144 | # end enum cudnnConvolutionBwdDataPreference_t 145 | 146 | # begin enum cudnnConvolutionBwdDataAlgo_t 147 | const cudnnConvolutionBwdDataAlgo_t = UInt32 148 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = (UInt32)(0) 149 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = (UInt32)(1) 150 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = (UInt32)(2) 151 | const CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = (UInt32)(3) 152 | # end enum cudnnConvolutionBwdDataAlgo_t 153 | 154 | mutable struct cudnnConvolutionBwdDataAlgoPerf_t 155 | algo::cudnnConvolutionBwdDataAlgo_t 156 | status::cudnnStatus_t 157 | time::Cfloat 158 | memory::Cint 159 | end 160 | 161 | # begin enum cudnnSoftmaxAlgorithm_t 162 | const cudnnSoftmaxAlgorithm_t = UInt32 163 | const CUDNN_SOFTMAX_FAST = (UInt32)(0) 164 | const CUDNN_SOFTMAX_ACCURATE = (UInt32)(1) 165 | const CUDNN_SOFTMAX_LOG = (UInt32)(2) 166 | # end enum cudnnSoftmaxAlgorithm_t 167 | 168 | # begin enum cudnnSoftmaxMode_t 169 | const cudnnSoftmaxMode_t = UInt32 170 | const CUDNN_SOFTMAX_MODE_INSTANCE = (UInt32)(0) 171 | const CUDNN_SOFTMAX_MODE_CHANNEL = (UInt32)(1) 172 | # end enum cudnnSoftmaxMode_t 173 | 174 | # begin enum cudnnPoolingMode_t 175 | const cudnnPoolingMode_t = 
UInt32 176 | const CUDNN_POOLING_MAX = (UInt32)(0) 177 | const CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = (UInt32)(1) 178 | const CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = (UInt32)(2) 179 | # end enum cudnnPoolingMode_t 180 | 181 | # begin enum cudnnActivationMode_t 182 | const cudnnActivationMode_t = UInt32 183 | const CUDNN_ACTIVATION_SIGMOID = (UInt32)(0) 184 | const CUDNN_ACTIVATION_RELU = (UInt32)(1) 185 | const CUDNN_ACTIVATION_TANH = (UInt32)(2) 186 | const CUDNN_ACTIVATION_CLIPPED_RELU = (UInt32)(3) 187 | const CUDNN_ACTIVATION_ELU = (UInt32)(4) 188 | const CUDNN_ACTIVATION_IDENTITY = (UInt32)(5) 189 | # end enum cudnnActivationMode_t 190 | 191 | # begin enum cudnnLRNMode_t 192 | const cudnnLRNMode_t = UInt32 193 | const CUDNN_LRN_CROSS_CHANNEL_DIM1 = (UInt32)(0) 194 | # end enum cudnnLRNMode_t 195 | 196 | # begin enum cudnnDivNormMode_t 197 | const cudnnDivNormMode_t = UInt32 198 | const CUDNN_DIVNORM_PRECOMPUTED_MEANS = (UInt32)(0) 199 | # end enum cudnnDivNormMode_t 200 | 201 | # begin enum cudnnBatchNormMode_t 202 | const cudnnBatchNormMode_t = UInt32 203 | const CUDNN_BATCHNORM_PER_ACTIVATION = (UInt32)(0) 204 | const CUDNN_BATCHNORM_SPATIAL = (UInt32)(1) 205 | # end enum cudnnBatchNormMode_t 206 | 207 | # begin enum cudnnMathType_t 208 | const cudnnMathType_t = UInt32 209 | const CUDNN_DEFAULT_MATH = 0 210 | const CUDNN_TENSOR_OP_MATH = 1 211 | const CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2 212 | # end enum cudnnMathType_t 213 | -------------------------------------------------------------------------------- /src/fft/wrappers.jl: -------------------------------------------------------------------------------- 1 | # wrappers of the low-level CUFFT functionality 2 | 3 | # Note: we don't implement padded storage dimensions 4 | function _mkplan(xtype, xdims, region) 5 | nrank = length(region) 6 | sz = [xdims[i] for i in region] 7 | csz = copy(sz) 8 | csz[1] = div(sz[1],2) + 1 9 | batch = prod(xdims) ÷ prod(sz) 10 | 11 | pp = Ref{cufftHandle_t}() 12 | if (nrank == 1) && (batch == 1) 13 | cufftPlan1d(pp, sz[1], xtype, 1) 14 | elseif (nrank == 2) && (batch == 1) 15 | cufftPlan2d(pp, sz[2], sz[1], xtype) 16 | elseif (nrank == 3) && (batch == 1) 17 | cufftPlan3d(pp, sz[3], sz[2], sz[1], xtype) 18 | else 19 | rsz = (length(sz) > 1) ? reverse(sz) : sz 20 | if ((region...,) == ((1:nrank)...,)) 21 | # handle simple case ... simply! (for robustness) 22 | cufftPlanMany(pp, nrank, Cint[rsz...], C_NULL, 1, 1, C_NULL, 1, 1, 23 | xtype, batch) 24 | else 25 | if nrank==1 || all(diff(collect(region)) .== 1) 26 | # _stride: successive elements in innermost dimension 27 | # _dist: distance between first elements of batches 28 | if region[1] == 1 29 | istride = 1 30 | idist = prod(sz) 31 | cdist = prod(csz) 32 | else 33 | if region[end] != length(xdims) 34 | throw(ArgumentError("batching dims must be sequential")) 35 | end 36 | istride = prod(xdims[1:region[1]-1]) 37 | idist = 1 38 | cdist = 1 39 | end 40 | inembed = Cint[rsz...] 41 | cnembed = (length(csz) > 1) ? Cint[reverse(csz)...] 
: Cint[csz[1]] 42 | ostride = istride 43 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 44 | odist = cdist 45 | onembed = cnembed 46 | else 47 | odist = idist 48 | onembed = inembed 49 | end 50 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 51 | idist = cdist 52 | inembed = cnembed 53 | end 54 | else 55 | if any(diff(collect(region)) .< 1) 56 | throw(ArgumentError("region must be an increasing sequence")) 57 | end 58 | cdims = collect(xdims) 59 | cdims[region[1]] = div(cdims[region[1]],2)+1 60 | 61 | if region[1] == 1 62 | istride = 1 63 | ii=1 64 | while (ii < nrank) && (region[ii] == region[ii+1]-1) 65 | ii += 1 66 | end 67 | idist = prod(xdims[1:ii]) 68 | cdist = prod(cdims[1:ii]) 69 | ngaps = 0 70 | else 71 | istride = prod(xdims[1:region[1]-1]) 72 | idist = 1 73 | cdist = 1 74 | ngaps = 1 75 | end 76 | nem = ones(Int,nrank) 77 | cem = ones(Int,nrank) 78 | id = 1 79 | for ii=1:nrank-1 80 | if region[ii+1] > region[ii]+1 81 | ngaps += 1 82 | end 83 | while id < region[ii+1] 84 | nem[ii] *= xdims[id] 85 | cem[ii] *= cdims[id] 86 | id += 1 87 | end 88 | @assert nem[ii] >= sz[ii] 89 | end 90 | if region[end] < length(xdims) 91 | ngaps += 1 92 | end 93 | # CUFFT represents batches by a single stride (_dist) 94 | # so we must verify that region is consistent with this: 95 | if ngaps > 1 96 | throw(ArgumentError("batch regions must be sequential")) 97 | end 98 | 99 | inembed = Cint[reverse(nem)...] 100 | cnembed = Cint[reverse(cem)...] 101 | ostride = istride 102 | if xtype == CUFFT_R2C || xtype == CUFFT_D2Z 103 | odist = cdist 104 | onembed = cnembed 105 | else 106 | odist = idist 107 | onembed = inembed 108 | end 109 | if xtype == CUFFT_C2R || xtype == CUFFT_Z2D 110 | idist = cdist 111 | inembed = cnembed 112 | end 113 | end 114 | cufftPlanMany(pp, nrank, Cint[rsz...], 115 | inembed, istride, idist, onembed, ostride, odist, 116 | xtype, batch) 117 | end 118 | end 119 | pp[] 120 | end 121 | 122 | # this is used implicitly in the unsafe_execute methods below: 123 | unsafe_convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 124 | 125 | convert(::Type{cufftHandle_t}, p::CuFFTPlan) = p.plan 126 | 127 | destroy_plan(plan::CuFFTPlan) = cufftDestroy(plan) 128 | 129 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}) where {T,K} 130 | (size(X) == p.sz) || 131 | throw(ArgumentError("CuFFT plan applied to wrong-size input")) 132 | end 133 | 134 | function assert_applicable(p::CuFFTPlan{T,K}, X::CuArray{T}, Y::CuArray{Ty}) where {T,K,Ty} 135 | assert_applicable(p, X) 136 | (size(Y) == p.osz) || 137 | throw(ArgumentError("CuFFT plan applied to wrong-size output")) 138 | # type errors should be impossible by dispatch, but just in case: 139 | if p.xtype ∈ [CUFFT_C2R, CUFFT_Z2D] 140 | (Ty == real(T)) || 141 | throw(ArgumentError("Type mismatch for argument Y")) 142 | elseif p.xtype ∈ [CUFFT_R2C, CUFFT_D2Z] 143 | (Ty == complex(T)) || 144 | throw(ArgumentError("Type mismatch for argument Y")) 145 | else 146 | (Ty == T) || 147 | throw(ArgumentError("Type mismatch for argument Y")) 148 | end 149 | end 150 | 151 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,true,N}, 152 | x::CuArray{cufftComplex,N}) where {K,N} 153 | @assert plan.xtype == CUFFT_C2C 154 | cufftExecC2C(plan, x, x, K) 155 | end 156 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,true,N}, 157 | x::CuArray{cufftComplex,N}) where {K,N} 158 | @assert plan.xtype == CUFFT_C2R 159 | cufftExecC2R(plan, x, x) 160 | end 161 | 162 | function unsafe_execute!(plan::cCuFFTPlan{cufftComplex,K,false,N}, 163 | 
x::CuArray{cufftComplex,N}, y::CuArray{cufftComplex} 164 | ) where {K,N} 165 | @assert plan.xtype == CUFFT_C2C 166 | cufftExecC2C(plan, x, y, K) 167 | end 168 | function unsafe_execute!(plan::rCuFFTPlan{cufftComplex,K,false,N}, 169 | x::CuArray{cufftComplex,N}, y::CuArray{cufftReal} 170 | ) where {K,N} 171 | @assert plan.xtype == CUFFT_C2R 172 | cufftExecC2R(plan, x, y) 173 | end 174 | 175 | function unsafe_execute!(plan::rCuFFTPlan{cufftReal,K,false,N}, 176 | x::CuArray{cufftReal,N}, y::CuArray{cufftComplex,N} 177 | ) where {K,N} 178 | @assert plan.xtype == CUFFT_R2C 179 | cufftExecR2C(plan, x, y) 180 | end 181 | 182 | # double prec. 183 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,true,N}, 184 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 185 | @assert plan.xtype == CUFFT_Z2Z 186 | cufftExecZ2Z(plan, x, x, K) 187 | end 188 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,true,N}, 189 | x::CuArray{cufftDoubleComplex,N}) where {K,N} 190 | @assert plan.xtype == CUFFT_Z2D 191 | cufftExecZ2D(plan, x, x) 192 | end 193 | 194 | function unsafe_execute!(plan::cCuFFTPlan{cufftDoubleComplex,K,false,N}, 195 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleComplex} 196 | ) where {K,N} 197 | @assert plan.xtype == CUFFT_Z2Z 198 | cufftExecZ2Z(plan, x, y, K) 199 | end 200 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleComplex,K,false,N}, 201 | x::CuArray{cufftDoubleComplex,N}, y::CuArray{cufftDoubleReal} 202 | ) where {K,N} 203 | @assert plan.xtype == CUFFT_Z2D 204 | cufftExecZ2D(plan, x, y) 205 | end 206 | 207 | function unsafe_execute!(plan::rCuFFTPlan{cufftDoubleReal,K,false,N}, 208 | x::CuArray{cufftDoubleReal,N}, y::CuArray{cufftDoubleComplex,N} 209 | ) where {K,N} 210 | @assert plan.xtype == CUFFT_D2Z 211 | cufftExecD2Z(plan, x, y) 212 | end 213 | -------------------------------------------------------------------------------- /src/array.jl: -------------------------------------------------------------------------------- 1 | import CUDAnative: DevicePtr 2 | 3 | mutable struct CuArray{T,N} <: GPUArray{T,N} 4 | buf::Mem.Buffer 5 | own::Bool 6 | 7 | dims::Dims{N} 8 | offset::Int 9 | 10 | function CuArray{T,N}(buf::Mem.Buffer, dims::Dims{N}; offset::Integer=0, own::Bool=true) where {T,N} 11 | xs = new{T,N}(buf, own, dims, offset) 12 | if own 13 | Mem.retain(buf) 14 | finalizer(unsafe_free!, xs) 15 | end 16 | return xs 17 | end 18 | end 19 | 20 | CuVector{T} = CuArray{T,1} 21 | CuMatrix{T} = CuArray{T,2} 22 | CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} 23 | 24 | const INVALID = Mem.alloc(0) 25 | 26 | function unsafe_free!(xs::CuArray{<:Any,N}) where {N} 27 | xs.buf === INVALID && return 28 | Mem.release(xs.buf) && dealloc(xs.buf, prod(xs.dims)*sizeof(eltype(xs))) 29 | xs.dims = Tuple(0 for _ in 1:N) 30 | xs.buf = INVALID 31 | return 32 | end 33 | 34 | 35 | ## construction 36 | 37 | # type and dimensionality specified, accepting dims as tuples of Ints 38 | CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = 39 | CuArray{T,N}(alloc(prod(dims)*sizeof(T)), dims) 40 | 41 | # type and dimensionality specified, accepting dims as series of Ints 42 | CuArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = CuArray{T,N}(undef, dims) 43 | 44 | # type but not dimensionality specified 45 | CuArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 46 | CuArray{T}(::UndefInitializer, dims::Integer...) 
where {T} = 47 | CuArray{T}(undef, convert(Tuple{Vararg{Int}}, dims)) 48 | 49 | # empty vector constructor 50 | CuArray{T,1}() where {T} = CuArray{T,1}(undef, 0) 51 | 52 | # do-block constructors 53 | for (ctor, tvars) in (:CuArray => (), :(CuArray{T}) => (:T,), :(CuArray{T,N}) => (:T, :N)) 54 | @eval begin 55 | function $ctor(f::Function, args...) where {$(tvars...)} 56 | xs = $ctor(args...) 57 | try 58 | f(xs) 59 | finally 60 | unsafe_free!(xs) 61 | end 62 | end 63 | end 64 | end 65 | 66 | 67 | Base.similar(a::CuArray{T,N}) where {T,N} = CuArray{T,N}(undef, size(a)) 68 | Base.similar(a::CuArray{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 69 | Base.similar(a::CuArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = CuArray{T,N}(undef, dims) 70 | 71 | 72 | """ 73 | unsafe_wrap(::CuArray, ptr::CuPtr{T}, dims; own=false, ctx=CuCurrentContext()) 74 | 75 | Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer 76 | element type `T` determines the array element type. `dims` is either an integer (for a 1d 77 | array) or a tuple of the array dimensions. `own` optionally specifies whether Julia should 78 | take ownership of the memory, calling `free` when the array is no longer referenced. The 79 | `ctx` argument determines the CUDA context in which the data is allocated. 80 | """ 81 | function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, 82 | p::CuPtr{T}, dims::NTuple{N,Int}; 83 | own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T,N} 84 | buf = Mem.Buffer(convert(CuPtr{Cvoid}, p), prod(dims) * sizeof(T), ctx) 85 | return CuArray{T, length(dims)}(buf, dims; own=own) 86 | end 87 | function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, 88 | p::CuPtr{T}, dim::Integer; 89 | own::Bool=false, ctx::CuContext=CuCurrentContext()) where {T} 90 | unsafe_wrap(Atype, p, (dim,); own=own, ctx=ctx) 91 | end 92 | Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...) where {N} = 93 | throw(ArgumentError("cannot wrap a CPU pointer with a $T")) 94 | 95 | 96 | ## array interface 97 | 98 | Base.elsize(::Type{<:CuArray{T}}) where {T} = sizeof(T) 99 | 100 | Base.size(x::CuArray) = x.dims 101 | Base.sizeof(x::CuArray) = Base.elsize(x) * length(x) 102 | 103 | 104 | ## interop with other arrays 105 | 106 | CuArray{T,N}(xs::AbstractArray{T,N}) where {T,N} = 107 | isbits(xs) ?
108 | (CuArray{T,N}(undef, size(xs)) .= xs) : 109 | copyto!(CuArray{T,N}(undef, size(xs)), collect(xs)) 110 | 111 | CuArray{T,N}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}((x -> T(x)).(xs)) 112 | 113 | # underspecified constructors 114 | CuArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = CuArray{T,N}(xs) 115 | (::Type{CuArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = CuArray{S,N}(x) 116 | CuArray(A::AbstractArray{T,N}) where {T,N} = CuArray{T,N}(A) 117 | 118 | # idempotency 119 | CuArray{T,N}(xs::CuArray{T,N}) where {T,N} = xs 120 | 121 | 122 | ## conversions 123 | 124 | Base.convert(::Type{T}, x::T) where T <: CuArray = x 125 | 126 | function Base._reshape(parent::CuArray, dims::Dims) 127 | n = length(parent) 128 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 129 | return CuArray{eltype(parent),length(dims)}(parent.buf, dims; 130 | offset=parent.offset, own=parent.own) 131 | end 132 | function Base._reshape(parent::CuArray{T,1}, dims::Tuple{Int}) where T 133 | n = length(parent) 134 | prod(dims) == n || throw(DimensionMismatch("parent has $n elements, which is incompatible with size $dims")) 135 | return parent 136 | end 137 | 138 | 139 | ## interop with C libraries 140 | 141 | """ 142 | buffer(array::CuArray [, index]) 143 | 144 | Get the native address of a CuArray, optionally at a given location `index`. 145 | Equivalent of `Base.pointer` on `Array`s. 146 | """ 147 | function buffer(xs::CuArray, index=1) 148 | extra_offset = (index-1) * Base.elsize(xs) 149 | Mem.Buffer(xs.buf.ptr + xs.offset + extra_offset, 150 | sizeof(xs) - extra_offset, 151 | xs.buf.ctx) 152 | end 153 | 154 | Base.cconvert(::Type{<:Ptr}, x::CuArray) = throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) 155 | Base.cconvert(::Type{<:CuPtr}, x::CuArray) = buffer(x) 156 | 157 | 158 | ## interop with CUDAnative 159 | 160 | function Base.convert(::Type{CuDeviceArray{T,N,AS.Global}}, a::CuArray{T,N}) where {T,N} 161 | ptr = Base.unsafe_convert(CuPtr{T}, buffer(a)) 162 | CuDeviceArray{T,N,AS.Global}(a.dims, DevicePtr{T,AS.Global}(ptr)) 163 | end 164 | 165 | Adapt.adapt_storage(::CUDAnative.Adaptor, xs::CuArray{T,N}) where {T,N} = 166 | convert(CuDeviceArray{T,N,AS.Global}, xs) 167 | 168 | 169 | 170 | ## interop with CPU array 171 | 172 | # We don't convert isbits types in `adapt`, since they are already 173 | # considered GPU-compatible. 174 | 175 | Adapt.adapt_storage(::Type{<:CuArray}, xs::AbstractArray) = 176 | isbits(xs) ? xs : convert(CuArray, xs) 177 | 178 | Adapt.adapt_storage(::Type{<:CuArray{T}}, xs::AbstractArray{<:Real}) where T <: AbstractFloat = 179 | isbits(xs) ? 
xs : convert(CuArray{T}, xs) 180 | 181 | Adapt.adapt_storage(::Type{<:Array}, xs::CuArray) = convert(Array, xs) 182 | 183 | Base.collect(x::CuArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x) 184 | 185 | function Base.unsafe_copyto!(dest::CuArray{T}, doffs, src::Array{T}, soffs, n) where T 186 | Mem.upload!(buffer(dest, doffs), pointer(src, soffs), n*sizeof(T)) 187 | return dest 188 | end 189 | 190 | function Base.unsafe_copyto!(dest::Array{T}, doffs, src::CuArray{T}, soffs, n) where T 191 | Mem.download!(pointer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 192 | return dest 193 | end 194 | 195 | function Base.unsafe_copyto!(dest::CuArray{T}, doffs, src::CuArray{T}, soffs, n) where T 196 | Mem.transfer!(buffer(dest, doffs), buffer(src, soffs), n*sizeof(T)) 197 | return dest 198 | end 199 | 200 | function Base.deepcopy_internal(x::CuArray, dict::IdDict) 201 | haskey(dict, x) && return dict[x]::typeof(x) 202 | return dict[x] = copy(x) 203 | end 204 | 205 | 206 | ## utilities 207 | 208 | cu(xs) = adapt(CuArray{Float32}, xs) 209 | Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) 210 | 211 | cuzeros(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 0) 212 | cuones(T::Type, dims...) = fill!(CuArray{T}(undef, dims...), 1) 213 | cuzeros(dims...) = cuzeros(Float32, dims...) 214 | cuones(dims...) = cuones(Float32, dims...) 215 | cufill(v, dims...) = fill!(CuArray{typeof(v)}(undef, dims...), v) 216 | cufill(v, dims::Dims) = fill!(CuArray{typeof(v)}(undef, dims...), v) 217 | 218 | # optimized implementation of `fill!` for types that are directly supported by memset 219 | const MemsetTypes = Dict(1=>UInt8, 2=>UInt16, 4=>UInt32) 220 | const MemsetCompatTypes = Union{UInt8, Int8, 221 | UInt16, Int16, Float16, 222 | UInt32, Int32, Float32} 223 | function Base.fill!(A::CuArray{T}, x) where T <: MemsetCompatTypes 224 | y = reinterpret(MemsetTypes[sizeof(T)], convert(T, x)) 225 | Mem.set!(buffer(A), y, length(A)) 226 | A 227 | end 228 | 229 | 230 | ## generic linear algebra routines 231 | 232 | function LinearAlgebra.tril!(A::CuMatrix{T}, d::Integer = 0) where T 233 | function kernel!(_A, _d) 234 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 235 | m, n = size(_A) 236 | if 0 < li <= m*n 237 | i, j = Tuple(CartesianIndices(_A)[li]) 238 | if i < j - _d 239 | _A[i, j] = 0 240 | end 241 | end 242 | return nothing 243 | end 244 | 245 | blk, thr = cudims(A) 246 | @cuda blocks=blk threads=thr kernel!(A, d) 247 | return A 248 | end 249 | 250 | function LinearAlgebra.triu!(A::CuMatrix{T}, d::Integer = 0) where T 251 | function kernel!(_A, _d) 252 | li = (blockIdx().x - 1) * blockDim().x + threadIdx().x 253 | m, n = size(_A) 254 | if 0 < li <= m*n 255 | i, j = Tuple(CartesianIndices(_A)[li]) 256 | if j < i + _d 257 | _A[i, j] = 0 258 | end 259 | end 260 | return nothing 261 | end 262 | 263 | blk, thr = cudims(A) 264 | @cuda blocks=blk threads=thr kernel!(A, d) 265 | return A 266 | end 267 | -------------------------------------------------------------------------------- /src/blas/highlevel.jl: -------------------------------------------------------------------------------- 1 | # LinearAlgebra-style wrappers of the CUBLAS functionality 2 | 3 | 4 | cublas_size(t::Char, M::CuVecOrMat) = (size(M, t=='N' ? 1 : 2), size(M, t=='N' ? 
2 : 1)) 5 | 6 | CublasArray{T<:CublasFloat} = CuArray{T} 7 | 8 | 9 | # 10 | # BLAS 1 11 | # 12 | 13 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Number) = 14 | scal!(length(x), convert(eltype(x), k), x, 1) 15 | 16 | # Work around ambiguity with GPUArrays wrapper 17 | LinearAlgebra.rmul!(x::CuArray{<:CublasFloat}, k::Real) = 18 | invoke(rmul!, Tuple{typeof(x), Number}, x, k) 19 | 20 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{Float32,Float64} 21 | n = length(DX) 22 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 23 | dot(n, DX, 1, DY, 1) 24 | end 25 | 26 | function LinearAlgebra.BLAS.dotc(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 27 | n = length(DX) 28 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 29 | dotc(n, DX, 1, DY, 1) 30 | end 31 | 32 | function LinearAlgebra.BLAS.dot(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 33 | dotc(DX, DY) 34 | end 35 | 36 | function LinearAlgebra.BLAS.dotu(DX::CuArray{T}, DY::CuArray{T}) where T<:Union{ComplexF32,ComplexF64} 37 | n = length(DX) 38 | n==length(DY) || throw(DimensionMismatch("dot product arguments have lengths $(length(DX)) and $(length(DY))")) 39 | dotu(n, DX, 1, DY, 1) 40 | end 41 | 42 | LinearAlgebra.norm(x::CublasArray) = nrm2(x) 43 | LinearAlgebra.BLAS.asum(x::CublasArray) = asum(length(x), x, 1) 44 | 45 | function LinearAlgebra.axpy!(alpha::Number, x::CuArray{T}, y::CuArray{T}) where T<:CublasFloat 46 | length(x)==length(y) || throw(DimensionMismatch("axpy arguments have lengths $(length(x)) and $(length(y))")) 47 | axpy!(length(x), convert(T,alpha), x, 1, y, 1) 48 | end 49 | 50 | Base.argmin(xs::CublasArray{<:CublasReal}) = iamin(xs) 51 | Base.argmax(xs::CublasArray{<:CublasReal}) = iamax(xs) 52 | 53 | 54 | 55 | # 56 | # BLAS 2 57 | # 58 | 59 | # GEMV 60 | 61 | function gemv_wrapper!(y::CuVector{T}, tA::Char, A::CuMatrix{T}, x::CuVector{T}, 62 | alpha = one(T), beta = zero(T)) where T<:CublasFloat 63 | mA, nA = cublas_size(tA, A) 64 | if nA != length(x) 65 | throw(DimensionMismatch("second dimension of A, $nA, does not match length of x, $(length(x))")) 66 | end 67 | if mA != length(y) 68 | throw(DimensionMismatch("first dimension of A, $mA, does not match length of y, $(length(y))")) 69 | end 70 | if mA == 0 71 | return y 72 | end 73 | if nA == 0 74 | return rmul!(y, 0) 75 | end 76 | gemv!(tA, alpha, A, x, beta, y) 77 | end 78 | 79 | LinearAlgebra.mul!(Y::CuVector{T}, A::CuMatrix{T}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'N', A, B) 80 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Transpose{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B) 81 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasFloat = gemv_wrapper!(Y, 'T', A.parent, B) 82 | LinearAlgebra.mul!(Y::CuVector{T}, A::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuVector{T}) where T<:CublasComplex = gemv_wrapper!(Y, 'C', A.parent, B) 83 | 84 | 85 | 86 | # 87 | # BLAS 3 88 | # 89 | 90 | # GEMM 91 | 92 | function gemm_wrapper!(C::CuVecOrMat{T}, tA::Char, tB::Char, 93 | A::CuVecOrMat{T}, 94 | B::CuVecOrMat{T}, 95 | alpha = one(T), 96 | beta = zero(T)) where T <: CublasFloat 97 | mA, nA = cublas_size(tA, A) 98 | mB, nB = cublas_size(tB, B) 99 | 100 | if nA != mB 101 | throw(DimensionMismatch("A has dimensions ($mA,$nA) but B has dimensions ($mB,$nB)")) 102 | end 103 | 104 | if
C === A || B === C 105 | throw(ArgumentError("output matrix must not be aliased with input matrix")) 106 | end 107 | 108 | if mA == 0 || nA == 0 || nB == 0 109 | if size(C) != (mA, nB) 110 | throw(DimensionMismatch("C has dimensions $(size(C)), should have ($mA,$nB)")) 111 | end 112 | return LinearAlgebra.rmul!(C, 0) 113 | end 114 | 115 | gemm!(tA, tB, alpha, A, B, beta, C) 116 | end 117 | 118 | # Mutating 119 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuVecOrMat{T}, B::CuVecOrMat{T}) where T<:CublasFloat = gemm_wrapper!(C, 'N', 'N', A, B) 120 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 121 | gemm_wrapper!(C, 'T', 'N', parent(trA), B) 122 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 123 | gemm_wrapper!(C, 'N', 'T', A, parent(trB)) 124 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 125 | gemm_wrapper!(C, 'T', 'T', parent(trA), parent(trB)) 126 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasReal = 127 | gemm_wrapper!(C, 'T', 'N', parent(adjA), B) 128 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 129 | gemm_wrapper!(C, 'C', 'N', parent(adjA), B) 130 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal = 131 | gemm_wrapper!(C, 'N', 'T', A, parent(adjB)) 132 | LinearAlgebra.mul!(C::CuMatrix{T}, A::CuMatrix{T}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 133 | gemm_wrapper!(C, 'N', 'C', A, parent(adjB)) 134 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, CuMatrix{T}}) where T<:CublasReal = 135 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(adjB)) 136 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 137 | gemm_wrapper!(C, 'C', 'C', parent(adjA), parent(adjB)) 138 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}) where T<:CublasReal = 139 | gemm_wrapper!(C, 'T', 'T', parent(trA), parent(adjB)) 140 | LinearAlgebra.mul!(C::CuMatrix{T}, trA::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}, adjB::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}) where T<:CublasFloat = 141 | gemm_wrapper!(C, 'T', 'C', parent(trA), parent(adjB)) 142 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{T, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T<:CublasReal = 143 | gemm_wrapper!(C, 'T', 'T', parent(adjA), parent(trB)) 144 | LinearAlgebra.mul!(C::CuMatrix{T}, adjA::LinearAlgebra.Adjoint{<:Any, <:CuMatrix{T}}, trB::LinearAlgebra.Transpose{<:Any, <:CuMatrix{T}}) where T <: CublasFloat = 145 | gemm_wrapper!(C, 'C', 'T', parent(adjA), parent(trB)) 146 | 147 | 148 | # TRSM 149 | 150 | # ldiv! 
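# Each of the `ldiv!` methods below lowers to a single in-place CUBLAS `trsm!`
# call, overwriting `B` with `A \ B` without allocating GPU temporaries. A
# minimal usage sketch (sizes are illustrative, and a CUDA-capable device is
# assumed):
#
#     using LinearAlgebra, CuArrays
#     A = UpperTriangular(CuArray(rand(Float32, 4, 4) + 4I))
#     B = CuArray(rand(Float32, 4, 2))
#     ldiv!(A, B)    # B now holds A \ B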
151 | ## No transpose/adjoint 152 | LinearAlgebra.ldiv!(A::UpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 153 | CUBLAS.trsm!('L', 'U', 'N', 'N', one(T), parent(A), B) 154 | LinearAlgebra.ldiv!(A::UnitUpperTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 155 | CUBLAS.trsm!('L', 'U', 'N', 'U', one(T), parent(A), B) 156 | LinearAlgebra.ldiv!(A::LowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 157 | CUBLAS.trsm!('L', 'L', 'N', 'N', one(T), parent(A), B) 158 | LinearAlgebra.ldiv!(A::UnitLowerTriangular{T,CuMatrix{T}}, B::CuMatrix{T}) where T<:CublasFloat = 159 | CUBLAS.trsm!('L', 'L', 'N', 'U', one(T), parent(A), B) 160 | ## Adjoint 161 | LinearAlgebra.ldiv!(A::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 162 | CUBLAS.trsm!('L', 'U', 'C', 'N', one(T), parent(parent(A)), B) 163 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 164 | CUBLAS.trsm!('L', 'U', 'C', 'U', one(T), parent(parent(A)), B) 165 | LinearAlgebra.ldiv!(A::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 166 | CUBLAS.trsm!('L', 'L', 'C', 'N', one(T), parent(parent(A)), B) 167 | LinearAlgebra.ldiv!(A::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 168 | CUBLAS.trsm!('L', 'L', 'C', 'U', one(T), parent(parent(A)), B) 169 | ## Transpose 170 | LinearAlgebra.ldiv!(A::Transpose{T,UpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 171 | CUBLAS.trsm!('L', 'U', 'T', 'N', one(T), parent(parent(A)), B) 172 | LinearAlgebra.ldiv!(A::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 173 | CUBLAS.trsm!('L', 'U', 'T', 'U', one(T), parent(parent(A)), B) 174 | LinearAlgebra.ldiv!(A::Transpose{T,LowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 175 | CUBLAS.trsm!('L', 'L', 'T', 'N', one(T), parent(parent(A)), B) 176 | LinearAlgebra.ldiv!(A::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}, B::CuMatrix{T}) where T<:CublasFloat = 177 | CUBLAS.trsm!('L', 'L', 'T', 'U', one(T), parent(parent(A)), B) 178 | 179 | # rdiv! 
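# The `rdiv!` methods below are the mirror image: one `trsm!` call with side
# 'R', overwriting `A` with `A / B`. Sketch (illustrative sizes, CUDA device
# assumed):
#
#     using LinearAlgebra, CuArrays
#     A = CuArray(rand(Float32, 2, 4))
#     B = LowerTriangular(CuArray(rand(Float32, 4, 4) + 4I))
#     rdiv!(A, B)    # A now holds A / B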
180 | ## No transpose/adjoint 181 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 182 | CUBLAS.trsm!('R', 'U', 'N', 'N', one(T), parent(B), A) 183 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitUpperTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 184 | CUBLAS.trsm!('R', 'U', 'N', 'U', one(T), parent(B), A) 185 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::LowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 186 | CUBLAS.trsm!('R', 'L', 'N', 'N', one(T), parent(B), A) 187 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::UnitLowerTriangular{T,CuMatrix{T}}) where T<:CublasFloat = 188 | CUBLAS.trsm!('R', 'L', 'N', 'U', one(T), parent(B), A) 189 | ## Adjoint 190 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 191 | CUBLAS.trsm!('R', 'U', 'C', 'N', one(T), parent(parent(B)), A) 192 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 193 | CUBLAS.trsm!('R', 'U', 'C', 'U', one(T), parent(parent(B)), A) 194 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 195 | CUBLAS.trsm!('R', 'L', 'C', 'N', one(T), parent(parent(B)), A) 196 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Adjoint{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 197 | CUBLAS.trsm!('R', 'L', 'C', 'U', one(T), parent(parent(B)), A) 198 | ## Transpose 199 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 200 | CUBLAS.trsm!('R', 'U', 'T', 'N', one(T), parent(parent(B)), A) 201 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitUpperTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 202 | CUBLAS.trsm!('R', 'U', 'T', 'U', one(T), parent(parent(B)), A) 203 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,LowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 204 | CUBLAS.trsm!('R', 'L', 'T', 'N', one(T), parent(parent(B)), A) 205 | LinearAlgebra.rdiv!(A::CuMatrix{T}, B::Transpose{T,UnitLowerTriangular{T,CuMatrix{T}}}) where T<:CublasFloat = 206 | CUBLAS.trsm!('R', 'L', 'T', 'U', one(T), parent(parent(B)), A) 207 | -------------------------------------------------------------------------------- /src/sparse/array.jl: -------------------------------------------------------------------------------- 1 | # custom extension of CuArray in CUDArt for sparse vectors/matrices 2 | # using CSC format for interop with Julia's native sparse functionality 3 | 4 | import Base: length, size, ndims, eltype, similar, pointer, stride, 5 | copy, convert, reinterpret, show, summary, copyto!, get!, fill!, collect 6 | import LinearAlgebra: BlasFloat, Hermitian, HermOrSym, issymmetric, 7 | ishermitian, istriu, istril, Symmetric, UpperTriangular, LowerTriangular 8 | import SparseArrays: sparse, SparseMatrixCSC 9 | 10 | abstract type AbstractCuSparseArray{Tv, N} <: AbstractSparseArray{Tv, Cint, N} end 11 | const AbstractCuSparseVector{Tv} = AbstractCuSparseArray{Tv,1} 12 | const AbstractCuSparseMatrix{Tv} = AbstractCuSparseArray{Tv,2} 13 | 14 | mutable struct CuSparseVector{Tv} <: AbstractCuSparseVector{Tv} 15 | iPtr::CuVector{Cint} 16 | nzVal::CuVector{Tv} 17 | dims::NTuple{2,Int} 18 | nnz::Cint 19 | 20 | function CuSparseVector{Tv}(iPtr::CuVector{Cint}, nzVal::CuVector{Tv}, dims::Int, nnz::Cint) where Tv 21 | new(iPtr,nzVal,(dims,1),nnz) 22 | end 23 | end 24 | 25 | function CuArrays.unsafe_free!(xs::CuSparseVector) 26 | unsafe_free!(xs.iPtr) 27 | unsafe_free!(xs.nzVal) 28 | 
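# both device buffers are released eagerly here (subject to the allocator's
# reference counting); the wrapper struct itself is ordinary host memory and
# is left to the GC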
return 29 | end 30 | 31 | mutable struct CuSparseMatrixCSC{Tv} <: AbstractCuSparseMatrix{Tv} 32 | colPtr::CuVector{Cint} 33 | rowVal::CuVector{Cint} 34 | nzVal::CuVector{Tv} 35 | dims::NTuple{2,Int} 36 | nnz::Cint 37 | 38 | function CuSparseMatrixCSC{Tv}(colPtr::CuVector{Cint}, rowVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv 39 | new(colPtr,rowVal,nzVal,dims,nnz) 40 | end 41 | end 42 | 43 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSC) 44 | unsafe_free!(xs.colPtr) 45 | unsafe_free!(xs.rowVal) 46 | unsafe_free!(xs.nzVal) 47 | return 48 | end 49 | 50 | """ 51 | Container to hold sparse matrices in compressed sparse row (CSR) format on the 52 | GPU. 53 | 54 | **Note**: Most CUSPARSE operations work with CSR formatted matrices, rather 55 | than CSC. 56 | """ 57 | mutable struct CuSparseMatrixCSR{Tv} <: AbstractCuSparseMatrix{Tv} 58 | rowPtr::CuVector{Cint} 59 | colVal::CuVector{Cint} 60 | nzVal::CuVector{Tv} 61 | dims::NTuple{2,Int} 62 | nnz::Cint 63 | 64 | function CuSparseMatrixCSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int}, nnz::Cint) where Tv 65 | new(rowPtr,colVal,nzVal,dims,nnz) 66 | end 67 | end 68 | 69 | function CuArrays.unsafe_free!(xs::CuSparseMatrixCSR) 70 | unsafe_free!(xs.rowPtr) 71 | unsafe_free!(xs.colVal) 72 | unsafe_free!(xs.nzVal) 73 | return 74 | end 75 | 76 | """ 77 | Container to hold sparse matrices in block compressed sparse row (BSR) format on 78 | the GPU. BSR format is also used in Intel MKL, and is suited to matrices that are 79 | "block" sparse - matrices whose nonzero entries are concentrated in a small number of dense blocks. 80 | """ 81 | mutable struct CuSparseMatrixBSR{Tv} <: AbstractCuSparseMatrix{Tv} 82 | rowPtr::CuVector{Cint} 83 | colVal::CuVector{Cint} 84 | nzVal::CuVector{Tv} 85 | dims::NTuple{2,Int} 86 | blockDim::Cint 87 | dir::SparseChar 88 | nnz::Cint 89 | 90 | function CuSparseMatrixBSR{Tv}(rowPtr::CuVector{Cint}, colVal::CuVector{Cint}, nzVal::CuVector{Tv}, dims::NTuple{2,Int},blockDim::Cint, dir::SparseChar, nnz::Cint) where Tv 91 | new(rowPtr,colVal,nzVal,dims,blockDim,dir,nnz) 92 | end 93 | end 94 | 95 | function CuArrays.unsafe_free!(xs::CuSparseMatrixBSR) 96 | unsafe_free!(xs.rowPtr) 97 | unsafe_free!(xs.colVal) 98 | unsafe_free!(xs.nzVal) 99 | return 100 | end 101 | 102 | """ 103 | Container to hold sparse matrices in NVIDIA's hybrid (HYB) format on the GPU. 104 | HYB format is an opaque struct, which can be converted to/from using 105 | CUSPARSE routines. 106 | """ 107 | mutable struct CuSparseMatrixHYB{Tv} <: AbstractCuSparseMatrix{Tv} 108 | Mat::cusparseHybMat_t 109 | dims::NTuple{2,Int} 110 | nnz::Cint 111 | 112 | function CuSparseMatrixHYB{Tv}(Mat::cusparseHybMat_t, dims::NTuple{2,Int}, nnz::Cint) where Tv 113 | new(Mat,dims,nnz) 114 | end 115 | end 116 | 117 | """ 118 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref), 119 | and `Hermitian` and `Symmetric` versions of these two containers. A function accepting 120 | this type can make use of performance improvements by only indexing one triangle of the 121 | matrix if it is guaranteed to be hermitian/symmetric. 122 | """ 123 | const CompressedSparse{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T},HermOrSym{T,CuSparseMatrixCSC{T}},HermOrSym{T,CuSparseMatrixCSR{T}}} 124 | 125 | """ 126 | Utility union type of [`CuSparseMatrixCSC`](@ref), [`CuSparseMatrixCSR`](@ref), 127 | [`CuSparseMatrixBSR`](@ref), and [`CuSparseMatrixHYB`](@ref).
128 | """ 129 | const CuSparseMatrix{T} = Union{CuSparseMatrixCSC{T},CuSparseMatrixCSR{T}, CuSparseMatrixBSR{T}, CuSparseMatrixHYB{T}} 130 | 131 | Hermitian{T}(Mat::CuSparseMatrix{T}) where T = Hermitian{T,typeof(Mat)}(Mat,'U') 132 | 133 | length(g::CuSparseVector) = prod(g.dims) 134 | size(g::CuSparseVector) = g.dims 135 | ndims(g::CuSparseVector) = 1 136 | length(g::CuSparseMatrix) = prod(g.dims) 137 | size(g::CuSparseMatrix) = g.dims 138 | ndims(g::CuSparseMatrix) = 2 139 | 140 | function size(g::CuSparseVector, d::Integer) 141 | if d == 1 142 | return g.dims[d] 143 | elseif d > 1 144 | return 1 145 | else 146 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 147 | end 148 | end 149 | 150 | function size(g::CuSparseMatrix, d::Integer) 151 | if d in [1, 2] 152 | return g.dims[d] 153 | elseif d > 1 154 | return 1 155 | else 156 | throw(ArgumentError("dimension must be ≥ 1, got $d")) 157 | end 158 | end 159 | 160 | issymmetric(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR})= false 161 | ishermitian(M::Union{CuSparseMatrixCSC,CuSparseMatrixCSR}) where T = false 162 | issymmetric(M::Symmetric{CuSparseMatrixCSC})= true 163 | ishermitian(M::Hermitian{CuSparseMatrixCSC}) = true 164 | 165 | for mat_type in [:CuSparseMatrixCSC, :CuSparseMatrixCSR, :CuSparseMatrixBSR, :CuSparseMatrixHYB] 166 | @eval begin 167 | istriu(M::UpperTriangular{$mat_type}) = true 168 | istril(M::UpperTriangular{$mat_type}) = false 169 | istriu(M::LowerTriangular{$mat_type}) = false 170 | istril(M::LowerTriangular{$mat_type}) = true 171 | end 172 | end 173 | eltype(g::CuSparseMatrix{T}) where T = T 174 | 175 | function collect(Vec::CuSparseVector) 176 | SparseVector(Vec.dims[1], collect(Vec.iPtr), collect(Vec.nzVal)) 177 | end 178 | 179 | function collect(Mat::CuSparseMatrixCSC) 180 | SparseMatrixCSC(Mat.dims[1], Mat.dims[2], collect(Mat.colPtr), collect(Mat.rowVal), collect(Mat.nzVal)) 181 | end 182 | function collect(Mat::CuSparseMatrixCSR) 183 | rowPtr = collect(Mat.rowPtr) 184 | colVal = collect(Mat.colVal) 185 | nzVal = collect(Mat.nzVal) 186 | #construct Is 187 | I = similar(colVal) 188 | counter = 1 189 | for row = 1 : size(Mat)[1], k = rowPtr[row] : (rowPtr[row+1]-1) 190 | I[counter] = row 191 | counter += 1 192 | end 193 | return sparse(I,colVal,nzVal,Mat.dims[1],Mat.dims[2]) 194 | end 195 | 196 | summary(g::CuSparseMatrix) = string(g) 197 | summary(g::CuSparseVector) = string(g) 198 | 199 | CuSparseVector(iPtr::Vector{Ti}, nzVal::Vector{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(CuArray(convert(Vector{Cint},iPtr)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 200 | CuSparseVector(iPtr::CuArray{Ti}, nzVal::CuArray{T}, dims::Int) where {T<:BlasFloat, Ti<:Integer} = CuSparseVector{T}(iPtr, nzVal, dims, convert(Cint,length(nzVal))) 201 | 202 | CuSparseMatrixCSC(colPtr::Vector{Ti}, rowVal::Vector{Ti}, nzVal::Vector{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(CuArray(convert(Vector{Cint},colPtr)), CuArray(convert(Vector{Cint},rowVal)), CuArray(nzVal), dims, convert(Cint,length(nzVal))) 203 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, convert(Cint,length(nzVal))) 204 | CuSparseMatrixCSC(colPtr::CuArray{Ti}, rowVal::CuArray{Ti}, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where {T<:BlasFloat,Ti<:Integer} = CuSparseMatrixCSC{T}(colPtr, rowVal, nzVal, dims, nnz) 205 | 206 | CuSparseMatrixCSR(rowPtr::CuArray, 
colVal::CuArray, nzVal::CuArray{T}, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, convert(Cint,length(nzVal))) 207 | CuSparseMatrixCSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixCSR{T}(rowPtr, colVal, nzVal, dims, nnz) 208 | 209 | CuSparseMatrixBSR(rowPtr::CuArray, colVal::CuArray, nzVal::CuArray{T}, blockDim, dir, nnz, dims::NTuple{2,Int}) where T = CuSparseMatrixBSR{T}(rowPtr, colVal, nzVal, dims, blockDim, dir, nnz) 210 | 211 | CuSparseVector(Vec::SparseVector) = CuSparseVector(Vec.nzind, Vec.nzval, size(Vec)[1]) 212 | CuSparseMatrixCSC(Vec::SparseVector) = CuSparseMatrixCSC([1, length(Vec.nzind)+1], Vec.nzind, Vec.nzval, (size(Vec)[1], 1)) 213 | CuSparseVector(Mat::SparseMatrixCSC) = size(Mat,2) == 1 ? CuSparseVector(Mat.rowval, Mat.nzval, size(Mat)[1]) : throw(ArgumentError("the matrix must have a single column to be converted to a sparse vector")) 214 | CuSparseMatrixCSC(Mat::SparseMatrixCSC) = CuSparseMatrixCSC(Mat.colptr, Mat.rowval, Mat.nzval, size(Mat)) 215 | CuSparseMatrixCSR(Mat::SparseMatrixCSC) = switch2csr(CuSparseMatrixCSC(Mat)) 216 | 217 | similar(Vec::CuSparseVector) = CuSparseVector(copy(Vec.iPtr), similar(Vec.nzVal), Vec.dims[1]) 218 | similar(Mat::CuSparseMatrixCSC) = CuSparseMatrixCSC(copy(Mat.colPtr), copy(Mat.rowVal), similar(Mat.nzVal), Mat.nnz, Mat.dims) 219 | similar(Mat::CuSparseMatrixCSR) = CuSparseMatrixCSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.nnz, Mat.dims) 220 | similar(Mat::CuSparseMatrixBSR) = CuSparseMatrixBSR(copy(Mat.rowPtr), copy(Mat.colVal), similar(Mat.nzVal), Mat.blockDim, Mat.dir, Mat.nnz, Mat.dims) 221 | 222 | function copyto!(dst::CuSparseVector, src::CuSparseVector) 223 | if dst.dims != src.dims 224 | throw(ArgumentError("Inconsistent Sparse Vector size")) 225 | end 226 | copyto!(dst.iPtr, src.iPtr) 227 | copyto!(dst.nzVal, src.nzVal) 228 | dst.nnz = src.nnz 229 | dst 230 | end 231 | 232 | function copyto!(dst::CuSparseMatrixCSC, src::CuSparseMatrixCSC) 233 | if dst.dims != src.dims 234 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 235 | end 236 | copyto!(dst.colPtr, src.colPtr) 237 | copyto!(dst.rowVal, src.rowVal) 238 | copyto!(dst.nzVal, src.nzVal) 239 | dst.nnz = src.nnz 240 | dst 241 | end 242 | 243 | function copyto!(dst::CuSparseMatrixCSR, src::CuSparseMatrixCSR) 244 | if dst.dims != src.dims 245 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 246 | end 247 | copyto!(dst.rowPtr, src.rowPtr) 248 | copyto!(dst.colVal, src.colVal) 249 | copyto!(dst.nzVal, src.nzVal) 250 | dst.nnz = src.nnz 251 | dst 252 | end 253 | 254 | function copyto!(dst::CuSparseMatrixBSR, src::CuSparseMatrixBSR) 255 | if dst.dims != src.dims 256 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 257 | end 258 | copyto!(dst.rowPtr, src.rowPtr) 259 | copyto!(dst.colVal, src.colVal) 260 | copyto!(dst.nzVal, src.nzVal) 261 | dst.dir = src.dir 262 | dst.nnz = src.nnz 263 | dst 264 | end 265 | 266 | function copyto!(dst::CuSparseMatrixHYB, src::CuSparseMatrixHYB) 267 | if dst.dims != src.dims 268 | throw(ArgumentError("Inconsistent Sparse Matrix size")) 269 | end 270 | dst.Mat = src.Mat 271 | dst.nnz = src.nnz 272 | dst 273 | end 274 | 275 | copy(Vec::CuSparseVector) = copyto!(similar(Vec),Vec) 276 | copy(Mat::CuSparseMatrixCSC) = copyto!(similar(Mat),Mat) 277 | copy(Mat::CuSparseMatrixCSR) = copyto!(similar(Mat),Mat) 278 | copy(Mat::CuSparseMatrixBSR) = copyto!(similar(Mat),Mat) 279 | -------------------------------------------------------------------------------- /test/solver.jl:
-------------------------------------------------------------------------------- 1 | @testset "CUSOLVER" begin 2 | 3 | if !isdefined(CuArrays, :CUSOLVER) 4 | @warn "Not testing CUSOLVER" 5 | else 6 | using CuArrays.CUSOLVER 7 | @info "Testing CUSOLVER $(CUSOLVER.version())" 8 | 9 | using LinearAlgebra 10 | 11 | m = 15 12 | n = 10 13 | l = 13 14 | k = 1 15 | 16 | @test_throws ArgumentError CUSOLVER.cusolverjob('M') 17 | 18 | @testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64] 19 | @testset "Cholesky (po)" begin 20 | A = rand(elty,n,n) 21 | A = A*A' #posdef 22 | B = rand(elty,n,n) 23 | d_A = CuArray(A) 24 | d_B = CuArray(B) 25 | 26 | d_F = cholesky(d_A, Val(false)) 27 | F = cholesky(A, Val(false)) 28 | @test F.U ≈ collect(d_F.U) 29 | @test F\(A'B) ≈ collect(d_F\(d_A'd_B)) 30 | 31 | d_F = cholesky(Hermitian(d_A, :L), Val(false)) 32 | F = cholesky(Hermitian(A, :L), Val(false)) 33 | @test F.L ≈ collect(d_F.L) 34 | @test F\(A'B) ≈ collect(d_F\(d_A'd_B)) 35 | 36 | @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,CuArray(rand(elty,m,m))) 37 | 38 | A = rand(elty,m,n) 39 | d_A = CuArray(A) 40 | @test_throws DimensionMismatch cholesky(d_A) 41 | @test_throws DimensionMismatch LinearAlgebra.LAPACK.potrs!('U',d_A,d_B) 42 | 43 | A = zeros(elty,n,n) 44 | d_A = CuArray(A) 45 | @test_throws LinearAlgebra.PosDefException cholesky(d_A) 46 | end 47 | 48 | @testset "getrf!" begin 49 | A = rand(elty,m,n) 50 | d_A = CuArray(A) 51 | d_A,d_ipiv = CUSOLVER.getrf!(d_A) 52 | h_A = collect(d_A) 53 | h_ipiv = collect(d_ipiv) 54 | alu = LinearAlgebra.LU(h_A, convert(Vector{Int},h_ipiv), zero(Int)) 55 | @test A ≈ Array(alu) 56 | A = zeros(elty,n,n) 57 | d_A = CuArray(A) 58 | @test_throws LinearAlgebra.SingularException CUSOLVER.getrf!(d_A) 59 | end 60 | 61 | @testset "getrs!" begin 62 | A = rand(elty,n,n) 63 | d_A = CuArray(A) 64 | d_A,d_ipiv = CUSOLVER.getrf!(d_A) 65 | B = rand(elty,n,n) 66 | d_B = CuArray(B) 67 | d_B = CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 68 | h_B = collect(d_B) 69 | @test h_B ≈ A\B 70 | A = rand(elty,m,n) 71 | d_A = CuArray(A) 72 | @test_throws DimensionMismatch CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 73 | A = rand(elty,n,n) 74 | d_A = CuArray(A) 75 | B = rand(elty,m,n) 76 | d_B = CuArray(B) 77 | @test_throws DimensionMismatch CUSOLVER.getrs!('N',d_A,d_ipiv,d_B) 78 | end 79 | 80 | @testset "geqrf!" begin 81 | A = rand(elty,m,n) 82 | d_A = CuArray(A) 83 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 84 | h_A = collect(d_A) 85 | h_tau = collect(d_tau) 86 | qra = LinearAlgebra.QR(h_A, h_tau) 87 | @test A ≈ Array(qra) 88 | end 89 | 90 | @testset "ormqr!" 
begin 91 | A = rand(elty, m, n) 92 | d_A = CuArray(A) 93 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 94 | B = rand(elty, n, l) 95 | d_B = CuArray(B) 96 | d_B = CUSOLVER.ormqr!('L', 'N', d_A, d_tau, d_B) 97 | h_B = collect(d_B) 98 | F = qr!(A) 99 | @test h_B ≈ Array(F.Q)*B 100 | A = rand(elty, n, m) 101 | d_A = CuArray(A) 102 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 103 | B = rand(elty, n, l) 104 | d_B = CuArray(B) 105 | d_B = CUSOLVER.ormqr!('L', 'N', d_A, d_tau, d_B) 106 | h_B = collect(d_B) 107 | F = qr!(A) 108 | @test h_B ≈ Array(F.Q)*B 109 | A = rand(elty, m, n) 110 | d_A = CuArray(A) 111 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 112 | B = rand(elty, l, m) 113 | d_B = CuArray(B) 114 | d_B = CUSOLVER.ormqr!('R', 'N', d_A, d_tau, d_B) 115 | h_B = collect(d_B) 116 | F = qr!(A) 117 | @test h_B ≈ B*Array(F.Q) 118 | A = rand(elty, n, m) 119 | d_A = CuArray(A) 120 | d_A, d_tau = CUSOLVER.geqrf!(d_A) 121 | B = rand(elty, l, n) 122 | d_B = CuArray(B) 123 | d_B = CUSOLVER.ormqr!('R', 'N', d_A, d_tau, d_B) 124 | h_B = collect(d_B) 125 | F = qr!(A) 126 | @test h_B ≈ B*Array(F.Q) 127 | end 128 | 129 | @testset "orgqr!" begin 130 | A = rand(elty,n,m) 131 | d_A = CuArray(A) 132 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 133 | d_Q = CUSOLVER.orgqr!(d_A, d_tau) 134 | h_Q = collect(d_Q) 135 | F = qr!(A) 136 | @test h_Q ≈ Array(F.Q) 137 | A = rand(elty,m,n) 138 | d_A = CuArray(A) 139 | d_A,d_tau = CUSOLVER.geqrf!(d_A) 140 | d_Q = CUSOLVER.orgqr!(d_A, d_tau) 141 | h_Q = collect(d_Q) 142 | F = qr!(A) 143 | @test h_Q ≈ Array(F.Q) 144 | end 145 | 146 | @testset "sytrf!" begin 147 | A = rand(elty,n,n) 148 | A = A + A' #symmetric 149 | d_A = CuArray(A) 150 | d_A,d_ipiv = CUSOLVER.sytrf!('U',d_A) 151 | h_A = collect(d_A) 152 | h_ipiv = collect(d_ipiv) 153 | A, ipiv = LAPACK.sytrf!('U',A) 154 | @test ipiv == h_ipiv 155 | @test A ≈ h_A 156 | A = rand(elty,m,n) 157 | d_A = CuArray(A) 158 | @test_throws DimensionMismatch CUSOLVER.sytrf!('U',d_A) 159 | A = zeros(elty,n,n) 160 | d_A = CuArray(A) 161 | @test_throws LinearAlgebra.SingularException CUSOLVER.sytrf!('U',d_A) 162 | end 163 | 164 | @testset "gebrd!" begin 165 | A = rand(elty,m,n) 166 | d_A = CuArray(A) 167 | d_A, d_D, d_E, d_TAUQ, d_TAUP = CUSOLVER.gebrd!(d_A) 168 | h_A = collect(d_A) 169 | h_D = collect(d_D) 170 | h_E = collect(d_E) 171 | h_TAUQ = collect(d_TAUQ) 172 | h_TAUP = collect(d_TAUP) 173 | A,d,e,q,p = LAPACK.gebrd!(A) 174 | #@test A ≈ h_A 175 | @test d ≈ h_D 176 | @test e ≈ h_E 177 | @test q ≈ h_TAUQ 178 | @test p ≈ h_TAUP 179 | end 180 | 181 | @testset "syevd!" begin 182 | A = rand(elty,m,m) 183 | A += A' 184 | d_A = CuArray(A) 185 | local d_W, d_V 186 | if( elty <: Complex ) 187 | d_W, d_V = CUSOLVER.heevd!('V','U', d_A) 188 | else 189 | d_W, d_V = CUSOLVER.syevd!('V','U', d_A) 190 | end 191 | h_W = collect(d_W) 192 | h_V = collect(d_V) 193 | Eig = eigen(A) 194 | @test Eig.values ≈ h_W 195 | @test abs.(Eig.vectors'*h_V) ≈ I 196 | d_A = CuArray(A) 197 | if( elty <: Complex ) 198 | d_W = CUSOLVER.heevd!('N','U', d_A) 199 | else 200 | d_W = CUSOLVER.syevd!('N','U', d_A) 201 | end 202 | h_W = collect(d_W) 203 | @test Eig.values ≈ h_W 204 | end 205 | 206 | @testset "sygvd!" 
begin 207 | A = rand(elty,m,m) 208 | B = rand(elty,m,m) 209 | A *= A' 210 | B *= B' 211 | d_A = CuArray(A) 212 | d_B = CuArray(B) 213 | local d_W, d_VA, d_VB 214 | if( elty <: Complex ) 215 | d_W, d_VA, d_VB = CUSOLVER.hegvd!(1, 'V','U', d_A, d_B) 216 | else 217 | d_W, d_VA, d_VB = CUSOLVER.sygvd!(1, 'V','U', d_A, d_B) 218 | end 219 | h_W = collect(d_W) 220 | h_VA = collect(d_VA) 221 | h_VB = collect(d_VB) 222 | Eig = eigen(Hermitian(A), Hermitian(B)) 223 | @test Eig.values ≈ h_W 224 | @test A*h_VA ≈ B*h_VA*Diagonal(h_W) rtol=1e-4 225 | # test normalization condition for eigtype 1 226 | @test abs.(h_VA'B*h_VA) ≈ Matrix(one(elty)*I, m, m) 227 | d_A = CuArray(A) 228 | d_B = CuArray(B) 229 | if( elty <: Complex ) 230 | d_W = CUSOLVER.hegvd!(1, 'N','U', d_A, d_B) 231 | else 232 | d_W = CUSOLVER.sygvd!(1, 'N','U', d_A, d_B) 233 | end 234 | h_W = collect(d_W) 235 | @test Eig.values ≈ h_W 236 | d_B = CuArray(rand(elty, m+1, m+1)) 237 | if( elty <: Complex ) 238 | @test_throws DimensionMismatch CUSOLVER.hegvd!(1, 'N','U', d_A, d_B) 239 | else 240 | @test_throws DimensionMismatch CUSOLVER.sygvd!(1, 'N','U', d_A, d_B) 241 | end 242 | end 243 | 244 | @testset "syevj!" begin 245 | A = rand(elty,m,m) 246 | B = rand(elty,m,m) 247 | A *= A' 248 | B *= B' 249 | d_A = CuArray(A) 250 | d_B = CuArray(B) 251 | local d_W, d_VA, d_VB 252 | if( elty <: Complex ) 253 | d_W, d_VA, d_VB = CUSOLVER.hegvj!(1, 'V','U', d_A, d_B) 254 | else 255 | d_W, d_VA, d_VB = CUSOLVER.sygvj!(1, 'V','U', d_A, d_B) 256 | end 257 | h_W = collect(d_W) 258 | h_VA = collect(d_VA) 259 | h_VB = collect(d_VB) 260 | Eig = eigen(Hermitian(A), Hermitian(B)) 261 | @test Eig.values ≈ h_W 262 | @test A*h_VA ≈ B*h_VA*Diagonal(h_W) rtol=1e-4 263 | # test normalization condition for eigtype 1 264 | @test abs.(h_VA'B*h_VA) ≈ Matrix(one(elty)*I, m, m) 265 | d_A = CuArray(A) 266 | d_B = CuArray(B) 267 | if( elty <: Complex ) 268 | d_W = CUSOLVER.hegvj!(1, 'N','U', d_A, d_B) 269 | else 270 | d_W = CUSOLVER.sygvj!(1, 'N','U', d_A, d_B) 271 | end 272 | h_W = collect(d_W) 273 | @test Eig.values ≈ h_W 274 | end 275 | 276 | @testset "svd with $method method" for 277 | method in (CUSOLVER.QRAlgorithm, CUSOLVER.JacobiAlgorithm), 278 | (_m, _n) in ((m, n), (n, m)) 279 | 280 | A = rand(elty, _m, _n) 281 | U, S, V = svd(A, full=true) 282 | d_A = CuArray(A) 283 | 284 | if _m > _n || method == CUSOLVER.JacobiAlgorithm 285 | d_U, d_S, d_V = svd(d_A, method, full=true) 286 | h_S = collect(d_S) 287 | h_U = collect(d_U) 288 | h_V = collect(d_V) 289 | @test abs.(h_U'h_U) ≈ I 290 | @test abs.(h_U[:,1:min(_m,_n)]'U[:,1:min(_m,_n)]) ≈ I 291 | @test collect(svdvals(d_A, method)) ≈ svdvals(A) 292 | @test abs.(h_V'h_V) ≈ I 293 | @test abs.(h_V[:,1:min(_m,_n)]'*V[:,1:min(_m,_n)]) ≈ I 294 | @test collect(d_U'*d_A*d_V) ≈ U'*A*V 295 | @test collect(svd(d_A, method).V') == h_V[:,1:min(_m,_n)]' 296 | else 297 | @test_throws ArgumentError svd(d_A, method) 298 | end 299 | end 300 | # Check that constant propagation works 301 | _svd(A) = svd(A, CUSOLVER.QRAlgorithm) 302 | @inferred _svd(CuArrays.CURAND.curand(Float32, 4, 4)) 303 | 304 | 305 | @testset "qr" begin 306 | tol = min(m, n)*eps(real(elty))*(1 + (elty <: Complex)) 307 | 308 | A = rand(elty, m, n) 309 | d_A = CuArray(A) 310 | d_F = qr(d_A) 311 | d_RR = d_F.Q'*d_A 312 | @test d_RR[1:n,:] ≈ d_F.R atol=tol*norm(A) 313 | @test norm(d_RR[n+1:end,:]) < tol*norm(A) 314 | A = rand(elty, n, m) 315 | d_A = CuArray(A) 316 | d_F = qr(d_A) 317 | @test d_F.Q'*d_A ≈ d_F.R atol=tol*norm(A) 318 | A = rand(elty, m, n) 319 | d_A = CuArray(A) 
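# destructuring the factorization object yields explicit (Q, R) factors,
# which are compared against the CPU factorization below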
320 | h_q, h_r = qr(d_A) 321 | q, r = qr(A) 322 | @test Array(h_q) ≈ Array(q) 323 | @test Array(h_r) ≈ Array(r) 324 | A = rand(elty, n, m) 325 | d_A = CuArray(A) 326 | h_q, h_r = qr(d_A) # FixMe! Use iteration protocol when implemented 327 | q, r = qr(A) 328 | @test Array(h_q) ≈ Array(q) 329 | @test Array(h_r) ≈ Array(r) 330 | end 331 | 332 | end 333 | 334 | end 335 | 336 | end 337 | --------------------------------------------------------------------------------